def convert_to_labeled_review(hpv_version, space_separated_words, d2v_id, best_topic, second_best_topic):
    """Build the labeled review for the requested HPV variant.

    The review text is lower-cased and the doc2vec identifier is derived from
    ``d2v_id`` before dispatching to the builder selected by ``hpv_version``.

    Args:
        hpv_version: Variant selector. Any falsy value selects ``no_hpv``;
            the values 1-7 select one of the ``hpv_with_*`` builders.
        space_separated_words: Review text; it is lower-cased before use.
        d2v_id: Value passed to ``get_d2v_identifier`` to obtain the label.
        best_topic: Forwarded only to the topic-aware builders (versions 4, 5, 7).
        second_best_topic: Forwarded only to the topic-aware builders (versions 4, 5, 7).

    Returns:
        Whatever the selected builder returns, or ``None`` when ``hpv_version``
        is truthy but not one of 1-7.
    """
    label = get_d2v_identifier(d2v_id)
    words = space_separated_words.lower()

    if not hpv_version:
        return no_hpv(label, words)
    if hpv_version == 1:
        return hpv_with_par_sent_subsent(label, words)
    if hpv_version == 2:
        return hpv_with_par_sent(label, words)
    if hpv_version == 3:
        return hpv_with_par_sent_subsentnv(label, words)
    if hpv_version == 4:
        return hpv_with_topics(label, words, best_topic, second_best_topic)
    if hpv_version == 5:
        return hpv_with_par_sent_and_topics(label, words, best_topic, second_best_topic)
    if hpv_version == 6:
        return hpv_with_par(label, words)
    if hpv_version == 7:
        return hpv_with_par_and_topics(label, words, best_topic, second_best_topic)

    # Unrecognised versions fall through without producing a review.
    return None
def convert_to_vector(d2v_id, model_dm, model_dbow):
    """Return the concatenated DM and DBOW vectors for one document.

    Args:
        d2v_id: Value passed to ``get_d2v_identifier`` to obtain the lookup key.
        model_dm: First model handed to ``extract_vector_from_model``.
        model_dbow: Second model handed to ``extract_vector_from_model``.

    Returns:
        ``numpy.concatenate`` of the vector from ``model_dm`` followed by the
        vector from ``model_dbow``.

    Raises:
        ValueError: If either model yields ``None`` for the identifier.
    """
    d2v_identifier = get_d2v_identifier(d2v_id)
    dm_vector = extract_vector_from_model(d2v_identifier, model_dm)
    dbow_vector = extract_vector_from_model(d2v_identifier, model_dbow)
    if dm_vector is None or dbow_vector is None:
        # Raising a bare string is itself a TypeError in Python 3 and hides
        # the message; raise a real exception carrying the identifier.
        raise ValueError('a or b is none! {}'.format(d2v_identifier))
    return numpy.concatenate([dm_vector, dbow_vector])
def classify(self, model, y_test_reviews, reviews, nearest_reviews_count=2000):
    """Predict a 0/1 sentiment per test review by similarity-weighted voting.

    For each id in ``y_test_reviews['d2v_id']`` the model's ``most_similar``
    neighbours (``topn=10000``) are scanned in order. A neighbour contributes
    when its tag starts with ``'REVIEW_'``, its numeric id (the part after the
    first underscore) matches at least one row of ``reviews``, and all
    matching rows are flagged ``use_for_classifier_training``. Each
    contributing neighbour adds ``sentiment * similarity`` to a running sum;
    scanning stops once ``nearest_reviews_count`` neighbours have contributed.

    Args:
        model: Object exposing ``most_similar(tag, topn=...)`` whose items are
            indexable as ``(tag, similarity)``.
        y_test_reviews: Table with a ``'d2v_id'`` column of reviews to classify.
        reviews: Table with ``'d2v_id'``, ``'use_for_classifier_training'``
            and ``'sentiment'`` columns.
            NOTE(review): the 0.5 threshold suggests ``sentiment`` is 0/1 — confirm.
        nearest_reviews_count: Maximum number of contributing neighbours.

    Returns:
        A ``np.zeros``-initialised array with one entry per test review:
        1 when the weighted sentiment sum exceeds half of the total
        similarity, otherwise 0 (also 0 when no neighbour contributed).
    """
    total = len(y_test_reviews)
    y_test = np.zeros(total)
    for i, d2v_numeric_id in enumerate(y_test_reviews['d2v_id']):
        if i % 100 == 0:
            log_to_info('Processing {0} of {1} ({2}%)'.format(
                i, total, 1.0 * i / total * 100.0))
        d2v_id = get_d2v_identifier(d2v_numeric_id)

        sentiment_sum = 0.0
        total_nearness = 0.0
        total_sentiments = 0
        for neighbour in model.most_similar(d2v_id, topn=10000):
            tag = neighbour[0]
            if not tag.startswith('REVIEW_'):
                continue
            most_similar_id = int(tag.split('_')[1])
            r = reviews[reviews['d2v_id'].eq(most_similar_id)]
            # An empty selection would pass the `.all()` check below (all() of
            # nothing is True) and then fail on `.values[0]`; skip neighbours
            # that have no row in `reviews`.
            if len(r) == 0:
                continue
            if not r['use_for_classifier_training'].all():
                continue
            nearness = neighbour[1]
            sentiment_sum += r['sentiment'].values[0] * nearness
            total_nearness += nearness
            total_sentiments += 1
            if total_sentiments >= nearest_reviews_count:
                break

        if total_nearness == 0:
            log_to_info('key {0} has no similar review!'.format(d2v_id))
            y_test[i] = 0
        else:
            # Weighted majority vote: positive only above half the total weight.
            y_test[i] = 0 if sentiment_sum <= total_nearness * 0.5 else 1
    return y_test
def classify(self, model, y_test_reviews, reviews, nearest_reviews_count=2000):
    """Predict a 0/1 sentiment per test review by similarity-weighted voting.

    For each id in ``y_test_reviews["d2v_id"]`` the model's ``most_similar``
    neighbours (``topn=10000``) are scanned in order. A neighbour contributes
    when its tag starts with ``"REVIEW_"``, its numeric id (the part after the
    first underscore) matches at least one row of ``reviews``, and all
    matching rows are flagged ``use_for_classifier_training``. Each
    contributing neighbour adds ``sentiment * similarity`` to a running sum;
    scanning stops once ``nearest_reviews_count`` neighbours have contributed.

    Args:
        model: Object exposing ``most_similar(tag, topn=...)`` whose items are
            indexable as ``(tag, similarity)``.
        y_test_reviews: Table with a ``"d2v_id"`` column of reviews to classify.
        reviews: Table with ``"d2v_id"``, ``"use_for_classifier_training"``
            and ``"sentiment"`` columns.
            NOTE(review): the 0.5 threshold suggests ``sentiment`` is 0/1 — confirm.
        nearest_reviews_count: Maximum number of contributing neighbours.

    Returns:
        A ``np.zeros``-initialised array with one entry per test review:
        1 when the weighted sentiment sum exceeds half of the total
        similarity, otherwise 0 (also 0 when no neighbour contributed).
    """
    total = len(y_test_reviews)
    y_test = np.zeros(total)
    for i, d2v_numeric_id in enumerate(y_test_reviews["d2v_id"]):
        if i % 100 == 0:
            log_to_info(
                "Processing {0} of {1} ({2}%)".format(i, total, 1.0 * i / total * 100.0)
            )
        d2v_id = get_d2v_identifier(d2v_numeric_id)

        sentiment_sum = 0.0
        total_nearness = 0.0
        total_sentiments = 0
        for neighbour in model.most_similar(d2v_id, topn=10000):
            tag = neighbour[0]
            if not tag.startswith("REVIEW_"):
                continue
            most_similar_id = int(tag.split("_")[1])
            r = reviews[reviews["d2v_id"].eq(most_similar_id)]
            # An empty selection would pass the `.all()` check below (all() of
            # nothing is True) and then fail on `.values[0]`; skip neighbours
            # that have no row in `reviews`.
            if len(r) == 0:
                continue
            if not r["use_for_classifier_training"].all():
                continue
            nearness = neighbour[1]
            sentiment_sum += r["sentiment"].values[0] * nearness
            total_nearness += nearness
            total_sentiments += 1
            if total_sentiments >= nearest_reviews_count:
                break

        if total_nearness == 0:
            log_to_info("key {0} has no similar review!".format(d2v_id))
            y_test[i] = 0
        else:
            # Weighted majority vote: positive only above half the total weight.
            y_test[i] = 0 if sentiment_sum <= total_nearness * 0.5 else 1
    return y_test