def convert_to_labeled_review(hpv_version, space_separated_words, d2v_id,
                              best_topic, second_best_topic):
    """Build the labeled review for the requested HPV variant.

    The words are lower-cased and passed, together with the identifier
    obtained from ``get_d2v_identifier(d2v_id)``, to the builder selected by
    ``hpv_version``:

    * falsy -> ``no_hpv``
    * 1 -> ``hpv_with_par_sent_subsent``
    * 2 -> ``hpv_with_par_sent``
    * 3 -> ``hpv_with_par_sent_subsentnv``
    * 4 -> ``hpv_with_topics``
    * 5 -> ``hpv_with_par_sent_and_topics``
    * 6 -> ``hpv_with_par``
    * 7 -> ``hpv_with_par_and_topics``

    Builders 4, 5 and 7 additionally receive ``best_topic`` and
    ``second_best_topic``. Any other version yields ``None``.
    """
    tag = get_d2v_identifier(d2v_id)
    text = space_separated_words.lower()

    if not hpv_version:
        return no_hpv(tag, text)

    # (version, builder, builder also takes the two topics), kept in the
    # order the versions are tested.
    builders = (
        (1, hpv_with_par_sent_subsent, False),
        (2, hpv_with_par_sent, False),
        (3, hpv_with_par_sent_subsentnv, False),
        (4, hpv_with_topics, True),
        (5, hpv_with_par_sent_and_topics, True),
        (6, hpv_with_par, False),
        (7, hpv_with_par_and_topics, True),
    )
    for version, build, wants_topics in builders:
        if hpv_version == version:
            if wants_topics:
                return build(tag, text, best_topic, second_best_topic)
            return build(tag, text)
    return None
# Example #2
# 0
def convert_to_vector(d2v_id, model_dm, model_dbow):
    """Return the concatenated vectors of one document from two models.

    Args:
        d2v_id: Id converted with ``get_d2v_identifier`` into the identifier
            that is looked up in both models.
        model_dm: First model to extract the vector from (presumably the
            distributed-memory Doc2Vec model -- confirm against callers).
        model_dbow: Second model to extract the vector from (presumably the
            DBOW Doc2Vec model -- confirm against callers).

    Returns:
        ``numpy.concatenate`` of the ``model_dm`` vector followed by the
        ``model_dbow`` vector.

    Raises:
        ValueError: If either model yields ``None`` for the identifier.
    """
    d2v_identifier = get_d2v_identifier(d2v_id)
    dm_vector = extract_vector_from_model(d2v_identifier, model_dm)
    dbow_vector = extract_vector_from_model(d2v_identifier, model_dbow)
    if dm_vector is None or dbow_vector is None:
        # Raising a plain string is itself a TypeError on Python 3
        # ("exceptions must derive from BaseException"), which hid this
        # message; raise a real exception carrying the same text.
        raise ValueError('a or b is none! {}'.format(d2v_identifier))

    return numpy.concatenate([dm_vector, dbow_vector])
    def classify(self,
                 model,
                 y_test_reviews,
                 reviews,
                 nearest_reviews_count=2000):
        """Predict a 0/1 sentiment per test review from its nearest reviews.

        For each id in ``y_test_reviews['d2v_id']`` the 10000 most similar
        entries are requested from ``model``. Entries whose tag starts with
        ``REVIEW_`` and whose matching rows in ``reviews`` are all flagged
        ``use_for_classifier_training`` contribute their ``sentiment``
        weighted by similarity, until ``nearest_reviews_count`` of them have
        been used. The prediction is 1 when the weighted sentiment sum
        exceeds half of the summed similarity, otherwise 0.

        Args:
            model: Object whose ``most_similar(tag, topn=...)`` returns items
                indexable as ``(tag, similarity)`` (presumably a gensim
                doc-vector container -- confirm).
            y_test_reviews: Table with a ``d2v_id`` column listing the
                reviews to classify.
            reviews: Table with ``d2v_id``, ``use_for_classifier_training``
                and ``sentiment`` columns (assumed to be a pandas DataFrame).
            nearest_reviews_count: Maximum number of accepted neighbours per
                test review.

        Returns:
            NumPy array with one prediction (0 or 1) per test review; 0 when
            no neighbour contributed any similarity.
        """
        total = len(y_test_reviews)
        y_test = np.zeros(total)

        for i, d2v_numeric_id in enumerate(y_test_reviews['d2v_id']):
            if i % 100 == 0:
                log_to_info('Processing {0} of {1} ({2}%)'.format(
                    i, total, 1.0 * i / total * 100.0))
            d2v_id = get_d2v_identifier(d2v_numeric_id)

            sentiment_sum = 0.0
            total_nearness = 0.0
            total_sentiments = 0
            for neighbour in model.most_similar(d2v_id, topn=10000):
                tag = neighbour[0]
                if not tag.startswith('REVIEW_'):
                    continue
                most_similar_id = int(tag.split('_')[1])
                r = reviews[reviews['d2v_id'].eq(most_similar_id)]
                if r.empty:
                    # ``.all()`` on an empty selection is True, so a tag with
                    # no row in ``reviews`` used to crash on ``values[0]``;
                    # such a neighbour carries no label and is skipped.
                    continue
                if not r['use_for_classifier_training'].all():
                    continue
                nearness = neighbour[1]
                sentiment_sum += r['sentiment'].values[0] * nearness
                total_nearness += nearness
                total_sentiments += 1
                if total_sentiments >= nearest_reviews_count:
                    break

            if total_nearness == 0:
                log_to_info('key {0} has no similar review!'.format(d2v_id))
                y_test[i] = 0
            else:
                # Similarity-weighted majority vote with a 0.5 threshold.
                y_test[i] = 0 if sentiment_sum <= total_nearness * 0.5 else 1

        return y_test
def convert_to_labeled_review(hpv_version, space_separated_words, d2v_id, best_topic, second_best_topic):
    """Dispatch to the labeled-review builder matching ``hpv_version``.

    Every builder receives the identifier from ``get_d2v_identifier(d2v_id)``
    and the lower-cased words; the topic-aware builders (versions 4, 5 and 7)
    also receive ``best_topic`` and ``second_best_topic``. A falsy version
    selects ``no_hpv``; a version outside 1-7 returns ``None``.
    """
    base_args = (get_d2v_identifier(d2v_id), space_separated_words.lower())
    topic_args = base_args + (best_topic, second_best_topic)

    if not hpv_version:
        return no_hpv(*base_args)
    if hpv_version == 1:
        return hpv_with_par_sent_subsent(*base_args)
    if hpv_version == 2:
        return hpv_with_par_sent(*base_args)
    if hpv_version == 3:
        return hpv_with_par_sent_subsentnv(*base_args)
    if hpv_version == 4:
        return hpv_with_topics(*topic_args)
    if hpv_version == 5:
        return hpv_with_par_sent_and_topics(*topic_args)
    if hpv_version == 6:
        return hpv_with_par(*base_args)
    if hpv_version == 7:
        return hpv_with_par_and_topics(*topic_args)
    return None
    def classify(self, model, y_test_reviews, reviews, nearest_reviews_count=2000):
        """Predict a 0/1 sentiment per test review from its nearest reviews.

        For each id in ``y_test_reviews["d2v_id"]`` the 10000 most similar
        entries are requested from ``model``. Entries whose tag starts with
        ``REVIEW_`` and whose matching rows in ``reviews`` are all flagged
        ``use_for_classifier_training`` contribute their ``sentiment``
        weighted by similarity, until ``nearest_reviews_count`` of them have
        been used. The prediction is 1 when the weighted sentiment sum
        exceeds half of the summed similarity, otherwise 0.

        Args:
            model: Object whose ``most_similar(tag, topn=...)`` returns items
                indexable as ``(tag, similarity)`` (presumably a gensim
                doc-vector container -- confirm).
            y_test_reviews: Table with a ``d2v_id`` column listing the
                reviews to classify.
            reviews: Table with ``d2v_id``, ``use_for_classifier_training``
                and ``sentiment`` columns (assumed to be a pandas DataFrame).
            nearest_reviews_count: Maximum number of accepted neighbours per
                test review.

        Returns:
            NumPy array with one prediction (0 or 1) per test review; 0 when
            no neighbour contributed any similarity.
        """
        total = len(y_test_reviews)
        y_test = np.zeros(total)

        for i, d2v_numeric_id in enumerate(y_test_reviews["d2v_id"]):
            if i % 100 == 0:
                log_to_info("Processing {0} of {1} ({2}%)".format(i, total, 1.0 * i / total * 100.0))
            d2v_id = get_d2v_identifier(d2v_numeric_id)

            sentiment_sum = 0.0
            total_nearness = 0.0
            total_sentiments = 0
            for neighbour in model.most_similar(d2v_id, topn=10000):
                tag = neighbour[0]
                if not tag.startswith("REVIEW_"):
                    continue
                most_similar_id = int(tag.split("_")[1])
                r = reviews[reviews["d2v_id"].eq(most_similar_id)]
                if r.empty:
                    # ``.all()`` on an empty selection is True, so a tag with
                    # no row in ``reviews`` used to crash on ``values[0]``;
                    # such a neighbour carries no label and is skipped.
                    continue
                if not r["use_for_classifier_training"].all():
                    continue
                nearness = neighbour[1]
                sentiment_sum += r["sentiment"].values[0] * nearness
                total_nearness += nearness
                total_sentiments += 1
                if total_sentiments >= nearest_reviews_count:
                    break

            if total_nearness == 0:
                log_to_info("key {0} has no similar review!".format(d2v_id))
                y_test[i] = 0
            else:
                # Similarity-weighted majority vote with a 0.5 threshold.
                y_test[i] = 0 if sentiment_sum <= total_nearness * 0.5 else 1

        return y_test