Пример #1
0
def split_data_by_business(train_ratio_of_total=0.5):
    """Split reviews into train/test sets one whole business at a time.

    Reviews are read from ./reviews.json and grouped with
    business_reviews_dict.  For each business a single random draw is made:
    when the draw is <= ``train_ratio_of_total`` all of that business's
    reviews go to the training set, otherwise they all go to the test set,
    so reviews of one business never straddle the two sets.  The results are
    written to ./train_reviews.json and ./test_reviews.json.
    """
    reviews = readyelp.read_reviews_to_dict("./reviews.json")
    # Loaded here as before; the result is not consulted in this function.
    users = readyelp.read_users_to_dict("./users.json")

    reviews_by_business = business_reviews_dict(reviews)

    train_ids, test_ids = [], []
    for business_id in reviews_by_business:
        business_review_ids = reviews_by_business[business_id]
        # One draw per business decides where every one of its reviews lands.
        target = train_ids if random.random() <= train_ratio_of_total else test_ids
        target.extend(business_review_ids)

    # Resolve the collected ids back to the review records themselves.
    train = [reviews[review_id] for review_id in train_ids]
    test = [reviews[review_id] for review_id in test_ids]

    readyelp.write_output(train, "./train_reviews.json")
    readyelp.write_output(test, "./test_reviews.json")
Пример #2
0
def split_data_by_business(train_ratio_of_total=0.5):
    """Split reviews into train/test sets one whole business at a time.

    Reviews are read from ./reviews.json and grouped with
    business_reviews_dict.  For each business a single random draw is made:
    when the draw is <= ``train_ratio_of_total`` all of that business's
    reviews go to the training set, otherwise they all go to the test set,
    so reviews of one business never straddle the two sets.  The results are
    written to ./train_reviews.json and ./test_reviews.json.
    """
    reviews = readyelp.read_reviews_to_dict("./reviews.json")
    # Loaded here as before; the result is not consulted in this function.
    users = readyelp.read_users_to_dict("./users.json")

    reviews_by_business = business_reviews_dict(reviews)

    train_ids, test_ids = [], []
    for business_id in reviews_by_business:
        business_review_ids = reviews_by_business[business_id]
        # One draw per business decides where every one of its reviews lands.
        target = train_ids if random.random() <= train_ratio_of_total else test_ids
        target.extend(business_review_ids)

    # Resolve the collected ids back to the review records themselves.
    train = [reviews[review_id] for review_id in train_ids]
    test = [reviews[review_id] for review_id in test_ids]

    readyelp.write_output(train, "./train_reviews.json")
    readyelp.write_output(test, "./test_reviews.json")
Пример #3
0
def main():
    """Split the reviews in reviews.json into training and test sets.

    After cleanyelp.clean_review_dict has been applied, every review whose
    "friend_reviews_of_business" entry is non-empty is assigned by a random
    draw: to the test set when the draw is <= 0.5, otherwise to the training
    set.  Reviews with an empty entry always go to the training set.  The
    sets are written to ./train_reviews.json and ./test_reviews.json.

    NOTE: the median review date is still computed, but the date-based split
    that used it is disabled, so it does not influence the output.
    """
    users = readyelp.read_users_to_dict("./users.json")
    reviews = readyelp.read_reviews_to_dict("./reviews.json")
    cleanyelp.clean_review_dict(reviews, users)

    # Left over from the earlier date-based split; unused by the split below.
    split_date = cleanyelp.median_date(reviews)

    train, test = [], []
    for review_id in reviews:
        review = reviews[review_id]
        if len(review["friend_reviews_of_business"]) == 0:
            # No friend reviews of this business: always a training example.
            train.append(review)
            continue
        bucket = test if random.random() <= 0.5 else train
        bucket.append(review)

    readyelp.write_output(train, "./train_reviews.json")
    readyelp.write_output(test, "./test_reviews.json")
Пример #4
0
def main():
    train_reviews = readyelp.read_reviews_to_dict("./train_reviews.json")
    test_reviews = readyelp.read_reviews_to_dict("./test_reviews.json")
    user_dict = readyelp.read_users_to_dict("./users_limited.json")
    klass_list = ["negative", "positive"]
    test_corpus = []
    gold_labels = []
    Y_random = []
    for review_id in test_reviews:
        review = test_reviews[review_id]
        test_corpus.append(review["text"])
        gold_labels.append(review["rating"])
        Y_random.append(random_class())

    print "Random model metrics:"
    print metrics.classification_report(gold_labels,
                                        Y_random,
                                        target_names=klass_list)

    Y_bag_of_words = bag_of_words_baseline(train_reviews, test_reviews)
    print "Bag of words baseline model metrics:"
    print metrics.classification_report(gold_labels,
                                        Y_bag_of_words,
                                        target_names=klass_list)

    Y_random_influence = []
    Y_bow_influence = []
    Y_influence = influence_baseline(train_reviews, test_reviews, user_dict)
    for i in range(len(Y_influence)):
        if Y_influence[i] == "UNKNOWN":
            Y_bow_influence.append(Y_bag_of_words[i])
            Y_random_influence.append(Y_random[i])
        else:
            Y_bow_influence.append(Y_influence[i])
            Y_random_influence.append(Y_influence[i])

    print "Random influence baseline model metrics:"
    print metrics.classification_report(gold_labels,
                                        Y_random_influence,
                                        target_names=klass_list)

    print "Bag-of-words influence baseline model metrics:"
    print metrics.classification_report(gold_labels,
                                        Y_bow_influence,
                                        target_names=klass_list)
Пример #5
0
def filter_users():
    """Drop users that have no reviews in the training or test datasets.

    Each user's "reviews" list is narrowed to the review ids found in
    ./train_reviews.json or ./test_reviews.json.  Users left with at least
    one review are written to ./users.json, the same path the users were
    read from; users with none are omitted.

    NOTE(review): other code in this file reads ./users_limited.json --
    confirm that ./users.json is the intended output path.
    """
    user_dict = readyelp.read_users_to_dict("./users.json")
    train_reviews = readyelp.read_reviews_to_dict("./train_reviews.json")
    test_reviews = readyelp.read_reviews_to_dict("./test_reviews.json")

    users_limited = []

    for user_id in user_dict:
        user = user_dict[user_id]
        # Build a filtered copy instead of calling remove() on the list being
        # iterated: removing during iteration skips the element that follows
        # each removal, which left stale review ids (and users) behind.
        kept_reviews = [
            review_id for review_id in user["reviews"]
            if review_id in train_reviews or review_id in test_reviews
        ]
        if kept_reviews:
            user["reviews"] = kept_reviews
            users_limited.append(user)

    readyelp.write_output(users_limited, "./users.json")
Пример #6
0
def filter_users():
    """Drop users that have no reviews in the training or test datasets.

    Each user's "reviews" list is narrowed to the review ids found in
    ./train_reviews.json or ./test_reviews.json.  Users left with at least
    one review are written to ./users.json, the same path the users were
    read from; users with none are omitted.

    NOTE(review): other code in this file reads ./users_limited.json --
    confirm that ./users.json is the intended output path.
    """
    user_dict = readyelp.read_users_to_dict("./users.json")
    train_reviews = readyelp.read_reviews_to_dict("./train_reviews.json")
    test_reviews = readyelp.read_reviews_to_dict("./test_reviews.json")

    users_limited = []

    for user_id in user_dict:
        user = user_dict[user_id]
        # Build a filtered copy instead of calling remove() on the list being
        # iterated: removing during iteration skips the element that follows
        # each removal, which left stale review ids (and users) behind.
        kept_reviews = [
            review_id for review_id in user["reviews"]
            if review_id in train_reviews or review_id in test_reviews
        ]
        if kept_reviews:
            user["reviews"] = kept_reviews
            users_limited.append(user)

    readyelp.write_output(users_limited, "./users.json")
Пример #7
0
def main():

    ## Only call the below once, when data needs to be cleaned and split ##
    cleanyelp.split_data_by_business(0.75)
    #######################################################################

    train_reviews = readyelp.read_reviews_to_dict("./train_reviews.json")
    test_reviews = readyelp.read_reviews_to_dict("./test_reviews.json")
    user_dict = readyelp.read_users_to_dict("./users_limited.json")
    klass_list = ["negative", "positive"]

    # Calculate class preferences of individual classifier
    ind_pref = baselineclassifier.bag_of_words_probabilities(
        train_reviews, test_reviews)
    print "Individual preferences calculated."
    # Train CRF model
    reviewcrf.train_crf(train_reviews, user_dict)
    # Calculate pair strengths
    pair_str = reviewcrf.crftag_probabilities(test_reviews, user_dict)
    print "Pair strengths calculated."
    # Build review graph
    min_cut_classes = reviewgraph.build_graph(klass_list, test_reviews,
                                              ind_pref, pair_str)
    print "Graph cut."
    # Make min-cut classification
    Y_gold = []
    Y_predict = []
    for test_id in test_reviews:
        review = test_reviews[test_id]
        Y_gold.append(review["rating"])
        if min_cut_classes[test_id] == 1:
            Y_predict.append("positive")
        else:
            Y_predict.append("negative")

    classification_metrics = metrics.classification_report(
        Y_gold, Y_predict, target_names=klass_list)

    print classification_metrics
def main():
    train_reviews = readyelp.read_reviews_to_dict("./train_reviews.json")
    test_reviews = readyelp.read_reviews_to_dict("./test_reviews.json")
    user_dict = readyelp.read_users_to_dict("./users_limited.json")
    klass_list = ["negative", "positive"]
    test_corpus = []
    gold_labels = []
    Y_random = []
    for review_id in test_reviews:
        review = test_reviews[review_id]
        test_corpus.append(review["text"])
        gold_labels.append(review["rating"])
        Y_random.append(random_class())

    print "Random model metrics:"
    print metrics.classification_report(gold_labels, Y_random, target_names = klass_list)

    Y_bag_of_words = bag_of_words_baseline(train_reviews, test_reviews)
    print "Bag of words baseline model metrics:"
    print metrics.classification_report(gold_labels, Y_bag_of_words, target_names = klass_list)

    Y_random_influence = []
    Y_bow_influence = []
    Y_influence = influence_baseline(train_reviews, test_reviews, user_dict)
    for i in range(len(Y_influence)):
        if Y_influence[i] == "UNKNOWN":
            Y_bow_influence.append(Y_bag_of_words[i])
            Y_random_influence.append(Y_random[i])
        else:
            Y_bow_influence.append(Y_influence[i])
            Y_random_influence.append(Y_influence[i])

    print "Random influence baseline model metrics:"
    print metrics.classification_report(gold_labels, Y_random_influence, target_names = klass_list)

    print "Bag-of-words influence baseline model metrics:"
    print metrics.classification_report(gold_labels, Y_bow_influence, target_names = klass_list)
def main():

    ## Only call the below once, when data needs to be cleaned and split ##
    cleanyelp.split_data_by_business(0.75)
    #######################################################################

    train_reviews = readyelp.read_reviews_to_dict("./train_reviews.json")
    test_reviews = readyelp.read_reviews_to_dict("./test_reviews.json")
    user_dict = readyelp.read_users_to_dict("./users_limited.json")
    klass_list = ["negative", "positive"]

    # Calculate class preferences of individual classifier
    ind_pref = baselineclassifier.bag_of_words_probabilities(train_reviews, test_reviews)
    print "Individual preferences calculated."
    # Train CRF model
    reviewcrf.train_crf(train_reviews, user_dict)
    # Calculate pair strengths
    pair_str = reviewcrf.crftag_probabilities(test_reviews, user_dict)
    print "Pair strengths calculated."
    # Build review graph
    min_cut_classes = reviewgraph.build_graph(klass_list, test_reviews, ind_pref, pair_str)
    print "Graph cut."
    # Make min-cut classification
    Y_gold = []
    Y_predict = []
    for test_id in test_reviews:
        review = test_reviews[test_id]
        Y_gold.append(review["rating"])
        if min_cut_classes[test_id] == 1:
            Y_predict.append("positive")
        else:
            Y_predict.append("negative")

    classification_metrics = metrics.classification_report(Y_gold, Y_predict, target_names = klass_list)

    print classification_metrics
Пример #10
0
def main():
    """ Invoking cleanyelp.py will output basic statistics from the yelp data. """
    user_dict = readyelp.read_users_to_dict("./users.json")
    print "Total number of users with friends:", len(user_dict)

    review_dict = readyelp.read_reviews_to_dict("./train_reviews.json")
    # clean_review_dict(review_dict, user_dict)
    print "Total number of reviews from these users:", len(review_dict)

    common_review_pairs = find_review_pairs_by_friends(user_dict, review_dict)
    print "Total number of friend review pairs of the same business:", len(
        common_review_pairs)

    klass_list = ["negative", "positive"]
    raw_counts = klass_counts(review_dict, klass_list)
    for klass in klass_list:
        print "Total reviews with " + klass + " sentiment:", raw_counts[klass]
    _homophily_counts(common_review_pairs, review_dict, klass_list)
Пример #11
0
def main():
    """ Invoking cleanyelp.py will output basic statistics from the yelp data. """
    user_dict = readyelp.read_users_to_dict("./users.json")
    print "Total number of users with friends:", len(user_dict)

    review_dict = readyelp.read_reviews_to_dict("./train_reviews.json")
    # clean_review_dict(review_dict, user_dict)
    print "Total number of reviews from these users:", len(review_dict)

    common_review_pairs = find_review_pairs_by_friends(user_dict, review_dict)
    print "Total number of friend review pairs of the same business:", len(common_review_pairs)


    klass_list = ["negative", "positive"]
    raw_counts = klass_counts(review_dict, klass_list)
    for klass in klass_list:
        print "Total reviews with " + klass + " sentiment:", raw_counts[klass]
    _homophily_counts(common_review_pairs, review_dict, klass_list)
Пример #12
0
def split_data(train_ratio_of_total=0.5):
    """Randomly split the reviews into training and test sets.

    Each review from ./reviews.json (after clean_review_dict has been
    applied) gets its own random draw: it joins the training set when the
    draw is <= ``train_ratio_of_total`` and the test set otherwise, so the
    default of 0.5 aims for an even split.  The sets are written to
    ./train_reviews.json and ./test_reviews.json, after which filter_users()
    is called.
    """
    reviews = readyelp.read_reviews_to_dict("./reviews.json")
    users = readyelp.read_users_to_dict("./users.json")
    clean_review_dict(reviews, users)

    train, test = [], []
    for review_id in reviews:
        review = reviews[review_id]
        # Independent draw per review.
        destination = train if random.random() <= train_ratio_of_total else test
        destination.append(review)

    readyelp.write_output(train, "./train_reviews.json")
    readyelp.write_output(test, "./test_reviews.json")
    filter_users()
Пример #13
0
def split_data(train_ratio_of_total=0.5):
    """Randomly split the reviews into training and test sets.

    Each review from ./reviews.json (after clean_review_dict has been
    applied) gets its own random draw: it joins the training set when the
    draw is <= ``train_ratio_of_total`` and the test set otherwise, so the
    default of 0.5 aims for an even split.  The sets are written to
    ./train_reviews.json and ./test_reviews.json, after which filter_users()
    is called.
    """
    reviews = readyelp.read_reviews_to_dict("./reviews.json")
    users = readyelp.read_users_to_dict("./users.json")
    clean_review_dict(reviews, users)

    train, test = [], []
    for review_id in reviews:
        review = reviews[review_id]
        # Independent draw per review.
        destination = train if random.random() <= train_ratio_of_total else test
        destination.append(review)

    readyelp.write_output(train, "./train_reviews.json")
    readyelp.write_output(test, "./test_reviews.json")
    filter_users()
Пример #14
0
def main():
    """Load the review splits and users, then run train_crf and crftag.

    train_crf receives the reviews from train_reviews.json and crftag the
    reviews from test_reviews.json; both also receive the users read from
    users_limited.json.
    """
    training_set = readyelp.read_reviews_to_dict("train_reviews.json")
    held_out_set = readyelp.read_reviews_to_dict("test_reviews.json")
    limited_users = readyelp.read_users_to_dict("users_limited.json")

    train_crf(training_set, limited_users)
    crftag(held_out_set, limited_users)
Пример #15
0
def main():
    """Load the review splits and users, then run train_crf and crftag.

    train_crf receives the reviews from train_reviews.json and crftag the
    reviews from test_reviews.json; both also receive the users read from
    users_limited.json.
    """
    training_set = readyelp.read_reviews_to_dict("train_reviews.json")
    held_out_set = readyelp.read_reviews_to_dict("test_reviews.json")
    limited_users = readyelp.read_users_to_dict("users_limited.json")

    train_crf(training_set, limited_users)
    crftag(held_out_set, limited_users)