Example #1
    def init_reviews(self):

        print('init_reviews', time.strftime("%H:%M:%S"))

        self.reviews = []
        self.specific_reviews = []
        self.generic_reviews = []

        # for text_review in self.text_reviews:
        #     self.reviews.append(Review(text_review))

        my_file = '/Users/fpena/tmp/reviews_hotel.pkl'
        # my_file = '/Users/fpena/tmp/reviews_restaurant.pkl'
        # my_file = '/Users/fpena/tmp/sentences_hotel.pkl'
        # with open(my_file, 'wb') as write_file:
        #     pickle.dump(self.reviews, write_file, pickle.HIGHEST_PROTOCOL)

        # Load the previously pickled Review objects from disk
        with open(my_file, 'rb') as read_file:
            self.reviews = pickle.load(read_file)

        # self.reviews = self.reviews
        # for review in self.reviews:
        #     print(review)

        # Cluster the reviews, then split them into groups according to the
        # cluster labels
        cluster_labels = reviews_clusterer.cluster_reviews(self.reviews)
        review_clusters = \
            reviews_clusterer.split_list_by_labels(self.reviews, cluster_labels)
        # print(cluster_labels)

        self.specific_reviews = review_clusters[0]
        self.generic_reviews = review_clusters[1]

        self.all_nouns = context_utils.get_all_nouns(self.reviews)

        context_utils.generate_stats(self.specific_reviews, self.generic_reviews)
Example #2
    def test_split_list_by_labels(self):

        lst = ['a', 'b', 'c', 'd', 'e', 'f']
        labels = [2, 0, 0, 1, 1, 0]
        expected_matrix = [['b', 'c', 'f'], ['d', 'e'], ['a']]
        actual_matrix = reviews_clusterer.split_list_by_labels(lst, labels)

        self.assertItemsEqual(actual_matrix, expected_matrix)
Example #3
    def test_split_list_by_labels(self):

        lst = ['a', 'b', 'c', 'd', 'e', 'f']
        labels = [2, 0, 0, 1, 1, 0]
        expected_matrix = [
            ['b', 'c', 'f'],
            ['d', 'e'],
            ['a']
        ]
        actual_matrix = reviews_clusterer.split_list_by_labels(lst, labels)

        self.assertItemsEqual(actual_matrix, expected_matrix)
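
The two tests above pin down what reviews_clusterer.split_list_by_labels is expected to return: the items grouped by their cluster label, one sub-list per label, with the groups ordered by label. The following is only a minimal sketch that satisfies those tests, not the actual implementation from the reviews_clusterer module:

from itertools import groupby


def split_list_by_labels(items, labels):
    # Pair every item with its label, sort by label (the sort is stable, so
    # items keep their original relative order), then group by label.
    pairs = sorted(zip(labels, items), key=lambda pair: pair[0])
    return [[item for _, item in group]
            for _, group in groupby(pairs, key=lambda pair: pair[0])]

# With lst = ['a', 'b', 'c', 'd', 'e', 'f'] and labels = [2, 0, 0, 1, 1, 0]
# this returns [['b', 'c', 'f'], ['d', 'e'], ['a']], matching the tests above.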
Example #4
    def separate_reviews(self):
        """
        Separates the reviews into specific and generic. The separation is done
        by clustering

        """
        # print('separating reviews', time.strftime("%H:%M:%S"))

        cluster_labels = reviews_clusterer.cluster_reviews(self.reviews)
        review_clusters =\
            reviews_clusterer.split_list_by_labels(self.reviews, cluster_labels)

        self.specific_reviews = review_clusters[0]
        self.generic_reviews = review_clusters[1]
Example #5
    def init_reviews(self):

        print('init_reviews', time.strftime("%H:%M:%S"))

        # self.reviews = reviews
        self.specific_reviews = []
        self.generic_reviews = []

        # for text_review in self.text_reviews:
        #     self.reviews.append(Review(text_review))

        # my_file = '/Users/fpena/UCC/Thesis/projects/yelp/source/python/topicmodeling/context/reviews_hotel.pkl'
        records_file = '/Users/fpena/UCC/Thesis/datasets/context/stuff/reviews_hotel_shuffled.json'
        reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/stuff/reviews_hotel_shuffled.pkl'
        # my_file = '/Users/fpena/UCC/Thesis/datasets/context/stuff/reviews_restaurant_shuffled.pkl'
        # my_file = '/Users/fpena/tmp/reviews_restaurant.pkl'
        # my_file = '/Users/fpena/tmp/sentences_hotel.pkl'
        # with open(my_file, 'wb') as write_file:
        #     pickle.dump(self.reviews, write_file, pickle.HIGHEST_PROTOCOL)

        # self.records = ETLUtils.load_json_file(records_file)
        #
        # with open(reviews_file, 'rb') as read_file:
        #     self.reviews = pickle.load(read_file)[:100]
        #
        # print(self.records[50]['text'])
        # print(self.reviews[50].text)

        # self.reviews = self.reviews
        # for review in self.reviews:
        #     print(review)

        cluster_labels = reviews_clusterer.cluster_reviews(self.reviews)
        review_clusters =\
            reviews_clusterer.split_list_by_labels(self.reviews, cluster_labels)
        # print(cluster_labels)

        self.specific_reviews = review_clusters[0]
        self.generic_reviews = review_clusters[1]

        self.all_nouns = context_utils.get_all_nouns(self.reviews)

        context_utils.generate_stats(self.specific_reviews, self.generic_reviews)
Example #6
def calculate_recall_in_top_n(records,
                              recommender,
                              n,
                              num_folds,
                              split=None,
                              min_score=5.0,
                              cache_reviews=None,
                              reviews_type=None):

    start_time = time.time()
    if split is None:
        split = 1 - (1 / float(num_folds))
    # split = 0.984
    total_recall = 0.
    total_coverage = 0.
    num_cycles = 0.0

    for i in xrange(0, num_folds):
        print('Fold', i)
        print('started training', time.strftime("%Y/%d/%m-%H:%M:%S"))
        start = float(i) / num_folds
        cluster_labels = None
        train_records, test_records = ETLUtils.split_train_test(records,
                                                                split=split,
                                                                start=start)
        if cache_reviews:
            train_reviews, test_reviews = ETLUtils.split_train_test(
                cache_reviews, split=split, start=start)
            if reviews_type is not None:
                cluster_labels = reviews_clusterer.cluster_reviews(
                    test_reviews)
            recommender.reviews = train_reviews
        recommender.load(train_records)

        print('finished training', time.strftime("%Y/%d/%m-%H:%M:%S"))

        if cluster_labels is not None:
            separated_records = reviews_clusterer.split_list_by_labels(
                test_records, cluster_labels)
            if reviews_type == 'specific':
                test_records = separated_records[0]
            if reviews_type == 'generic':
                test_records = separated_records[1]

        positive_reviews = \
            [review for review in test_records if review['overall_rating'] >= min_score]

        if len(positive_reviews) == 0:
            continue

        num_hits = 0.0
        num_predictions = 0.0
        for review in positive_reviews:
            user_id = review['user_id']
            item_id = review['offering_id']
            if not recommender.has_context:
                hit = calculate_is_a_hit(test_records, recommender, user_id,
                                         item_id, n)
            else:
                text_review = review['text']
                hit = calculate_is_a_hit(test_records, recommender, user_id,
                                         item_id, n, text_review)
            if hit is None:
                continue
            if hit:
                num_hits += 1
            num_predictions += 1
            # print('num predictions: %d/%d %s' % (num_predictions, len(positive_reviews), time.strftime("%Y/%d/%m-%H:%M:%S")))

        if num_predictions == 0:
            continue

        recommender.clear()
        recall = num_hits / num_predictions
        coverage = num_predictions / len(positive_reviews)
        print('recall', recall, time.strftime("%Y/%d/%m-%H:%M:%S"))
        print('coverage', coverage, time.strftime("%Y/%d/%m-%H:%M:%S"))
        total_recall += recall
        total_coverage += coverage
        num_cycles += 1

    final_recall = total_recall / num_cycles
    final_coverage = total_coverage / num_cycles
    execution_time = time.time() - start_time

    print('Final Top N Precision: %f' % final_recall)
    print('Final Coverage: %f' % final_coverage)
    print("--- %s seconds ---" % execution_time)

    result = {
        'Top N': final_recall,
        'Coverage': final_coverage,
        'Execution time': execution_time
    }

    return result
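
As a quick worked illustration of the per-fold metrics computed in the loop above (toy numbers only, not taken from any actual run): recall is the fraction of predictions that were hits, while coverage is the fraction of positive reviews for which a prediction could be made at all.

# Toy numbers only, illustrating the recall/coverage arithmetic used above.
num_hits = 3.0         # positive test reviews whose item made the top-n list
num_predictions = 8.0  # positive test reviews for which a prediction was possible
num_positive = 10      # all test reviews with overall_rating >= min_score

recall = num_hits / num_predictions        # 0.375
coverage = num_predictions / num_positive  # 0.8
print(recall, coverage)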
Example #7
def perform_cross_validation(
        records, recommender, num_folds, cache_reviews=None, reviews_type=None):

    start_time = time.time()
    split = 1 - (1/float(num_folds))
    total_mean_absolute_error = 0.
    total_mean_square_error = 0.
    total_coverage = 0.
    num_cycles = 0

    for i in range(0, num_folds):
        print('Num cycles: %d' % i)
        start = float(i) / num_folds
        cluster_labels = None
        train_records, test_records = ETLUtils.split_train_test(
            records, split=split, start=start)
        if cache_reviews:
            train_reviews, test_reviews = ETLUtils.split_train_test(
                cache_reviews, split=split, start=start)
            if reviews_type is not None:
                cluster_labels = reviews_clusterer.cluster_reviews(test_reviews)
            recommender.reviews = train_reviews
        recommender.load(train_records)

        if cluster_labels is not None:
            separated_records = reviews_clusterer.split_list_by_labels(
                test_records, cluster_labels)
            if reviews_type == 'specific':
                test_records = separated_records[0]
            if reviews_type == 'generic':
                test_records = separated_records[1]

        _, errors, num_unknown_ratings = predict_rating_list(recommender, test_records)
        recommender.clear()
        mean_absolute_error = MeanAbsoluteError.compute_list(errors)
        root_mean_square_error = RootMeanSquareError.compute_list(errors)
        num_samples = len(test_records)
        coverage = float(num_samples - num_unknown_ratings) / num_samples
        # print('Total length:', len(test))
        # print('Unknown ratings:', num_unknown_ratings)
        # print('Coverage:', coverage)

        if mean_absolute_error is not None:
            total_mean_absolute_error += mean_absolute_error
            total_mean_square_error += root_mean_square_error
            total_coverage += coverage
            num_cycles += 1
        else:
            print('Mean absolute error is None!!!')

    final_mean_absolute_error = total_mean_absolute_error / num_cycles
    final_root_squared_error = total_mean_square_error / num_cycles
    final_coverage = total_coverage / num_cycles
    execution_time = time.time() - start_time

    print('Final mean absolute error: %f' % final_mean_absolute_error)
    print('Final root mean square error: %f' % final_root_squared_error)
    print('Final coverage: %f' % final_coverage)
    print("--- %s seconds ---" % execution_time)

    result = {
        'MAE': final_mean_absolute_error,
        'RMSE': final_root_squared_error,
        'Coverage': final_coverage,
        'Execution time': execution_time
    }

    return result
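
The per-fold errors above are reduced with MeanAbsoluteError.compute_list and RootMeanSquareError.compute_list, which are not shown here. The sketch below gives the standard MAE/RMSE definitions those helpers presumably implement; the None handling is an assumption, suggested only by the `is not None` check in the loop above.

import math


def mean_absolute_error(errors):
    # Sketch of the standard MAE definition over a list of prediction errors.
    if not errors:
        return None
    return sum(abs(e) for e in errors) / float(len(errors))


def root_mean_square_error(errors):
    # Sketch of the standard RMSE definition over a list of prediction errors.
    if not errors:
        return None
    return math.sqrt(sum(e * e for e in errors) / float(len(errors)))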
Example #8
def calculate_recall_in_top_n(
        records, recommender, n, num_folds, split=None, min_score=5.0,
        cache_reviews=None, reviews_type=None):

    start_time = time.time()
    if split is None:
        split = 1 - (1/float(num_folds))
    # split = 0.984
    total_recall = 0.
    total_coverage = 0.
    num_cycles = 0.0

    for i in xrange(0, num_folds):
        print('Fold', i)
        print('started training', time.strftime("%Y/%d/%m-%H:%M:%S"))
        start = float(i) / num_folds
        cluster_labels = None
        train_records, test_records = ETLUtils.split_train_test(
            records, split=split, shuffle_data=False, start=start)
        if cache_reviews:
            train_reviews, test_reviews = ETLUtils.split_train_test(
                cache_reviews, split=split, shuffle_data=False, start=start)
            if reviews_type is not None:
                cluster_labels = reviews_clusterer.cluster_reviews(test_reviews)
            recommender.reviews = train_reviews
        recommender.load(train_records)

        print('finished training', time.strftime("%Y/%d/%m-%H:%M:%S"))

        if cluster_labels is not None:
            separated_records = reviews_clusterer.split_list_by_labels(
                test_records, cluster_labels)
            if reviews_type == 'specific':
                test_records = separated_records[0]
            if reviews_type == 'generic':
                test_records = separated_records[1]

        positive_reviews = \
            [review for review in test_records if review['overall_rating'] >= min_score]

        if len(positive_reviews) == 0:
            continue

        num_hits = 0.0
        num_predictions = 0.0
        for review in positive_reviews:
            user_id = review['user_id']
            item_id = review['offering_id']
            if not recommender.has_context:
                hit = calculate_is_a_hit(
                    test_records, recommender, user_id, item_id, n)
            else:
                text_review = review['text']
                hit = calculate_is_a_hit(
                    test_records, recommender, user_id, item_id, n, text_review)
            if hit is None:
                continue
            if hit:
                num_hits += 1
            num_predictions += 1
            # print('num predictions: %d/%d %s' % (num_predictions, len(positive_reviews), time.strftime("%Y/%d/%m-%H:%M:%S")))

        if num_predictions == 0:
            continue

        recommender.clear()
        recall = num_hits / num_predictions
        coverage = num_predictions / len(positive_reviews)
        print('recall', recall, time.strftime("%Y/%d/%m-%H:%M:%S"))
        print('coverage', coverage, time.strftime("%Y/%d/%m-%H:%M:%S"))
        total_recall += recall
        total_coverage += coverage
        num_cycles += 1

    final_recall = total_recall / num_cycles
    final_coverage = total_coverage / num_cycles
    execution_time = time.time() - start_time

    print('Final Top N Precision: %f' % final_recall)
    print('Final Coverage: %f' % final_coverage)
    print("--- %s seconds ---" % execution_time)

    result = {
        'Top N': final_recall,
        'Coverage': final_coverage,
        'Execution time': execution_time
    }

    return result
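
For concreteness, this is the fold layout both recall functions rely on: with the default split the training fraction is 1 - 1/num_folds, and each fold's held-out window starts at i/num_folds of the way through the records. A tiny standalone illustration of that arithmetic (toy fold count only):

num_folds = 5
split = 1 - (1 / float(num_folds))  # 0.8 -> 80% of the records train each fold
for i in range(num_folds):
    start = float(i) / num_folds    # 0.0, 0.2, 0.4, 0.6, 0.8
    print('fold %d: train split %.1f, test window starts at %.1f'
          % (i, split, start))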