def clean_reviews(reviews):
    """
    Return a filtered copy of the original reviews list keeping only the
    reviews that are useful for recommendation purposes.

    Users with fewer than 10 reviews and items with fewer than 20 reviews
    are pruned. Because removing sparse items can push users back under
    their threshold (and vice versa), the user/item pruning pair is applied
    five times in total, matching the original hand-unrolled sequence.

    :param reviews: a list of reviews
    :return: a filtered copy of the original reviews list
    """
    min_user_reviews = 10
    min_item_reviews = 20
    num_passes = 5

    # filtered_reviews = remove_empty_user_reviews(reviews)
    # filtered_reviews = remove_missing_ratings_reviews(filtered_reviews)
    # print('Finished remove_missing_ratings_reviews')
    filtered_reviews = extractor.remove_users_with_low_reviews(
        reviews, min_user_reviews)
    print('Finished remove_users_with_low_reviews')
    filtered_reviews = extractor.remove_items_with_low_reviews(
        filtered_reviews, min_item_reviews)
    print('Finished remove_single_review_hotels')

    # Remaining pruning rounds (the original code unrolled these by hand).
    for _ in range(num_passes - 1):
        filtered_reviews = extractor.remove_users_with_low_reviews(
            filtered_reviews, min_user_reviews)
        filtered_reviews = extractor.remove_items_with_low_reviews(
            filtered_reviews, min_item_reviews)

    print('Finished remove_users_with_low_reviews')
    print('Number of reviews', len(filtered_reviews))
    return filtered_reviews
def main():
    """Load the shuffled restaurant reviews, keep only prolific users and
    wrap every remaining record in a ``Review`` object.

    NOTE(review): ``my_num_topics`` is assigned but never used here —
    presumably left over from a topic-model experiment; kept for parity.
    """
    # reviews_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review_hotels.json"
    # reviews_file = "/Users/fpena/UCC/Thesis/datasets/context/yelp_training_set_review_hotels_shuffled.json"
    reviews_file = (
        "/Users/fpena/UCC/Thesis/datasets/context/"
        "yelp_training_set_review_restaurants_shuffled.json")
    # reviews_file = "/Users/fpena/UCC/Thesis/datasets/context/yelp_training_set_review_spas_shuffled.json"

    my_records = load_data(reviews_file)
    print("records:", len(my_records))
    my_num_topics = 150
    print("\n***************************\n")

    # Keep only users with at least 200 reviews.
    my_records = extractor.remove_users_with_low_reviews(my_records, 200)
    # shuffle(my_records)

    my_reviews = []
    my_index = 0
    for my_index, record in enumerate(my_records, start=1):
        wrapped = Review(record['text'])
        wrapped.id = record['review_id']
        my_reviews.append(wrapped)
        print('index', my_index)
def parallel_run_topn_test(
        records_file, recommenders, binary_reviews_file, reviews_type=None):
    """Run the top-N recommender test for every recommender in parallel.

    Loads the records, prunes users with fewer than 20 reviews, pairs the
    records with their cached binary reviews, fans the evaluations out over
    all CPUs and finally persists one CSV row of results per recommender.

    :param records_file: path to the records file
    :param recommenders: list of recommender objects to evaluate
    :param binary_reviews_file: path to a pickled list of pre-processed
        reviews, one entry per record
    :param reviews_type: optional tag describing the kind of reviews used
    :return: the list of raw results, one per recommender
    :raise ValueError: if the records and the cached reviews differ in length
    """
    records = context_recommender_tests.load_records(records_file)
    records = extractor.remove_users_with_low_reviews(records, 20)

    # NOTE(review): pickle.load executes arbitrary code if the file is
    # untrusted -- confirm this cache is always produced locally.
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    top_n = 10
    min_like_score = 5.0

    # One argument tuple per recommender; the single-element lists are held
    # constant by itertools.product.
    args = itertools.product(
        [records], recommenders, [top_n], [num_folds], [split],
        [min_like_score], [binary_reviews], [reviews_type]
    )

    print('Total recommenders: %d' % (len(recommenders)))

    pool = Pool()
    try:
        print('Total CPUs: %d' % pool._processes)
        results_list = pool.map(run_topn_test_wrapper, args)
    finally:
        # Always reap the worker processes, even if map() raises
        # (the original leaked the pool on the error path).
        pool.close()
        pool.join()

    # After we have finished executing, we process the results
    dataset_info_map = {
        'dataset': records_file.split('/')[-1],
        'cache_reviews': binary_reviews_file.split('/')[-1],
        'num_records': len(records),
        'reviews_type': reviews_type,
        'cross_validation_folds': num_folds,
        'min_like_score': min_like_score,
        'top_n': top_n,
    }

    results_log_list = [
        context_recommender_tests.process_topn_results(
            recommender, results, dataset_info_map)
        for recommender, results in zip(recommenders, results_list)
    ]

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results-parallel' + timestamp

    ETLUtils.save_csv_file(
        file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')

    return results_list
def remove_users_with_low_reviews(self):
    """Drop users whose review count falls below the configured minimum.

    Reads ``Constants.MIN_REVIEWS_PER_USER``; when it is unset or below 2
    ``self.records`` is left untouched, otherwise it is replaced by the
    pruned record list.
    """
    print('%s: remove users with low reviews' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    # Remove from the dataset users with a low number of reviews
    threshold = Constants.MIN_REVIEWS_PER_USER
    if threshold is not None and threshold >= 2:
        self.records = extractor.remove_users_with_low_reviews(
            self.records, threshold)
def parallel_run_topn_test(records_file, recommenders, binary_reviews_file,
                           reviews_type=None):
    """Evaluate every recommender on a top-N test using a process pool.

    The records are loaded and pruned of users with fewer than 20 reviews,
    matched against the cached binary reviews, evaluated in parallel, and
    the per-recommender results are written to a timestamped CSV file.

    :raise ValueError: when the cached reviews do not match the records.
    """
    records = context_recommender_tests.load_records(records_file)
    records = extractor.remove_users_with_low_reviews(records, 20)

    with open(binary_reviews_file, 'rb') as cache_file:
        binary_reviews = pickle.load(cache_file)

    if len(binary_reviews) != len(records):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    top_n = 10
    min_like_score = 5.0

    task_args = itertools.product(
        [records], recommenders, [top_n], [num_folds], [split],
        [min_like_score], [binary_reviews], [reviews_type])

    print('Total recommenders: %d' % (len(recommenders)))

    worker_pool = Pool()
    print('Total CPUs: %d' % worker_pool._processes)
    results_list = worker_pool.map(run_topn_test_wrapper, task_args)
    worker_pool.close()
    worker_pool.join()

    # After we have finished executing, we process the results
    dataset_info_map = {
        'dataset': records_file.split('/')[-1],
        'cache_reviews': binary_reviews_file.split('/')[-1],
        'num_records': len(records),
        'reviews_type': reviews_type,
        'cross_validation_folds': num_folds,
        'min_like_score': min_like_score,
        'top_n': top_n,
    }

    results_log_list = []
    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(
            context_recommender_tests.process_topn_results(
                recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results-parallel' + timestamp
    ETLUtils.save_csv_file(file_name + '.csv', results_log_list,
                           TOPN_HEADERS, '\t')

    return results_list
def clean_reviews(reviews):
    """
    Returns a copy of the original reviews list with only that are useful
    for recommendation purposes

    :param reviews: a list of reviews
    :return: a copy of the original reviews list with only that are useful
    for recommendation purposes
    """
    # filtered_reviews = remove_empty_user_reviews(reviews)
    # filtered_reviews = remove_missing_ratings_reviews(filtered_reviews)
    # print('Finished remove_missing_ratings_reviews')
    filtered = extractor.remove_users_with_low_reviews(reviews, 10)
    print('Finished remove_users_with_low_reviews')
    filtered = extractor.remove_items_with_low_reviews(filtered, 20)
    print('Finished remove_single_review_hotels')

    # Four more silent pruning rounds, exactly as in the unrolled original:
    # each one drops users with < 10 reviews, then items with < 20 reviews.
    for _ in range(4):
        filtered = extractor.remove_users_with_low_reviews(filtered, 10)
        filtered = extractor.remove_items_with_low_reviews(filtered, 20)

    print('Finished remove_users_with_low_reviews')
    print('Number of reviews', len(filtered))
    return filtered
def main():
    """Evaluate a BasicKNN recommender on the hotels review dataset.

    Loads the records, builds the ratings matrix, shuffles the data, splits
    it 80/20, trains ``BasicKNN`` on the training part and reports the
    recall-in-top-N of the recommender.
    """
    reviews_file =\
        "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review_hotels.json"
    my_records = load_data(reviews_file)
    # NOTE(review): the matrix is built but unused in the active code path;
    # kept in case create_ratings_matrix has side effects -- confirm.
    my_ratings_matrix = create_ratings_matrix(my_records)

    # A threshold of 1 keeps every user; raise it to prune sparse users.
    my_records = extractor.remove_users_with_low_reviews(my_records, 1)
    print(len(my_records))

    shuffle(my_records)

    # Split 80-20 and see the results
    num_records = len(my_records)
    training_size = int(num_records * 0.8)
    my_train_data = my_records[:training_size]
    # Held-out slice; currently unused because recall is computed over the
    # full record set below.
    my_test_data = my_records[training_size:]

    basic_knn = BasicKNN(None)
    basic_knn.load(my_train_data)

    # recommender_evaluator.perform_cross_validation(my_records, basic_knn, 3)
    # precision_in_top_n.calculate_top_n_precision(my_records, basic_knn, 10000, 5.0, 5)
    precision_in_top_n.calculate_recall_in_top_n(my_records, basic_knn, 10, 5)