import operator

import pandas

from etl import ETLUtils
from etl.reviews_dataset_analyzer import ReviewsDatasetAnalyzer
from utils.constants import Constants  # module path for Constants is an assumption


def dataset_bucket_analysis_by_field(field):
    # Set the dataset
    hotel_dataset_properties = {Constants.BUSINESS_TYPE_FIELD: 'fourcity_hotel'}
    Constants.update_properties(hotel_dataset_properties)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    print('Loaded %d records' % len(records))

    # Count how many records share each distinct value of the given field
    # (e.g. reviews per user when field is the user ID field)
    frequency_map = {}
    for record in records:
        field_value = record[field]
        if field_value not in frequency_map:
            frequency_map[field_value] = 0
        frequency_map[field_value] += 1

    print('There are a total of %d %ss' % (len(frequency_map), field))

    # Show the three field values with the most records
    sorted_x = sorted(
        frequency_map.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_x[0])
    print(sorted_x[1])
    print(sorted_x[2])
    # print(frequency_map)

    # Number of reviews per field value
    rda = ReviewsDatasetAnalyzer(records)
    users_summary = rda.summarize_reviews_by_field(field)
    print('Average number of reviews per %s: %f' %
          (field, float(rda.num_reviews) / len(frequency_map)))
    users_summary.plot(kind='line', rot=0)

    pandas.set_option('display.max_rows', len(users_summary))
    print(users_summary)
    pandas.reset_option('display.max_rows')
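# Usage sketch: bucket the processed records by user and by item. The two
# field constants are taken from the fields used by the pipeline methods
# below; any field present in the records should work.
dataset_bucket_analysis_by_field(Constants.USER_ID_FIELD)
dataset_bucket_analysis_by_field(Constants.ITEM_ID_FIELD)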
# The two methods below belong to the records preprocessor class and rely on
# os, time, utilities, extractor and Constants being imported at module level.
def full_cycle(self):
    Constants.print_properties()
    print('%s: full cycle' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    utilities.plant_seeds()

    if self.use_cache and \
            os.path.exists(Constants.PROCESSED_RECORDS_FILE):
        print('Records have already been processed')
        self.records = \
            ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    else:
        self.load_records()

        if 'yelp' in Constants.ITEM_TYPE:
            self.transform_yelp_records()
        elif 'fourcity' in Constants.ITEM_TYPE:
            self.transform_fourcity_records()

        self.add_integer_ids()
        self.clean_reviews()
        self.remove_duplicate_reviews()
        self.tag_reviews_language()
        self.remove_foreign_reviews()
        self.lemmatize_records()
        self.remove_users_with_low_reviews()
        self.remove_items_with_low_reviews()
        self.count_frequencies()
        self.shuffle_records()
        print('total_records: %d' % len(self.records))
        self.classify_reviews()
        self.build_bag_of_words()
        self.tag_contextual_reviews()
        # self.load_full_records()
        self.build_dictionary()
        self.build_corpus()
        self.label_review_targets()
        self.export_records()
        self.count_specific_generic_ratio()
        # self.export_to_triplet()

    # Report density, sparsity and basic counts for the final record set
    rda = ReviewsDatasetAnalyzer(self.records)
    print('density: %f' % rda.calculate_density_approx())
    print('sparsity: %f' % rda.calculate_sparsity_approx())
    print('total_records: %d' % len(self.records))
    user_ids = \
        extractor.get_groupby_list(self.records, Constants.USER_ID_FIELD)
    item_ids = \
        extractor.get_groupby_list(self.records, Constants.ITEM_ID_FIELD)
    print('total users: %d' % len(user_ids))
    print('total items: %d' % len(item_ids))

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        self.separate_recsys_topic_model_records()
# Like full_cycle(), but always re-runs every step (no cache), additionally
# removes reviews that appear in the classifier training set, and exports
# the user-item-rating triplets.
def preprocess(self):
    self.load_records()

    if 'yelp' in Constants.ITEM_TYPE:
        self.transform_yelp_records()
    elif 'fourcity' in Constants.ITEM_TYPE:
        self.transform_fourcity_records()

    self.add_integer_ids()
    self.clean_reviews()
    self.remove_duplicate_reviews()
    self.tag_reviews_language()
    self.remove_foreign_reviews()
    self.remove_reviews_from_classifier_training_set()
    self.lemmatize_records()
    self.remove_users_with_low_reviews()
    self.remove_items_with_low_reviews()
    self.count_frequencies()
    self.shuffle_records()
    print('total_records: %d' % len(self.records))
    self.classify_reviews()
    self.build_bag_of_words()
    self.tag_contextual_reviews()
    # self.load_full_records()
    self.build_dictionary()
    self.build_corpus()
    self.label_review_targets()
    self.export_records()
    self.count_specific_generic_ratio()
    self.export_to_triplet()

    rda = ReviewsDatasetAnalyzer(self.records)
    print('density: %f' % rda.calculate_density_approx())
    print('sparsity: %f' % rda.calculate_sparsity_approx())
    print('total_records: %d' % len(self.records))
    user_ids = \
        extractor.get_groupby_list(self.records, Constants.USER_ID_FIELD)
    item_ids = \
        extractor.get_groupby_list(self.records, Constants.ITEM_ID_FIELD)
    print('total users: %d' % len(user_ids))
    print('total items: %d' % len(item_ids))
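# Usage sketch for the two pipeline methods above. The class name below is
# an assumption (these are methods of the records preprocessor class):
# full_cycle() reuses cached processed records when `use_cache` is set,
# while preprocess() always re-runs every step.
#
#     preprocessor = ReviewsPreprocessor()  # hypothetical class name
#     preprocessor.use_cache = True
#     preprocessor.full_cycle()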
def test_count_items_in_common(self):
    # count_items_in_common() returns a histogram that maps "number of
    # items two users have reviewed in common" to the number of user
    # pairs with that overlap
    expected_value = {1: 23, 2: 4, 3: 1}
    rda = ReviewsDatasetAnalyzer(reviews)
    actual_value = rda.count_items_in_common()
    self.assertEqual(expected_value, actual_value)

    expected_value = {0: 1}
    rda = ReviewsDatasetAnalyzer(reviews_small)
    actual_value = rda.count_items_in_common()
    self.assertEqual(expected_value, actual_value)
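# For clarity, a minimal sketch of the contract the test above asserts;
# this is an illustration, not the class's actual implementation. The
# 'user_id' and 'offering_id' field names are the ones used elsewhere in
# this project.
from collections import Counter, defaultdict
from itertools import combinations


def count_items_in_common_sketch(reviews):
    # Group the items each user has reviewed
    items_by_user = defaultdict(set)
    for review in reviews:
        items_by_user[review['user_id']].add(review['offering_id'])
    # Histogram: overlap size -> number of user pairs with that overlap
    counts = Counter(
        len(items_a & items_b)
        for items_a, items_b in combinations(items_by_user.values(), 2))
    return dict(counts)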
def test_calculate_sparsity_approx(self):
    # A fully observed 3x3 user-item matrix has sparsity 0
    expected_value = 0.
    rda = ReviewsDatasetAnalyzer(reviews_matrix_3)
    actual_value = rda.calculate_sparsity_approx()
    self.assertEqual(expected_value, actual_value)

    expected_value = 1 - 6. / 9
    rda = ReviewsDatasetAnalyzer(reviews_matrix_3_1)
    actual_value = rda.calculate_sparsity_approx()
    self.assertEqual(expected_value, actual_value)

    expected_value = 1 - 3. / 9
    rda = ReviewsDatasetAnalyzer(reviews_matrix_3_2)
    actual_value = rda.calculate_sparsity_approx()
    self.assertEqual(expected_value, actual_value)

    expected_value = 1 - 15. / 24
    rda = ReviewsDatasetAnalyzer(reviews)
    actual_value = rda.calculate_sparsity_approx()
    self.assertEqual(expected_value, actual_value)
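# The expected values above follow the usual sparsity definition:
# sparsity = 1 - observed_ratings / (num_users * num_items). A quick
# self-contained check of the 1 - 6/9 case (3 users, 3 items, 6 observed
# user-item pairs); the helper is illustrative, not part of the class.
def sparsity(num_ratings, num_users, num_items):
    return 1 - float(num_ratings) / (num_users * num_items)


assert abs(sparsity(6, 3, 3) - (1 - 6. / 9)) < 1e-12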
from IPython.nbformat import current as nbf  # legacy IPython notebook API


def generate_report(reviews, dataset_name, file_name, load_reviews_code):
    nb = nbf.new_notebook()

    title = '# ' + dataset_name + ' Dataset Analysis'
    title_cell = nbf.new_text_cell(u'markdown', title)

    rda = ReviewsDatasetAnalyzer(reviews)
    num_reviews = len(rda.reviews)
    num_users = len(rda.user_ids)
    num_items = len(rda.item_ids)
    user_avg_reviews = float(num_reviews) / num_users
    item_avg_reviews = float(num_reviews) / num_items
    sparsity = rda.calculate_sparsity_approx()

    fact_sheet_text =\
        '## Fact Sheet\n' +\
        'The ' + dataset_name + ' dataset contains:\n' +\
        '* ' + str(num_reviews) + ' reviews\n' +\
        '* Made by ' + str(num_users) + ' users\n' +\
        '* About ' + str(num_items) + ' items\n' +\
        '* It has an approximate sparsity of ' + str(sparsity) + '\n' +\
        '\nNow we are going to analyze the number of reviews per user ' \
        'and per item'
    fact_sheet_cell = nbf.new_text_cell(u'markdown', fact_sheet_text)

    reviews_analysis_code =\
        'import sys\n' +\
        'sys.path.append(\'/Users/fpena/UCC/Thesis/projects/yelp/source/python\')\n' +\
        'from etl import ETLUtils\n\n' +\
        'from etl.reviews_dataset_analyzer import ReviewsDatasetAnalyzer\n' +\
        '\n# Load reviews\n' + load_reviews_code + '\n' +\
        'rda = ReviewsDatasetAnalyzer(reviews)\n'
    reviews_analysis_cell = nbf.new_code_cell(reviews_analysis_code)

    user_analysis_text =\
        '## Users Reviews Analysis\n' +\
        '* The average number of reviews per user is ' +\
        str(user_avg_reviews) + '\n' +\
        '* The minimum number of reviews a user has is ' +\
        str(min(rda.users_count)) + '\n' +\
        '* The maximum number of reviews a user has is ' +\
        str(max(rda.users_count))
    user_analysis_cell = nbf.new_text_cell(u'markdown', user_analysis_text)

    counts_per_user_code =\
        '# Number of reviews per user\n' +\
        'users_summary = rda.summarize_reviews_by_field(\'user_id\')\n' +\
        'print(\'Average number of reviews per user\', float(rda.num_reviews)/rda.num_users)\n' +\
        'users_summary.plot(kind=\'line\', rot=0)'
    counts_per_user_cell = nbf.new_code_cell(counts_per_user_code)

    item_analysis_text =\
        '## Items Reviews Analysis\n' +\
        '* The average number of reviews per item is ' +\
        str(item_avg_reviews) + '\n' +\
        '* The minimum number of reviews an item has is ' +\
        str(min(rda.items_count)) + '\n' +\
        '* The maximum number of reviews an item has is ' +\
        str(max(rda.items_count))
    item_analysis_cell = nbf.new_text_cell(u'markdown', item_analysis_text)

    counts_per_item_code =\
        '# Number of reviews per item\n' +\
        'items_summary = rda.summarize_reviews_by_field(\'offering_id\')\n' +\
        'print(\'Average number of reviews per item\', float(rda.num_reviews)/rda.num_items)\n' +\
        'items_summary.plot(kind=\'line\', rot=0)'
    counts_per_item_cell = nbf.new_code_cell(counts_per_item_code)

    common_items_text =\
        '## Number of items 2 users have in common\n' +\
        'In this section we are going to count the number of items two ' \
        'users have in common'
    common_items_text_cell = nbf.new_text_cell(u'markdown', common_items_text)

    common_items_code =\
        '# Number of items 2 users have in common\n' +\
        'import matplotlib.pyplot as plt\n' +\
        'common_item_counts = rda.count_items_in_common()\n' +\
        'plt.plot(common_item_counts.keys(), common_item_counts.values())\n'
    common_items_code_cell = nbf.new_code_cell(common_items_code)

    common_items_box_code =\
        'from pylab import boxplot\n' +\
        'my_data = [key for key, value in common_item_counts.iteritems() for i in xrange(value)]\n' +\
        'mean_common_items = float(sum(my_data))/len(my_data)\n' +\
        'print(\'Average number of common items between two users:\', mean_common_items)\n' +\
        'boxplot(my_data)'
    common_items_box_cell = nbf.new_code_cell(common_items_box_code)

    cells = [
        title_cell,
        fact_sheet_cell,
        reviews_analysis_cell,
        user_analysis_cell,
        counts_per_user_cell,
        item_analysis_cell,
        counts_per_item_cell,
        common_items_text_cell,
        common_items_code_cell,
        common_items_box_cell,
    ]
    nb['worksheets'].append(nbf.new_worksheet(cells=cells))

    with open(file_name, 'w') as f:
        nbf.write(nb, f, 'ipynb')
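# Usage sketch for generate_report(). The file paths and dataset name are
# placeholders; load_reviews_code must be a Python snippet that, when run
# inside the generated notebook, leaves a `reviews` list defined.
reviews = ETLUtils.load_json_file('/tmp/hotel_reviews.json')  # placeholder path
load_reviews_code =\
    'file_path = \'/tmp/hotel_reviews.json\'\n' +\
    'reviews = ETLUtils.load_json_file(file_path)'
generate_report(
    reviews, 'Fourcity Hotel', '/tmp/hotel_report.ipynb', load_reviews_code)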