def calculate_sparsity(self):
    """
    Returns the percentage of missing ratings in the list of reviews of this
    ReviewsDatasetAnalyzer

    :return: the rate of missing ratings
    (i.e. number of missing ratings / (number of items * number of users))
    :raise ValueError: in case an empty list is given
    """
    if not self.reviews:
        raise ValueError("Can not determine the sparsity for an empty list")

    # Single pass over the reviews: collect the distinct users, the distinct
    # items, and the distinct (user, item) pairs that actually have a rating.
    # The original version re-filtered the whole review list once per user
    # (O(users * reviews)) and intersected with the full item set, which was
    # redundant because a user's items are always a subset of all items.
    user_ids = set()
    item_ids = set()
    rated_pairs = set()
    for review in self.reviews:
        user_id = review['user_id']
        item_id = review['offering_id']
        user_ids.add(user_id)
        item_ids.add(item_id)
        rated_pairs.add((user_id, item_id))

    # float() keeps the division exact on Python 2 as well.
    total_expected_reviews = float(len(user_ids) * len(item_ids))
    return 1 - len(rated_pairs) / total_expected_reviews
def load(self, reviews):
    """
    Stores the given reviews and precomputes the user/item id lists,
    their sizes and the ratings data matrix.

    :param reviews: the list of reviews to load
    """
    self.reviews = reviews
    users = extractor.get_groupby_list(reviews, "user_id")
    items = extractor.get_groupby_list(reviews, "offering_id")
    self.user_ids = users
    self.num_users = len(users)
    self.item_ids = items
    self.num_items = len(items)
    self.data_matrix = create_matrix(reviews)
def initialize(self):
    """
    Builds the user and item id lists from the loaded records, reports their
    sizes, and prepares the user->items map and the important records.
    """
    records = self.records
    self.user_ids = extractor.get_groupby_list(records, Constants.USER_ID_FIELD)
    self.item_ids = extractor.get_groupby_list(records, Constants.ITEM_ID_FIELD)
    print('total users', len(self.user_ids))
    print('total items', len(self.item_ids))
    self.user_item_map = self.create_user_item_map()
    self.find_important_records()
def full_cycle(self):
    """
    Runs the full preprocessing pipeline: loads (or reuses cached) records,
    cleans and filters them, classifies reviews, builds the topic-model
    inputs, exports the processed records and prints dataset statistics.
    """
    Constants.print_properties()
    print('%s: full cycle' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    utilities.plant_seeds()

    # Reuse already-processed records when caching is enabled; otherwise run
    # the whole cleaning/filtering chain. NOTE(review): step order matters —
    # e.g. language tagging must precede foreign-review removal.
    if self.use_cache and \
            os.path.exists(Constants.PROCESSED_RECORDS_FILE):
        print('Records have already been processed')
        self.records = \
            ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    else:
        self.load_records()
        # Dataset-specific record normalization.
        if 'yelp' in Constants.ITEM_TYPE:
            self.transform_yelp_records()
        elif 'fourcity' in Constants.ITEM_TYPE:
            self.transform_fourcity_records()
        self.add_integer_ids()
        self.clean_reviews()
        self.remove_duplicate_reviews()
        self.tag_reviews_language()
        self.remove_foreign_reviews()
        self.lemmatize_records()
        self.remove_users_with_low_reviews()
        self.remove_items_with_low_reviews()
        self.count_frequencies()
        self.shuffle_records()
    print('total_records: %d' % len(self.records))

    # Classification and topic-model input construction.
    self.classify_reviews()
    self.build_bag_of_words()
    self.tag_contextual_reviews()
    # self.load_full_records()
    self.build_dictionary()
    self.build_corpus()
    self.label_review_targets()
    self.export_records()
    self.count_specific_generic_ratio()
    # self.export_to_triplet()

    # Dataset statistics for logging purposes only.
    rda = ReviewsDatasetAnalyzer(self.records)
    print('density: %f' % rda.calculate_density_approx())
    print('sparsity: %f' % rda.calculate_sparsity_approx())
    print('total_records: %d' % len(self.records))
    user_ids = \
        extractor.get_groupby_list(self.records, Constants.USER_ID_FIELD)
    item_ids = \
        extractor.get_groupby_list(self.records, Constants.ITEM_ID_FIELD)
    print('total users', len(user_ids))
    print('total items', len(item_ids))

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        self.separate_recsys_topic_model_records()
def get_unknown_items(reviews, user_id, num_unknown=1000):
    """
    Returns up to num_unknown item ids that the given user has not rated.

    :param reviews: the list of reviews
    :param user_id: the id of the user
    :param num_unknown: the maximum number of unknown items to return
    :return: a list with the items present in the dataset but absent from the
    user's own review history
    """
    item_ids = extractor.get_groupby_list(reviews, 'offering_id')
    user_reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])
    rated = set(extractor.get_groupby_list(user_reviews, 'offering_id'))

    # Keep the catalogue order while dropping everything the user has rated.
    unknown_items = [item for item in item_ids if item not in rated]

    # TODO: Uncomment this line, the items have to be shuffled
    # shuffle(unknown_items)

    return unknown_items[:num_unknown]
def __init__(self, reviews):
    """
    Initializes the analyzer over a non-empty review list, precomputing the
    user/item id lists, their counts and the per-user/per-item review tallies.

    :param reviews: the list of reviews to analyze
    :raise ValueError: in case an empty list is given
    """
    if not reviews:
        raise ValueError("Can not analyze an empty list")
    self.reviews = reviews
    self.num_reviews = len(reviews)
    self.user_ids = extractor.get_groupby_list(reviews, "user_id")
    self.num_users = len(self.user_ids)
    self.item_ids = extractor.get_groupby_list(reviews, "offering_id")
    self.num_items = len(self.item_ids)
    self.data_frame = DataFrame(reviews)
    self.users_count = self.data_frame.groupby("user_id").size()
    self.items_count = self.data_frame.groupby("offering_id").size()
def __init__(self, reviews):
    """
    Builds the analyzer state from a non-empty review list: id lists, their
    sizes, a DataFrame view, and review counts grouped by user and by item.

    :param reviews: the list of reviews to analyze
    :raise ValueError: in case an empty list is given
    """
    if not reviews:
        raise ValueError('Can not analyze an empty list')
    self.reviews = reviews
    self.num_reviews = len(reviews)
    frame = DataFrame(reviews)
    self.data_frame = frame
    self.user_ids = extractor.get_groupby_list(reviews, 'user_id')
    self.item_ids = extractor.get_groupby_list(reviews, 'offering_id')
    self.num_users = len(self.user_ids)
    self.num_items = len(self.item_ids)
    self.users_count = frame.groupby('user_id').size()
    self.items_count = frame.groupby('offering_id').size()
def __init__(self, reviews):
    """
    Initializes the analyzer with the given reviews, using the field names
    declared in Constants, and precomputes counts and groupby sizes.

    :param reviews: the list of reviews to analyze
    :raise ValueError: in case an empty list is given
    """
    if not reviews:
        raise ValueError('Can not analyze an empty list')
    self.reviews = reviews
    self.user_ids = extractor.get_groupby_list(
        reviews, Constants.USER_ID_FIELD)
    self.item_ids = extractor.get_groupby_list(
        reviews, Constants.ITEM_ID_FIELD)
    self.num_reviews = len(reviews)
    self.num_users = len(self.user_ids)
    self.num_items = len(self.item_ids)
    self.data_frame = DataFrame(reviews)
    self.users_count = \
        self.data_frame.groupby(Constants.USER_ID_FIELD).size()
    self.items_count = \
        self.data_frame.groupby(Constants.ITEM_ID_FIELD).size()
def build_user_clusters(reviews, significant_criteria_ranges=None):
    """
    Builds a series of clusters for users according to their significant
    criteria. Users that have exactly the same significant criteria will
    belong to the same cluster.

    :param reviews: the list of reviews
    :param significant_criteria_ranges: the ranges used to decide which
    criteria weights are significant (passed through to the extractor)
    :return: a dictionary where all the keys are the cluster names and the
    values for those keys are list of users that belong to that cluster
    """
    user_list = extractor.get_groupby_list(reviews, "user_id")
    user_cluster_dictionary = {}

    for user in user_list:
        weights = extractor.get_criteria_weights(reviews, user)
        # Only the cluster name is needed here; the significant criteria
        # themselves were unpacked but never used.
        _, cluster_name = extractor.get_significant_criteria(
            weights, significant_criteria_ranges)
        # setdefault replaces the manual "create the list on first sight"
        # if/else branch with the idiomatic one-liner.
        user_cluster_dictionary.setdefault(cluster_name, []).append(user)

    return user_cluster_dictionary
def build_user_clusters(reviews, significant_criteria_ranges=None):
    """
    Builds a series of clusters for users according to their significant
    criteria. Users that have exactly the same significant criteria will
    belong to the same cluster.

    :param reviews: the list of reviews
    :param significant_criteria_ranges: the ranges used to decide which
    criteria weights are significant (passed through to the extractor)
    :return: a dictionary where all the keys are the cluster names and the
    values for those keys are list of users that belong to that cluster
    """
    user_list = extractor.get_groupby_list(reviews, 'user_id')
    user_cluster_dictionary = {}

    for user in user_list:
        weights = extractor.get_criteria_weights(reviews, user)
        # Only the cluster name is used; the criteria tuple was unpacked but
        # never referenced afterwards.
        _, cluster_name = extractor.get_significant_criteria(
            weights, significant_criteria_ranges)
        # setdefault replaces the manual if/else "create list on first
        # occurrence" pattern.
        user_cluster_dictionary.setdefault(cluster_name, []).append(user)

    return user_cluster_dictionary
def modify_properties_file(fold):
    """
    Writes a per-fold CARSKit configuration file: starts from the original
    template, points the ratings/test paths at the fold's folder, selects the
    recommender from Constants, and configures item ranking.

    :param fold: the cross-validation fold number, used to build file paths
    """
    # OrderedDict preserves the property order of the template file.
    with open(CARSKIT_ORIGINAL_CONF_FILE) as read_file:
        properties = jprops.load_properties(read_file, collections.OrderedDict)

    recommender = Constants.CARSKIT_RECOMMENDERS
    ratings_fold_folder = CARSKIT_RATINGS_FOLD_FOLDER % fold
    if not os.path.exists(ratings_fold_folder):
        os.makedirs(ratings_fold_folder)
    modified_file = CARSKIT_MODIFIED_CONF_FILE % (fold, recommender)
    properties['recommender'] = recommender
    # NOTE(review): 'dataset.ratings.lins' looks misspelled but appears to be
    # the literal property key CARSKit expects — confirm against the CARSKit
    # configuration template before renaming.
    properties['dataset.ratings.lins'] = \
        ratings_fold_folder + 'carskit_train.csv'
    test_file = ratings_fold_folder + 'carskit_test.csv'
    properties['evaluation.setup'] = \
        'test-set -f %s --rand-seed 1 --test-view all' % test_file

    # The topN cutoff for item ranking is set to the total number of items,
    # i.e. every item is ranked.
    records = ETLUtils.load_json_file(
        Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)
    num_items = \
        len(extractor.get_groupby_list(records, Constants.ITEM_ID_FIELD))
    extra_items = num_items
    # extra_items = 10
    if Constants.CARSKIT_ITEM_RANKING:
        properties['item.ranking'] = 'on -topN %d' % extra_items
    else:
        properties['item.ranking'] = 'off'

    with open(modified_file, 'w') as write_file:
        jprops.store_properties(write_file, properties)
def load(self, records):
    """
    Loads the given records: builds the ratings/reviews matrices and user
    structures, runs the LDA-based context discovery, and finally builds the
    context and similarity matrices.

    :param records: the list of records to load
    """
    self.records = records
    self.ratings_matrix = basic_knn.create_ratings_matrix(records)
    self.reviews_matrix = create_reviews_matrix(records)
    self.user_dictionary = extractor.initialize_users(self.records, False)
    self.user_ids = extractor.get_groupby_list(self.records, 'user_id')

    # self.lda_model =\
    #     lda_context_utils.discover_topics(text_reviews, self.num_topics)

    # Prefer pre-built review objects when available; otherwise fall back to
    # the raw review texts taken from the records.
    if self.reviews:
        lda_based_context = LdaBasedContext()
        lda_based_context.reviews = self.reviews
        lda_based_context.init_reviews()
    else:
        text_reviews = []
        for record in self.records:
            text_reviews.append(record['text'])
        lda_based_context = LdaBasedContext(text_reviews)
        lda_based_context.init_reviews()
    self.context_rich_topics = lda_based_context.get_context_rich_topics()
    self.lda_model = lda_based_context.topic_model

    print('building similarity matrix', time.strftime("%H:%M:%S"))
    # The context matrix must exist before the similarity matrix is computed.
    self.context_matrix = self.create_context_matrix(records)
    self.similarity_matrix = self.create_similarity_matrix()
    print('finished building similarity matrix', time.strftime("%H:%M:%S"))
def load(self, reviews):
    """
    Loads the reviews, builds the per-user dictionaries and clusters and,
    when a similarity metric is configured, the user similarity matrix.

    :param reviews: the list of reviews to load
    """
    self.reviews = reviews
    ranges = self._significant_criteria_ranges
    self.user_ids = extractor.get_groupby_list(reviews, "user_id")
    self.user_dictionary = extractor.initialize_cluster_users(reviews, ranges)
    self.user_cluster_dictionary = self.build_user_clusters(reviews, ranges)

    if self._similarity_matrix_builder._similarity_metric is not None:
        self.user_similarity_matrix = \
            self._similarity_matrix_builder.build_similarity_matrix(
                self.user_dictionary, self.user_ids)
def load(self):
    """
    Loads the processed records from disk (choosing the contextual file when
    topic-model/recsys reviews are kept separate) and reports basic dataset
    statistics.
    """
    print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        records_file = Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE
    else:
        records_file = Constants.PROCESSED_RECORDS_FILE
    self.original_records = ETLUtils.load_json_file(records_file)
    print('num_records: %d' % len(self.original_records))

    user_ids = extractor.get_groupby_list(
        self.original_records, Constants.USER_ID_FIELD)
    item_ids = extractor.get_groupby_list(
        self.original_records, Constants.ITEM_ID_FIELD)
    print('total users', len(user_ids))
    print('total items', len(item_ids))
def build_item_index_map(reviews):
    """
    Maps each item id found in the reviews to a sequential zero-based index.

    :param reviews: the list of reviews
    :return: a dictionary mapping item id -> matrix column index
    """
    # The original version misnamed these locals 'user_ids'/'user_id' even
    # though they hold item ids, and kept a manual counter; enumerate over
    # the grouped item ids is equivalent and clearer.
    item_ids = extractor.get_groupby_list(reviews, 'offering_id')
    return {item_id: index for index, item_id in enumerate(item_ids)}
def build_user_index_map(reviews):
    """
    Maps each user id found in the reviews to a sequential zero-based index.

    :param reviews: the list of reviews
    :return: a dictionary mapping user id -> matrix row index
    """
    # enumerate replaces the manual index counter of the original version.
    user_ids = extractor.get_groupby_list(reviews, 'user_id')
    return {user_id: index for index, user_id in enumerate(user_ids)}
def preprocess(self):
    """
    Runs the record preprocessing chain unconditionally (no cache check):
    loading, cleaning, filtering, classification, topic-model input
    construction, export, and final dataset statistics.
    """
    self.load_records()
    # Dataset-specific record normalization.
    if 'yelp' in Constants.ITEM_TYPE:
        self.transform_yelp_records()
    elif 'fourcity' in Constants.ITEM_TYPE:
        self.transform_fourcity_records()
    self.add_integer_ids()
    self.clean_reviews()
    self.remove_duplicate_reviews()
    # Language tagging must happen before foreign reviews can be removed.
    self.tag_reviews_language()
    self.remove_foreign_reviews()
    self.remove_reviews_from_classifier_training_set()
    self.lemmatize_records()
    self.remove_users_with_low_reviews()
    self.remove_items_with_low_reviews()
    self.count_frequencies()
    self.shuffle_records()
    print('total_records: %d' % len(self.records))

    # Classification and topic-model input construction.
    self.classify_reviews()
    self.build_bag_of_words()
    self.tag_contextual_reviews()
    # self.load_full_records()
    self.build_dictionary()
    self.build_corpus()
    self.label_review_targets()
    self.export_records()
    self.count_specific_generic_ratio()
    self.export_to_triplet()

    # Dataset statistics for logging purposes only.
    rda = ReviewsDatasetAnalyzer(self.records)
    print('density: %f' % rda.calculate_density_approx())
    print('sparsity: %f' % rda.calculate_sparsity_approx())
    print('total_records: %d' % len(self.records))
    user_ids = \
        extractor.get_groupby_list(self.records, Constants.USER_ID_FIELD)
    item_ids = \
        extractor.get_groupby_list(self.records, Constants.ITEM_ID_FIELD)
    print('total users', len(user_ids))
    print('total items', len(item_ids))
def create_user_item_map(records):
    """
    Builds a map from each user id to the list of item ids that user has
    reviewed.

    :param records: the list of review records
    :return: a dictionary mapping user id -> list of item ids
    """
    import sys

    user_ids = extractor.get_groupby_list(records, constants.USER_ID_FIELD)
    user_item_map = {}
    user_count = 0

    for user_id in user_ids:
        user_records = ETLUtils.filter_records(
            records, constants.USER_ID_FIELD, [user_id])
        user_items = \
            extractor.get_groupby_list(user_records, constants.ITEM_ID_FIELD)
        user_item_map[user_id] = user_items
        user_count += 1

        # Progress indicator; '\r' keeps it on a single line. Written through
        # sys.stdout because the original used a Python 2-only print
        # statement ("print 'user count: ...'\r,"), which is a syntax error
        # on Python 3 while the rest of the file uses portable prints.
        sys.stdout.write('user count: {0}\r'.format(user_count))
        sys.stdout.flush()
    sys.stdout.write('\n')

    return user_item_map
def load(self, reviews):
    """
    Loads the reviews and initializes the user dictionary; builds the user
    similarity matrix when a similarity metric is configured.

    :param reviews: the list of reviews to load
    """
    self.reviews = reviews
    builder = self._similarity_matrix_builder
    self.user_dictionary = extractor.initialize_users(
        reviews, builder._is_multi_criteria)
    self.user_ids = extractor.get_groupby_list(reviews, 'user_id')

    if builder._similarity_metric is not None:
        self.user_similarity_matrix = builder.build_similarity_matrix(
            self.user_dictionary, self.user_ids)
def load(self, reviews):
    """
    Loads the reviews, builds the clustered user dictionary and the user
    clusters, and (when a metric is configured) the similarity matrix.

    :param reviews: the list of reviews to load
    """
    self.reviews = reviews
    ranges = self._significant_criteria_ranges
    builder = self._similarity_matrix_builder
    self.user_ids = extractor.get_groupby_list(reviews, 'user_id')
    self.user_dictionary = extractor.initialize_cluster_users(reviews, ranges)
    self.user_cluster_dictionary = self.build_user_clusters(reviews, ranges)

    if builder._similarity_metric is not None:
        self.user_similarity_matrix = builder.build_similarity_matrix(
            self.user_dictionary, self.user_ids)
def calculate_sparsity_approx(self):
    """
    Returns the approximate percentage of missing ratings in the list of
    reviews of this ReviewsDatasetAnalyzer. It is an approximation because a
    user who reviewed the same item twice is counted twice, when the correct
    count should be one. It exists so that very big datasets, for which the
    exact sparsity is very slow to compute, can still be summarized.

    :return: the rate of approximate missing ratings
    (i.e. number of missing ratings / (number of reviews))
    :raise ValueError: in case an empty list is given
    """
    if not self.reviews:
        raise ValueError('Can not determine the sparsity for an empty list')

    num_users = len(extractor.get_groupby_list(self.reviews, 'user_id'))
    num_items = len(extractor.get_groupby_list(self.reviews, 'offering_id'))
    # Every review is assumed to be a distinct (user, item) rating.
    density = float(len(self.reviews)) / float(num_users * num_items)
    return 1 - density
def calculate_sparsity_approx(self):
    """
    Returns the approximate percentage of missing ratings in the list of
    reviews of this ReviewsDatasetAnalyzer. The value is approximate because
    duplicate reviews (same user, same item) are each counted, when the
    correct count should be one. Intended for very big datasets where the
    exact sparsity computation is too slow.

    :return: the rate of approximate missing ratings
    (i.e. number of missing ratings / (number of reviews))
    :raise ValueError: in case an empty list is given
    """
    if not self.reviews:
        raise ValueError("Can not determine the sparsity for an empty list")

    num_users = len(extractor.get_groupby_list(self.reviews, "user_id"))
    num_items = len(extractor.get_groupby_list(self.reviews, "offering_id"))
    # Treat every review as one (user, item) rating.
    density = float(len(self.reviews)) / float(num_users * num_items)
    return 1 - density
def load(self, reviews):
    """
    Loads the reviews, builds the id/index maps and the data matrix, then
    factorizes the dense ratings matrix into two latent-factor matrices.

    :param reviews: the list of reviews to load
    """
    self.reviews = reviews
    self.user_ids = extractor.get_groupby_list(self.reviews, 'user_id')
    self.item_ids = extractor.get_groupby_list(self.reviews, 'offering_id')
    self.num_users = len(self.user_ids)
    self.num_items = len(self.item_ids)
    self.data_matrix = create_matrix(self.reviews)
    self.user_index_map = build_user_index_map(reviews)
    self.item_index_map = build_item_index_map(reviews)

    # Densify the sparse ratings matrix for the factorization step.
    # NOTE(review): this materializes a num_users x num_items array — may be
    # memory-hungry on large datasets.
    R = create_matrix(self.reviews).todense()
    R = np.array(R)

    N = len(R)  # number of users (rows)
    M = len(R[0])  # number of items (columns)
    K = 2  # number of latent factors (hard-coded)

    # Random initialization of the factor matrices; results depend on the
    # global NumPy RNG state (seeded elsewhere, presumably — verify).
    P = np.random.rand(N, K)
    Q = np.random.rand(M, K)
    # P.fill(0.1)
    # Q.fill(0.1)

    self.n_p, self.n_q = matrix_factorization(R, P, Q, K)
def load(self, records):
    """
    Builds the shared user structures from the records, optionally loads the
    context, and wires everything into the collaborating calculators.

    :param records: the list of records to load
    """
    # self.records = records
    self.user_dictionary = extractor.initialize_users(records, False)
    self.user_ids = extractor.get_groupby_list(records, 'user_id')

    if self.has_context:
        self.load_context(records)

    # Hand the shared structures to each collaborator.
    self.user_similarity_calculator.load(
        self.user_ids, self.user_dictionary, self.context_rich_topics)
    self.neighbourhood_calculator.load(
        self.user_ids, self.user_dictionary,
        self.context_rich_topics, self.num_neighbours)
    self.user_baseline_calculator.load(
        self.user_dictionary, self.context_rich_topics)
    self.neighbour_contribution_calculator.load(
        self.user_baseline_calculator)
def load(self, reviews):
    """
    Stores the reviews and derives the user dictionary and user id list.

    :param reviews: the list of reviews to load
    """
    self.reviews = reviews
    self.user_ids = extractor.get_groupby_list(reviews, "user_id")
    self.user_dictionary = extractor.initialize_users(reviews, False)
def load(self, reviews):
    """
    Stores the reviews and derives the user dictionary and user id list.

    :param reviews: the list of reviews to load
    """
    self.reviews = reviews
    self.user_ids = extractor.get_groupby_list(reviews, 'user_id')
    self.user_dictionary = extractor.initialize_users(reviews, False)
def load(self, reviews):
    """
    Loads the reviews, builds the ratings matrix and the user structures,
    then computes the similarity matrix.

    :param reviews: the list of reviews to load
    """
    self.reviews = reviews
    self.ratings_matrix = create_ratings_matrix(reviews)
    self.user_dictionary = extractor.initialize_users(reviews, False)
    self.user_ids = extractor.get_groupby_list(reviews, 'user_id')
    # Depends on the structures initialized above.
    self.similarity_matrix = self.create_similarity_matrix()
def test_build_similarity_matrix_pearson(self):
    """The Pearson similarity matrix built from the fixture reviews must
    equal the expected precomputed matrix."""
    user_dictionary = extractor.initialize_users(reviews_matrix_5, False)
    user_ids = extractor.get_groupby_list(reviews_matrix_5, "user_id")
    builder = SingleSimilarityMatrixBuilder("pearson")
    actual_matrix = builder.build_similarity_matrix(user_dictionary, user_ids)
    self.assertEqual(pearson_matrix, actual_matrix)