def calculate_top_n_precision(reviews, recommender, n, min_score, num_folds):
    """Evaluate the recommender's average top-N precision via cross-validation.

    The reviews are split into ``num_folds`` contiguous folds (no shuffling);
    for each fold the recommender is trained on the remaining data and the
    precision is averaged over every user for which it can be computed.

    :param reviews: the full list of review records to cross-validate on
    :param recommender: object exposing ``load()`` and ``user_ids``
    :param n: size of the top-N recommendation list
    :param min_score: minimum rating for an item to count as relevant
    :param num_folds: number of cross-validation folds
    :return: dict with the average precision and the execution time
    :raises ValueError: if no precision could be computed for any user
    """
    start_time = time.time()
    split = 1 - (1 / float(num_folds))
    total_precision = 0.
    num_cycles = 0

    for i in xrange(0, num_folds):
        print('Fold', i)
        start = float(i) / num_folds
        train, test = ETLUtils.split_train_test(
            reviews, split=split, shuffle_data=False, start=start)
        recommender.load(train)

        for user_id in recommender.user_ids:
            precision = calculate_recommender_precision(
                test, user_id, recommender, n, min_score)
            if precision is not None:
                total_precision += precision
                num_cycles += 1

    # BUG FIX: guard against ZeroDivisionError when the precision could not
    # be computed for a single user in any fold
    if num_cycles == 0:
        raise ValueError(
            'Could not compute the precision for any user in any fold')

    final_precision = total_precision / num_cycles
    execution_time = time.time() - start_time
    print('Final Top N Precision: %f' % final_precision)
    print("--- %s seconds ---" % execution_time)

    result = {
        'Top N': final_precision,
        'Execution time': execution_time
    }
    return result
def calculate_top_n_precision(reviews, recommender, n, min_score, num_folds):
    """Cross-validate the recommender and report its average top-N precision.

    :param reviews: review records used for both training and testing
    :param recommender: recommender exposing ``load()`` and ``user_ids``
    :param n: length of the recommendation list
    :param min_score: rating threshold that marks an item as relevant
    :param num_folds: how many cross-validation folds to run
    :return: dict containing the final precision and the elapsed seconds
    """
    started_at = time.time()
    train_ratio = 1 - (1 / float(num_folds))
    precision_sum = 0.
    precision_count = 0

    for fold in xrange(0, num_folds):
        print('Fold', fold)
        fold_start = float(fold) / num_folds
        train, test = ETLUtils.split_train_test(
            reviews, split=train_ratio, start=fold_start)
        recommender.load(train)

        for user_id in recommender.user_ids:
            user_precision = calculate_recommender_precision(
                test, user_id, recommender, n, min_score)
            if user_precision is None:
                continue
            precision_sum += user_precision
            precision_count += 1

    final_precision = precision_sum / precision_count
    elapsed = time.time() - started_at
    print('Final Top N Precision: %f' % final_precision)
    print("--- %s seconds ---" % elapsed)

    return {'Top N': final_precision, 'Execution time': elapsed}
def create_topic_models():
    """Create one topic model per cross-validation fold, for every cycle.

    For each cycle the records are (optionally) shuffled, the training
    portion of every fold is collected, and the per-fold topic-model
    creation is dispatched in parallel.
    """
    print(Constants._properties)
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
    plant_seeds()

    num_cycles = Constants.NUM_CYCLES
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1 / float(num_folds))

    for cycle in range(num_cycles):
        print('\n\nCycle: %d/%d' % ((cycle + 1), num_cycles))

        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

        # Only the training part of each fold is needed for the topic model
        train_records_list = []
        for fold in range(num_folds):
            fold_start = float(fold) / num_folds
            train_records, test_records = ETLUtils.split_train_test(
                records, split=split, start=fold_start)
            train_records_list.append(train_records)

        args = zip(
            train_records_list,
            [cycle] * Constants.CROSS_VALIDATION_NUM_FOLDS,
            range(Constants.CROSS_VALIDATION_NUM_FOLDS))
        parallel_context_top_n(args)
def create_single_topic_model(cycle_index, fold_index, check_exists=True):
    """
    Creates the topic model for the given cross-validation cycle and fold.

    Depending on the configured cross-validation strategy the records may
    first be reduced to the training portion of the nested-validation
    cycle before the (cycle, fold) split is made.

    :param cycle_index: zero-based index of the cross-validation cycle
    :param fold_index: zero-based index of the fold within the cycle
    :param check_exists: forwarded to create_topic_model; presumably skips
        the work when the model already exists -- TODO confirm
    :raises ValueError: if the separate_topic_model_recsys_reviews property
        is set, or the cross-validation strategy is unknown
    """
    Constants.print_properties()
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        msg = 'This function shouldn\'t be used when the ' \
              'separate_topic_model_recsys_reviews property is set to True'
        raise ValueError(msg)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.CROSS_VALIDATION_STRATEGY == 'nested_test':
        pass
    elif Constants.CROSS_VALIDATION_STRATEGY == 'nested_validate':
        # Keep only the training portion of the nested cross-validation
        # cycle -- presumably so the topic model never sees the held-out
        # validation data; confirm against the callers.
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        cycle = Constants.NESTED_CROSS_VALIDATION_CYCLE
        split = 1 - (1 / float(num_folds))
        cv_start = float(cycle) / num_folds
        print('cv_start', cv_start)
        records, _ = ETLUtils.split_train_test(records, split, cv_start)
    else:
        raise ValueError('Unknown cross-validation strategy')

    utilities.plant_seeds()

    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1/float(num_folds))
    # One shuffle per cycle, up to and including the requested one, so the
    # record order matches the RNG state that cycle would observe.
    for i in range(cycle_index+1):
        if Constants.SHUFFLE_DATA:
            random.shuffle(records)
    cv_start = float(fold_index) / num_folds

    train_records, test_records = \
        ETLUtils.split_train_test(records, split=split, start=cv_start)

    return create_topic_model(
        train_records, cycle_index, fold_index, check_exists)
def split_data(self):
    """
    Splits the votes into training (80%), validation (10%) and test (10%)
    sets, then counts how many training votes each user and item has.

    NOTE: Beware that there could be users/items that don't appear in the
    training set in the test set
    """
    self.train_votes, validation_test_votes = ETLUtils.split_train_test(
        self.corpus.vote_list, split=0.8)
    self.valid_votes, self.test_votes = ETLUtils.split_train_test(
        validation_test_votes, split=0.5)

    for vote in self.train_votes:
        user = vote.user
        item = vote.item
        # Count the training votes per user/item, starting at zero the
        # first time each key is seen
        self.n_training_per_user[user] = \
            self.n_training_per_user.get(user, 0) + 1
        self.n_training_per_item[item] = \
            self.n_training_per_item.get(item, 0) + 1
        self.train_votes_per_user[user].append(vote)
        self.train_votes_per_item[item].append(vote)
def create_single_topic_model(cycle_index, fold_index):
    """Train the topic model corresponding to one (cycle, fold) pair.

    :param cycle_index: zero-based index of the cross-validation cycle
    :param fold_index: zero-based index of the fold within the cycle
    """
    print(Constants._properties)
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    plant_seeds()

    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1 / float(num_folds))

    # One shuffle per cycle up to and including the requested one, so the
    # record order matches the RNG state that cycle would observe.
    for _ in range(cycle_index + 1):
        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

    cv_start = float(fold_index) / num_folds
    train_records, test_records = ETLUtils.split_train_test(
        records, split=split, start=cv_start)
    create_topic_model(train_records, cycle_index, fold_index)
def create_single_topic_model(cycle_index, fold_index):
    """
    Creates the topic model for the given cross-validation cycle and fold.

    :param cycle_index: zero-based index of the cross-validation cycle
    :param fold_index: zero-based index of the fold within the cycle
    """
    print(Constants._properties)
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    plant_seeds()

    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1/float(num_folds))
    # One shuffle per cycle, up to and including the requested one, so the
    # record order matches the RNG state that cycle would observe.
    for i in range(cycle_index+1):
        if Constants.SHUFFLE_DATA:
            random.shuffle(records)
    cv_start = float(fold_index) / num_folds

    train_records, test_records = \
        ETLUtils.split_train_test(records, split=split, start=cv_start)
    create_topic_model(train_records, cycle_index, fold_index)
def run(self):
    """Run the cross-validation according to the configured strategy.

    :return: whatever perform_cross_validation returns
    :raises ValueError: for an unknown CROSS_VALIDATION_STRATEGY value
    """
    utilities.plant_seeds()
    self.load()

    strategy = Constants.CROSS_VALIDATION_STRATEGY
    records = self.original_records

    if strategy == 'nested_test':
        return self.perform_cross_validation(records)

    if strategy == 'nested_validate':
        # Evaluate on the training portion of the nested cross-validation
        # cycle only
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        cycle = Constants.NESTED_CROSS_VALIDATION_CYCLE
        split = 1 - (1 / float(num_folds))
        cv_start = float(cycle) / num_folds
        print('cv_start', cv_start)
        records, _ = ETLUtils.split_train_test(
            self.original_records, split, cv_start)
        return self.perform_cross_validation(records)

    raise ValueError('Unknown cross-validation strategy')
def create_topic_models():
    """
    Creates the topic models for all cross-validation cycles and folds.

    For every cycle the records are (optionally) shuffled and split into
    folds; the training portion of each fold is collected and the per-fold
    topic-model creation is dispatched in parallel.
    """
    print(Constants._properties)
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
    plant_seeds()

    num_cycles = Constants.NUM_CYCLES
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1/float(num_folds))
    for i in range(num_cycles):
        print('\n\nCycle: %d/%d' % ((i+1), num_cycles))

        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

        train_records_list = []
        for j in range(num_folds):
            cv_start = float(j) / num_folds
            # Only the training part of each fold is needed here
            train_records, test_records =\
                ETLUtils.split_train_test(records, split=split, start=cv_start)
            train_records_list.append(train_records)

        args = zip(
            train_records_list,
            [i] * Constants.CROSS_VALIDATION_NUM_FOLDS,
            range(Constants.CROSS_VALIDATION_NUM_FOLDS)
        )
        parallel_context_top_n(args)
def calculate_recall_in_top_n(records, recommender, n, num_folds, split=None,
                              min_score=5.0, cache_reviews=None,
                              reviews_type=None):
    """Evaluate the recommender's recall in top-N using cross-validation.

    For each fold the recommender is trained on the remaining records and
    its recall/coverage is measured over the positive (>= min_score) test
    reviews.

    :param records: full list of review records
    :param recommender: object exposing load(), clear(), has_context
    :param n: size of the top-N recommendation list
    :param num_folds: number of cross-validation folds
    :param split: training fraction; defaults to 1 - 1/num_folds
    :param min_score: minimum overall_rating for a review to be positive
    :param cache_reviews: optional pre-processed reviews split alongside
        the records and handed to the recommender
    :param reviews_type: 'specific' or 'generic'; when given, the test set
        is filtered to the corresponding review cluster
    :return: dict with the recall, the coverage and the execution time
    :raises ValueError: if no fold produced any prediction
    """
    start_time = time.time()
    if split is None:
        split = 1 - (1 / float(num_folds))
    # split = 0.984
    total_recall = 0.
    total_coverage = 0.
    num_cycles = 0.0

    for i in xrange(0, num_folds):
        print('Fold', i)
        print('started training', time.strftime("%Y/%d/%m-%H:%M:%S"))
        start = float(i) / num_folds
        cluster_labels = None
        train_records, test_records = ETLUtils.split_train_test(
            records, split=split, start=start)

        if cache_reviews:
            train_reviews, test_reviews = ETLUtils.split_train_test(
                cache_reviews, split=split, start=start)
            if reviews_type is not None:
                cluster_labels = reviews_clusterer.cluster_reviews(
                    test_reviews)
            recommender.reviews = train_reviews
        recommender.load(train_records)
        print('finished training', time.strftime("%Y/%d/%m-%H:%M:%S"))

        if cluster_labels is not None:
            separated_records = reviews_clusterer.split_list_by_labels(
                test_records, cluster_labels)
            if reviews_type == 'specific':
                test_records = separated_records[0]
            if reviews_type == 'generic':
                test_records = separated_records[1]

        positive_reviews = \
            [review for review in test_records
             if review['overall_rating'] >= min_score]

        if len(positive_reviews) == 0:
            # BUG FIX: clear the trained state before skipping the fold;
            # previously the continue bypassed recommender.clear()
            recommender.clear()
            continue

        num_hits = 0.0
        num_predictions = 0.0
        for review in positive_reviews:
            user_id = review['user_id']
            item_id = review['offering_id']

            if not recommender.has_context:
                hit = calculate_is_a_hit(
                    test_records, recommender, user_id, item_id, n)
            else:
                text_review = review['text']
                hit = calculate_is_a_hit(
                    test_records, recommender, user_id, item_id, n,
                    text_review)
            if hit is None:
                continue
            if hit:
                num_hits += 1
            num_predictions += 1

        # BUG FIX: clear before the zero-prediction check so every fold
        # releases its trained state, not only the successful ones
        recommender.clear()
        if num_predictions == 0:
            continue

        recall = num_hits / num_predictions
        coverage = num_predictions / len(positive_reviews)
        print('recall', recall, time.strftime("%Y/%d/%m-%H:%M:%S"))
        print('coverage', coverage, time.strftime("%Y/%d/%m-%H:%M:%S"))

        total_recall += recall
        total_coverage += coverage
        num_cycles += 1

    # Guard against ZeroDivisionError when every fold was skipped
    if num_cycles == 0:
        raise ValueError('No fold produced any prediction')

    final_recall = total_recall / num_cycles
    final_coverage = total_coverage / num_cycles
    execution_time = time.time() - start_time

    print('Final Top N Precision: %f' % final_recall)
    print('Final Coverage: %f' % final_coverage)
    print("--- %s seconds ---" % execution_time)

    result = {
        'Top N': final_recall,
        'Coverage': final_coverage,
        'Execution time': execution_time
    }
    return result
def perform_cross_validation(
        records, recommender, num_folds, cache_reviews=None,
        reviews_type=None):
    """Cross-validate a rating-prediction recommender.

    For each fold the recommender is trained on the remaining records and
    evaluated on the held-out ones, accumulating MAE, RMSE and coverage.

    :param records: full list of rating records
    :param recommender: object exposing load() and clear()
    :param num_folds: number of cross-validation folds
    :param cache_reviews: optional pre-processed reviews split alongside
        the records and handed to the recommender
    :param reviews_type: 'specific' or 'generic'; when given, the test set
        is filtered to the corresponding review cluster
    :return: dict with MAE, RMSE, coverage and execution time
    :raises ValueError: if no fold produced a mean absolute error
    """
    start_time = time.time()
    split = 1 - (1 / float(num_folds))
    total_mean_absolute_error = 0.
    total_mean_square_error = 0.
    total_coverage = 0.
    num_cycles = 0

    for i in range(0, num_folds):
        print('Num cycles: %d' % i)
        start = float(i) / num_folds
        cluster_labels = None
        train_records, test_records = ETLUtils.split_train_test(
            records, split=split, start=start)

        if cache_reviews:
            train_reviews, test_reviews = ETLUtils.split_train_test(
                cache_reviews, split=split, start=start)
            if reviews_type is not None:
                cluster_labels = \
                    reviews_clusterer.cluster_reviews(test_reviews)
            recommender.reviews = train_reviews
        recommender.load(train_records)

        if cluster_labels is not None:
            separated_records = reviews_clusterer.split_list_by_labels(
                test_records, cluster_labels)
            if reviews_type == 'specific':
                test_records = separated_records[0]
            if reviews_type == 'generic':
                test_records = separated_records[1]

        _, errors, num_unknown_ratings = \
            predict_rating_list(recommender, test_records)
        recommender.clear()

        mean_absolute_error = MeanAbsoluteError.compute_list(errors)
        root_mean_square_error = RootMeanSquareError.compute_list(errors)
        num_samples = len(test_records)
        # BUG FIX: convert the numerator to float BEFORE dividing; the
        # original float((a - b) / c) truncated under Python 2 integer
        # division, so coverage could only ever be 0.0 or 1.0
        coverage = float(num_samples - num_unknown_ratings) / num_samples

        if mean_absolute_error is not None:
            total_mean_absolute_error += mean_absolute_error
            total_mean_square_error += root_mean_square_error
            total_coverage += coverage
            num_cycles += 1
        else:
            print('Mean absolute error is None!!!')

    # Guard against ZeroDivisionError when every fold returned None
    if num_cycles == 0:
        raise ValueError('No fold produced a mean absolute error')

    final_mean_absolute_error = total_mean_absolute_error / num_cycles
    final_root_squared_error = total_mean_square_error / num_cycles
    final_coverage = total_coverage / num_cycles
    execution_time = time.time() - start_time

    print('Final mean absolute error: %f' % final_mean_absolute_error)
    print('Final root mean square error: %f' % final_root_squared_error)
    print('Final coverage: %f' % final_coverage)
    print("--- %s seconds ---" % execution_time)

    result = {
        'MAE': final_mean_absolute_error,
        'RMSE': final_root_squared_error,
        'Coverage': final_coverage,
        'Execution time': execution_time
    }
    return result
def calculate_recall_in_top_n(
        records, recommender, n, num_folds, split=None, min_score=5.0,
        cache_reviews=None, reviews_type=None):
    """Evaluate the recommender's recall in top-N using cross-validation.

    The records are split into contiguous folds (no shuffling); for each
    fold the recommender is trained on the remaining records and its
    recall/coverage is measured over the positive (>= min_score) test
    reviews.

    :param records: full list of review records
    :param recommender: object exposing load(), clear(), has_context
    :param n: size of the top-N recommendation list
    :param num_folds: number of cross-validation folds
    :param split: training fraction; defaults to 1 - 1/num_folds
    :param min_score: minimum overall_rating for a review to be positive
    :param cache_reviews: optional pre-processed reviews split alongside
        the records and handed to the recommender
    :param reviews_type: 'specific' or 'generic'; when given, the test set
        is filtered to the corresponding review cluster
    :return: dict with the recall, the coverage and the execution time
    """
    start_time = time.time()
    if split is None:
        split = 1 - (1/float(num_folds))
    # split = 0.984
    total_recall = 0.
    total_coverage = 0.
    num_cycles = 0.0

    for i in xrange(0, num_folds):
        print('Fold', i)
        print('started training', time.strftime("%Y/%d/%m-%H:%M:%S"))
        start = float(i) / num_folds
        cluster_labels = None
        train_records, test_records = ETLUtils.split_train_test(
            records, split=split, shuffle_data=False, start=start)

        if cache_reviews:
            train_reviews, test_reviews = ETLUtils.split_train_test(
                cache_reviews, split=split, shuffle_data=False, start=start)
            if reviews_type is not None:
                cluster_labels = reviews_clusterer.cluster_reviews(
                    test_reviews)
            recommender.reviews = train_reviews
        recommender.load(train_records)
        print('finished training', time.strftime("%Y/%d/%m-%H:%M:%S"))

        if cluster_labels is not None:
            separated_records = reviews_clusterer.split_list_by_labels(
                test_records, cluster_labels)
            if reviews_type == 'specific':
                test_records = separated_records[0]
            if reviews_type == 'generic':
                test_records = separated_records[1]

        positive_reviews = \
            [review for review in test_records
             if review['overall_rating'] >= min_score]

        if len(positive_reviews) == 0:
            # BUG FIX: clear the trained state before skipping the fold;
            # previously the continue bypassed recommender.clear()
            recommender.clear()
            continue

        num_hits = 0.0
        num_predictions = 0.0
        for review in positive_reviews:
            user_id = review['user_id']
            item_id = review['offering_id']

            if not recommender.has_context:
                hit = calculate_is_a_hit(
                    test_records, recommender, user_id, item_id, n)
            else:
                text_review = review['text']
                hit = calculate_is_a_hit(
                    test_records, recommender, user_id, item_id, n,
                    text_review)
            if hit is None:
                continue
            if hit:
                num_hits += 1
            num_predictions += 1

        # BUG FIX: clear before the zero-prediction check so every fold
        # releases its trained state, not only the successful ones
        recommender.clear()
        if num_predictions == 0:
            continue

        recall = num_hits / num_predictions
        coverage = num_predictions / len(positive_reviews)
        print('recall', recall, time.strftime("%Y/%d/%m-%H:%M:%S"))
        print('coverage', coverage, time.strftime("%Y/%d/%m-%H:%M:%S"))

        total_recall += recall
        total_coverage += coverage
        num_cycles += 1

    final_recall = total_recall / num_cycles
    final_coverage = total_coverage / num_cycles
    execution_time = time.time() - start_time

    print('Final Top N Precision: %f' % final_recall)
    print('Final Coverage: %f' % final_coverage)
    print("--- %s seconds ---" % execution_time)

    result = {
        'Top N': final_recall,
        'Coverage': final_coverage,
        'Execution time': execution_time
    }
    return result