def test_Recall(self):

    from Legacy.Base.Evaluation.metrics import recall

    pos_items = np.asarray([2, 4, 5, 10])

    ranked_list_1 = np.asarray([1, 2, 3, 4, 5])
    ranked_list_2 = np.asarray([10, 5, 2, 4, 3])
    ranked_list_3 = np.asarray([1, 3, 6, 7, 8])

    is_relevant = np.in1d(ranked_list_1, pos_items, assume_unique=True)
    self.assertTrue(np.allclose(recall(is_relevant, pos_items), 3. / 4))

    is_relevant = np.in1d(ranked_list_2, pos_items, assume_unique=True)
    self.assertTrue(np.allclose(recall(is_relevant, pos_items), 1.0))

    is_relevant = np.in1d(ranked_list_3, pos_items, assume_unique=True)
    self.assertTrue(np.allclose(recall(is_relevant, pos_items), 0.0))
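
# For reference, a minimal sketch of the recall metric the test above exercises.
# This is an illustrative re-implementation, not the one imported from
# Legacy.Base.Evaluation.metrics; it assumes is_relevant is a boolean array over
# the ranked list and pos_items holds the user's relevant item ids.
#
#     def recall(is_relevant, pos_items):
#         # fraction of the relevant items that appear in the recommendation list
#         recall_score = np.sum(is_relevant, dtype=np.float32) / pos_items.shape[0]
#         assert 0 <= recall_score <= 1, recall_score
#         return recall_score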
def evaluateRecommender(self, recommender_object):
    """
    Evaluate the recommender on the test URM and cutoff list this evaluator was configured with,
    computing one metrics dictionary per cutoff.

    :param recommender_object: the trained recommender object, a BaseRecommender subclass
    :return: (results_dict, results_run_string)
    """

    results_dict = {}

    for cutoff in self.cutoff_list:
        results_dict[cutoff] = create_empty_metrics_dict(self.n_items, self.n_users,
                                                         recommender_object.URM_train,
                                                         self.ignore_items_ID,
                                                         self.ignore_users_ID,
                                                         cutoff,
                                                         self.diversity_object)

    start_time = time.time()
    start_time_print = time.time()

    n_users_evaluated = 0

    if self.ignore_items_flag:
        recommender_object.set_items_to_ignore(self.ignore_items_ID)

    for test_user in self.usersToEvaluate:

        # Being the URM CSR, the indices are the non-zero column indexes
        relevant_items = self.get_user_relevant_items(test_user)
        relevant_items_rating = self.get_user_test_ratings(test_user)

        n_users_evaluated += 1

        items_to_compute = self._get_user_specific_items_to_compute(test_user)

        recommended_items, all_items_predicted_ratings = recommender_object.recommend(np.atleast_1d(test_user),
                                                                                       exclude_seen=self.exclude_seen,
                                                                                       at=self.max_cutoff,
                                                                                       remove_top_pop_flag=False,
                                                                                       items_to_compute=items_to_compute,
                                                                                       remove_CustomItems_flag=self.ignore_items_flag,
                                                                                       return_scores=True)

        assert len(recommended_items) == 1, \
            "{}: recommended_items contained recommendations for {} users, expected was {}".format(
                self.EVALUATOR_NAME, len(recommended_items), 1)

        assert all_items_predicted_ratings.shape[0] == 1, \
            "{}: all_items_predicted_ratings contained scores for {} users, expected was {}".format(
                self.EVALUATOR_NAME, all_items_predicted_ratings.shape[0], 1)

        assert all_items_predicted_ratings.shape[1] == self.n_items, \
            "{}: all_items_predicted_ratings contained scores for {} items, expected was {}".format(
                self.EVALUATOR_NAME, all_items_predicted_ratings.shape[1], self.n_items)

        recommended_items = np.array(recommended_items[0])

        user_rmse = rmse(all_items_predicted_ratings[0], relevant_items, relevant_items_rating)

        recommender_object.reset_items_to_ignore()

        is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

        for cutoff in self.cutoff_list:

            results_current_cutoff = results_dict[cutoff]

            is_relevant_current_cutoff = is_relevant[0:cutoff]
            recommended_items_current_cutoff = recommended_items[0:cutoff]

            results_current_cutoff[EvaluatorMetrics.ROC_AUC.value] += roc_auc(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.PRECISION.value] += precision(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.PRECISION_RECALL_MIN_DEN.value] += precision_recall_min_denominator(is_relevant_current_cutoff, len(relevant_items))
            results_current_cutoff[EvaluatorMetrics.RECALL.value] += recall(is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(recommended_items_current_cutoff, relevant_items, relevance=self.get_user_test_ratings(test_user), at=cutoff)
            results_current_cutoff[EvaluatorMetrics.HIT_RATE.value] += is_relevant_current_cutoff.sum()
            results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.RMSE.value] += user_rmse

            results_current_cutoff[EvaluatorMetrics.MRR.value].add_recommendations(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.MAP.value].add_recommendations(is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.AVERAGE_POPULARITY.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(recommended_items_current_cutoff, test_user)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(recommended_items_current_cutoff)

            if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(recommended_items_current_cutoff)

        if time.time() - start_time_print > 30 or n_users_evaluated == len(self.usersToEvaluate):

            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)

            print("{}: Processed {} ( {:.2f}% ) in {:.2f} {}. Users per second: {:.0f}".format(
                self.EVALUATOR_NAME,
                n_users_evaluated,
                100.0 * float(n_users_evaluated) / len(self.usersToEvaluate),
                new_time_value, new_time_unit,
                float(n_users_evaluated) / elapsed_time))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_print = time.time()

    if n_users_evaluated > 0:

        for cutoff in self.cutoff_list:

            results_current_cutoff = results_dict[cutoff]

            for key in results_current_cutoff.keys():

                value = results_current_cutoff[key]

                if isinstance(value, Metrics_Object):
                    results_current_cutoff[key] = value.get_metric_value()
                else:
                    results_current_cutoff[key] = value / n_users_evaluated

            precision_ = results_current_cutoff[EvaluatorMetrics.PRECISION.value]
            recall_ = results_current_cutoff[EvaluatorMetrics.RECALL.value]

            if precision_ + recall_ != 0:
                # F1 micro averaged: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.104.8244&rep=rep1&type=pdf
                results_current_cutoff[EvaluatorMetrics.F1.value] = 2 * (precision_ * recall_) / (precision_ + recall_)

    else:
        print("WARNING: No users had a sufficient number of relevant items")

    if self.ignore_items_flag:
        recommender_object.reset_items_to_ignore()

    results_run_string = get_result_string(results_dict)

    return (results_dict, results_run_string)
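
# Typical usage of evaluateRecommender (a minimal sketch; the evaluator class name,
# its constructor arguments and the trained recommender are assumptions for
# illustration, only evaluateRecommender, cutoff_list and EvaluatorMetrics come
# from this file):
#
#     evaluator = EvaluatorHoldout(URM_test, cutoff_list=[5, 10, 20])
#     results_dict, results_run_string = evaluator.evaluateRecommender(trained_recommender)
#     print(results_run_string)                                  # human-readable summary
#     print(results_dict[10][EvaluatorMetrics.RECALL.value])     # single metric at cutoff 10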
def _run_evaluation_on_selected_users(self, recommender_object, usersToEvaluate, block_size=None):

    if block_size is None:
        block_size = min(1000, int(1e8 / self.n_items))

    start_time = time.time()
    start_time_print = time.time()

    results_dict = {}

    for cutoff in self.cutoff_list:
        results_dict[cutoff] = create_empty_metrics_dict(self.n_items, self.n_users,
                                                         recommender_object.get_URM_train(),
                                                         self.ignore_items_ID,
                                                         self.ignore_users_ID,
                                                         cutoff,
                                                         self.diversity_object)

    n_users_evaluated = 0

    # Process the users to evaluate in consecutive blocks of block_size
    user_batch_start = 0
    user_batch_end = 0

    while user_batch_start < len(self.usersToEvaluate):

        user_batch_end = user_batch_start + block_size
        user_batch_end = min(user_batch_end, len(usersToEvaluate))

        test_user_batch_array = np.array(usersToEvaluate[user_batch_start:user_batch_end])
        user_batch_start = user_batch_end

        # Compute predictions for a batch of users using vectorization, much more efficient than computing it one at a time
        recommended_items_batch_list, scores_batch = recommender_object.recommend(test_user_batch_array,
                                                                                   exclude_seen=self.exclude_seen,
                                                                                   at=self.max_cutoff,
                                                                                   remove_top_pop_flag=False,
                                                                                   remove_CustomItems_flag=self.ignore_items_flag,
                                                                                   return_scores=True)

        assert len(recommended_items_batch_list) == len(test_user_batch_array), \
            "{}: recommended_items_batch_list contained recommendations for {} users, expected was {}".format(
                self.EVALUATOR_NAME, len(recommended_items_batch_list), len(test_user_batch_array))

        assert scores_batch.shape[0] == len(test_user_batch_array), \
            "{}: scores_batch contained scores for {} users, expected was {}".format(
                self.EVALUATOR_NAME, scores_batch.shape[0], len(test_user_batch_array))

        assert scores_batch.shape[1] == self.n_items, \
            "{}: scores_batch contained scores for {} items, expected was {}".format(
                self.EVALUATOR_NAME, scores_batch.shape[1], self.n_items)

        # Compute recommendation quality for each user in batch
        for batch_user_index in range(len(recommended_items_batch_list)):

            test_user = test_user_batch_array[batch_user_index]

            # Being the URM CSR, the indices are the non-zero column indexes
            relevant_items = self.get_user_relevant_items(test_user)
            relevant_items_rating = self.get_user_test_ratings(test_user)

            all_items_predicted_ratings = scores_batch[batch_user_index]
            user_rmse = rmse(all_items_predicted_ratings, relevant_items, relevant_items_rating)

            recommended_items = recommended_items_batch_list[batch_user_index]
            is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

            n_users_evaluated += 1

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                is_relevant_current_cutoff = is_relevant[0:cutoff]
                recommended_items_current_cutoff = recommended_items[0:cutoff]

                results_current_cutoff[EvaluatorMetrics.ROC_AUC.value] += roc_auc(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.PRECISION.value] += precision(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.PRECISION_RECALL_MIN_DEN.value] += precision_recall_min_denominator(is_relevant_current_cutoff, len(relevant_items))
                results_current_cutoff[EvaluatorMetrics.RECALL.value] += recall(is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(recommended_items_current_cutoff, relevant_items, relevance=self.get_user_test_ratings(test_user), at=cutoff)
                results_current_cutoff[EvaluatorMetrics.HIT_RATE.value] += is_relevant_current_cutoff.sum()
                results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.RMSE.value] += user_rmse

                results_current_cutoff[EvaluatorMetrics.MRR.value].add_recommendations(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.MAP.value].add_recommendations(is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.AVERAGE_POPULARITY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(recommended_items_current_cutoff, test_user)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(recommended_items_current_cutoff)

                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(recommended_items_current_cutoff)

            if time.time() - start_time_print > 30 or n_users_evaluated == len(self.usersToEvaluate):

                elapsed_time = time.time() - start_time
                new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)

                print("{}: Processed {} ( {:.2f}% ) in {:.2f} {}. Users per second: {:.0f}".format(
                    self.EVALUATOR_NAME,
                    n_users_evaluated,
                    100.0 * float(n_users_evaluated) / len(self.usersToEvaluate),
                    new_time_value, new_time_unit,
                    float(n_users_evaluated) / elapsed_time))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print = time.time()

    return results_dict, n_users_evaluated
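
# Note on block_size: scores_batch is a dense (batch_size x n_items) array, so the
# default block_size = min(1000, int(1e8 / n_items)) caps it at roughly 1e8 entries.
# For example, with n_items = 200_000 each batch would score 500 users at a time;
# the 200_000 figure is only an illustrative value, not one taken from this file.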