def evaluate(ranking, gold):
    # Align gold and predicted candidate lists per question by answer id (position 2 of each tuple).
    for qid in gold:
        gold_sorted = sorted(gold[qid], key=itemgetter(2), reverse=True)
        pred_sorted = ranking[qid]
        pred_sorted = sorted(pred_sorted, key=itemgetter(2), reverse=True)
        gold[qid], ranking[qid] = [], []
        for i, row in enumerate(gold_sorted):
            relevant, gold_score, aid = row
            gold[qid].append((relevant, gold_score, aid))
            pred_score = pred_sorted[i][1]
            ranking[qid].append((relevant, pred_score, aid))

    for qid in gold:
        # Sort by IR score.
        gold_sorted = sorted(gold[qid], key=itemgetter(1), reverse=True)
        # Sort by SVM prediction score.
        pred_sorted = ranking[qid]
        pred_sorted = sorted(pred_sorted, key=itemgetter(1), reverse=True)
        gold[qid] = [rel for rel, score, aid in gold_sorted]
        ranking[qid] = [rel for rel, score, aid in pred_sorted]

    map_gold = metrics.map(gold, 10)
    map_pred = metrics.map(ranking, 10)
    return map_gold, map_pred
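# The snippets above and below hand metrics.map(...) a dict that maps a question id to its
# candidate labels ranked by the system, plus a cutoff. The metrics module itself is not shown
# here; the following is only a minimal sketch of what metrics.map is assumed to compute
# (the name _map_sketch and the "true"/"false" label convention are taken from the calling
# code, not from the real library).
def _map_sketch(ranked_labels_by_qid, th=10):
    """Mean Average Precision over queries, considering only the top `th` candidates."""
    ap_sum, n_queries = 0.0, 0
    for qid, labels in ranked_labels_by_qid.items():
        labels = labels[:th]
        hits, precision_sum = 0, 0.0
        for rank, label in enumerate(labels, 1):
            if label in ("true", True, 1):
                hits += 1
                precision_sum += hits / float(rank)
        n_queries += 1
        # Note: this normalizes by the relevant answers found in the top th, a simplification.
        ap_sum += precision_sum / hits if hits else 0.0
    return ap_sum / n_queries if n_queries else 0.0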
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=50, verbose=False, reranking_th=-100.0, ignore_noanswer=False, ignore_allanswer=False): ir, svm = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer, ignore_allanswer=ignore_allanswer) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir) map_svm = metrics.map(svm) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) ''' print "%13s %5s" %("IR", "SVM") print "MRR: %5.2f %5.2f" %(mrr_se, mrr_svm) print "MAP: %5.4f %5.4f" %(map_se, map_svm) print "AvgRec: %5.2f %5.2f" %(avg_acc1_ir, avg_acc1_svm) print "%16s %6s %14s %6s %14s %6s %12s %4s" % ("IR", "SVM", "IR", "SVM", "IR", "SVM", "IR", "SVM") ''' rec1_se =-10 rec1_svm = -10 for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2, a_svm2) in enumerate(zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1, acc_se2, acc_svm2), 1): #print "REC-1@%02d: %6.2f %6.2f ACC@%02d: %6.2f %6.2f AC1@%02d: %6.2f %6.2f AC2@%02d: %4.0f %4.0f" %(i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2, a_svm2) if (rec1_se<-5): rec1_se = p_se rec1_svm = p_svm ''' print "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks were questions have at most one correct answer)" print "ACC - accuracy, i.e. number of correct answers retrieved at rank @X normalized by the rank and the total number of questions" print "AC1 - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)" print "AC2 - the absolute number of correct answers at @X" ''' print "Table view" print " MRR MAP P@1" print "REF_FILE %5.2f %5.2f %5.2f" % (mrr_se, map_se*100, rec1_se) print "SVM %5.2f %5.2f %5.2f" % (mrr_svm, map_svm*100, rec1_svm)
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=10, verbose=False, reranking_th=0.0, ignore_noanswer=False): ir, svm, conf_matrix = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer) # Calculate standard P, R, F1, Acc acc = 1.0 * ( conf_matrix['true']['true'] + conf_matrix['false']['false']) / ( conf_matrix['true']['true'] + conf_matrix['false']['false'] + conf_matrix['true']['false'] + conf_matrix['false']['true']) p = 0 if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0: p = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['false']['true']) r = 0 if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0: r = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['true']['false']) f1 = 0 if (p + r) > 0: f1 = 2.0 * p * r / (p + r) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir, th) map_svm = metrics.map(svm, th) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) print "acc\tf1\tMAP\tMRR\tAvgRec" print "%.4f %4.4f %4.4f %4.4f %4.4f" % (acc, f1, map_svm, mrr_svm, avg_acc1_svm)
def eval_search_engine(res_fname, format, th=50):
    ir = read_res_file(res_fname, format)

    # evaluate IR
    rec = metrics.recall_of_1(ir, th)
    acc = metrics.accuracy(ir, th)
    acc1 = metrics.accuracy1(ir, th)
    acc2 = metrics.accuracy2(ir, th)

    mrr = metrics.mrr(ir, th)

    # MAP
    map_ir = metrics.map(ir)

    print "%10s" % "IR"
    print "MRR: %5.2f" % mrr
    print "MAP: %5.2f" % map_ir
    for i, (r, a, a1, a2) in enumerate(zip(rec, acc, acc1, acc2), 1):
        print "REC-1@%02d: %6.2f ACC@%02d: %6.2f AC1@%02d: %6.2f AC2@%02d: %4.0f" % (
            i, r, i, a, i, a1, i, a2)
    print
    print "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks where questions have at most one correct answer)"
    print "ACC   - accuracy, i.e. number of correct answers retrieved at rank @X normalized by the rank and the total number of questions"
    print "AC1   - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)"
    print "AC2   - the absolute number of correct answers at @X"
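# eval_search_engine() relies on metrics.recall_of_1 and its companions, which are only
# described by the printed legend above. Below is a minimal sketch consistent with that
# legend; the real library may differ, and the per-query input format (a ranked list of
# "true"/"false" labels keyed by question id) is an assumption borrowed from the other
# snippets in this file.
def _recall_of_1_sketch(ranked_labels_by_qid, th):
    """REC-1@i: percentage of questions with at least one correct answer in the top i positions."""
    n_queries = len(ranked_labels_by_qid)
    out = []
    for i in range(1, th + 1):
        hit = sum(1 for labels in ranked_labels_by_qid.values()
                  if any(l in ("true", True, 1) for l in labels[:i]))
        out.append(100.0 * hit / n_queries if n_queries else 0.0)
    return out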
def eval_reranker(resPredIterable, th=10, reranking_th=0.0, ignore_noanswer=False):
    ir, svm, conf_matrix = read_res_pred_files(resPredIterable,
                                               reranking_th=reranking_th,
                                               ignore_noanswer=ignore_noanswer)

    # Calculate standard P, R, F1, Acc
    acc = 1.0 * (conf_matrix['true']['true'] + conf_matrix['false']['false']) / (
        conf_matrix['true']['true'] + conf_matrix['false']['false'] +
        conf_matrix['true']['false'] + conf_matrix['false']['true'])
    p = 0
    if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0:
        p = 1.0 * (conf_matrix['true']['true']) / (
            conf_matrix['true']['true'] + conf_matrix['false']['true'])
    r = 0
    if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0:
        r = 1.0 * (conf_matrix['true']['true']) / (
            conf_matrix['true']['true'] + conf_matrix['true']['false'])
    f1 = 0
    if (p + r) > 0:
        f1 = 2.0 * p * r / (p + r)

    mrr_svm = metrics.mrr(svm, th)
    map_svm = metrics.map(svm, th)

    scores = {}
    scores['map'] = map_svm
    scores['accuracy'] = acc
    scores['precision'] = p
    scores['recall'] = r
    scores['f1'] = f1
    scores['mrr'] = mrr_svm
    return scores
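# The confusion-matrix arithmetic above indexes conf_matrix[gold_label][predicted_label] with
# string keys 'true'/'false' (so conf_matrix['false']['true'] counts false positives). A tiny
# worked example with made-up counts, just to make the P/R/F1/Acc formulas concrete:
_example_conf = {'true': {'true': 30, 'false': 10},   # 30 TP, 10 FN
                 'false': {'true': 20, 'false': 40}}  # 20 FP, 40 TN
_tp, _fn = _example_conf['true']['true'], _example_conf['true']['false']
_fp, _tn = _example_conf['false']['true'], _example_conf['false']['false']
_p = _tp / float(_tp + _fp)                         # 30/50  = 0.60
_r = _tp / float(_tp + _fn)                         # 30/40  = 0.75
_f1 = 2 * _p * _r / (_p + _r)                       # ~0.667
_acc = (_tp + _tn) / float(_tp + _tn + _fp + _fn)   # 70/100 = 0.70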
def check_engine_quality(self, query_num, list_of_docs):
    """
    Prints retrieval metrics (precision@k, precision, recall, MAP) for a single query
    against the labeled benchmark, and stores them on the instance.
    :param query_num: query id in the benchmark file.
    :param list_of_docs: ranked list of retrieved tweet ids.
    :return: no return. prints metrics of the query.
    """
    benchmark_path = "data\\benchmark_lbls_train.csv"
    df = pd.read_csv(benchmark_path)
    df_prec = df[df['query'] == query_num]
    df_prec = df_prec[df_prec['tweet'].isin(list_of_docs)]
    dict_for_data = df_prec.set_index('tweet')['y_true'].to_dict()

    rmv_lst = []
    ranking = []
    # Add to list for rank
    for doc in list_of_docs:
        try:
            ranking.append(dict_for_data[int(doc)])
        except (KeyError, ValueError):
            # Document is not labeled in the benchmark (or its id is not numeric); drop it.
            rmv_lst.append(doc)
    for d in rmv_lst:
        list_of_docs.remove(d)

    data_df = pd.DataFrame({
        'query': query_num,
        'tweet': list_of_docs,
        'y_true': ranking
    })
    df_rec = df[df['query'] == query_num]
    recall_total = len(df_rec[df_rec['y_true'] == 1.0])

    # print("total Relevant doc found with tag 1 :", len(data_df[data_df['y_true'] == 1.0]))
    # print("total NON relevant doc found with tag 0 :", len(data_df[data_df['y_true'] == 0]))
    # print("found total of", len(df_prec), "tagged docs")

    # Calculate and print
    prec5 = metrics.precision_at_n(data_df, query_num, 5)
    prec10 = metrics.precision_at_n(data_df, query_num, 10)
    prec50 = metrics.precision_at_n(data_df, query_num, 50)
    prec_total = metrics.precision(data_df, True, query_number=query_num)
    map_of_query = metrics.map(data_df)
    recall_val = metrics.recall_single(data_df, recall_total, query_num)

    self.map_list.append(map_of_query)
    self.prec5_list.append(prec5)
    self.prec10_list.append(prec10)
    self.prec50_list.append(prec50)
    self.prec_total_list.append(prec_total)
    self.recall_list.append(recall_val)

    print()
    print("precision at 5 of query", query_num, "is :", prec5)
    print("precision at 10 of query", query_num, "is :", prec10)
    print("precision at 50 of query", query_num, "is :", prec50)
    print("precision of query", query_num, "is :", prec_total)
    print("recall of query", query_num, "is :", recall_val)
    print("map of query", query_num, "is :", map_of_query)
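# check_engine_quality() builds data_df with columns ['query', 'tweet', 'y_true'] in ranked
# order and hands it to metrics.precision_at_n / metrics.map. Those helpers live elsewhere in
# that project; the pandas sketch below is only a plausible reading of precision_at_n under
# that assumption (it divides by n, whereas the project's helper may divide by the number of
# labeled documents actually retrieved).
def _precision_at_n_sketch(data_df, query_num, n):
    top_n = data_df[data_df['query'] == query_num].head(n)
    return top_n['y_true'].sum() / float(n)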
def get_cv_evaluation_results(qid_aid_label_list, y_pred):
    predictions_dict = get_cv_ranked_predictions_dict(qid_aid_label_list, y_pred)
    logging.debug("Num of questions: %d" % (len(predictions_dict)))
    mrr_score = m.mrr(predictions_dict, 1000)
    map_score = m.map(predictions_dict) * 100
    p1_score = m.recall_of_1(predictions_dict, 1000)[0]
    return mrr_score, map_score, p1_score
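# m.mrr(predictions_dict, 1000) above is assumed to compute Mean Reciprocal Rank over the top
# 1000 candidates per question. The scorers elsewhere in this file print MRR on a 0-100 scale
# next to MAP*100, so this sketch follows that convention; it is an illustration of the assumed
# contract, not the real library implementation.
def _mrr_sketch(ranked_labels_by_qid, th=1000):
    rr_sum = 0.0
    for labels in ranked_labels_by_qid.values():
        for rank, label in enumerate(labels[:th], 1):
            if label in ("true", True, 1):
                rr_sum += 1.0 / rank
                break
    return 100.0 * rr_sum / len(ranked_labels_by_qid) if ranked_labels_by_qid else 0.0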
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=50, verbose=False, reranking_th=0.0, ignore_noanswer=False): ir, svm = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir) map_svm = metrics.map(svm) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) print "%13s %5s" %("IR", "SVM") print "MRR: %5.2f %5.2f" %(mrr_se, mrr_svm) print "MAP: %5.4f %5.4f" %(map_se, map_svm) print "AvgRec: %5.2f %5.2f" %(avg_acc1_ir, avg_acc1_svm) print "%16s %6s %14s %6s %14s %6s %12s %4s" % ("IR", "SVM", "IR", "SVM", "IR", "SVM", "IR", "SVM") for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2, a_svm2) in enumerate(zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1, acc_se2, acc_svm2), 1): print "REC-1@%02d: %6.2f %6.2f ACC@%02d: %6.2f %6.2f AC1@%02d: %6.2f %6.2f AC2@%02d: %4.0f %4.0f" %(i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2, a_svm2) print print "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks were questions have at most one correct answer)" print "ACC - accuracy, i.e. number of correct answers retrieved at rank @X normalized by the rank and the total number of questions" print "AC1 - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)" print "AC2 - the absolute number of correct answers at @X"
def get_evaluation_results(df, y_pred, skip_all_positives_and_all_negatives=True):
    predictions_dict = get_ranked_predictions_dict(
        df, y_pred,
        skip_all_positives_and_all_negatives=skip_all_positives_and_all_negatives)
    logging.debug("Num of questions: %d" % (len(predictions_dict)))
    mrr_score = m.mrr(predictions_dict, 1000)
    map_score = m.map(predictions_dict) * 100
    p1_score = m.recall_of_1(predictions_dict, 1000)[0]
    return mrr_score, map_score, p1_score
def cal_score(ref_lines, probs):
    reranking_th = 0.0
    line_count = 0
    pred_lines = defaultdict(list)
    for ref_line in ref_lines:
        qid, aid, lbl = ref_line[0], ref_line[1], ref_line[2]
        pred_lines[qid].append((lbl, probs[line_count][0], aid))
        line_count += 1

    # for qid in pred_lines.keys():
    #     candidates = pred_lines[qid]
    #     if all(relevant == "false" for relevant, _, _ in candidates):
    #         del pred_lines[qid]

    for qid in pred_lines.keys():
        pred_sorted = pred_lines[qid]
        max_score = max([score for rel, score, aid in pred_sorted])
        if max_score >= reranking_th:
            pred_sorted = sorted(pred_sorted, key=itemgetter(1), reverse=True)
        pred_lines[qid] = [rel for rel, score, aid in pred_sorted]

    MAP = metrics.map(pred_lines, 10)
    MRR = metrics.mrr(pred_lines, 10)
    return MAP, MRR
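# Hypothetical call to cal_score(); the shapes follow from how the function indexes its
# arguments (ref_line[0]=qid, ref_line[1]=aid, ref_line[2]=label, probs[i][0]=model score).
# The concrete ids and scores below are made up for illustration only.
# ref_lines = [("q1", "q1_a1", "false"), ("q1", "q1_a2", "true"), ("q2", "q2_a1", "true")]
# probs = [[0.21], [0.87], [0.55]]
# MAP, MRR = cal_score(ref_lines, probs)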
def test(engine, options):
    queries = pd.read_csv(os.path.join('data', 'queries_train.tsv'), sep='\t')
    bench_lbls = pd.read_csv(os.path.join('data', 'benchmark_lbls_train.csv'),
                             dtype={'query': int, 'tweet': str, 'y_true': int})
    q2n_relevant = bench_lbls.groupby('query')['y_true'].sum().to_dict()

    queries_results = []
    q_times = []
    for i, row in queries.iterrows():
        q_id = row['query_id']
        q_keywords = row['keywords']
        start_time = time.time()
        q_n_res, q_res = engine.search(q_keywords, options['methods'])
        end_time = time.time()
        q_time = end_time - start_time
        q_times.append(q_time)
        queries_results.extend([(q_id, str(doc_id)) for doc_id in q_res])
        if q_time > 10:
            print(f'Query time exceeded: {options}')

    queries_results = pd.DataFrame(queries_results, columns=['query', 'tweet'])
    q_results_labeled = pd.merge(queries_results, bench_lbls,
                                 on=['query', 'tweet'], how='inner',
                                 suffixes=('_result', '_bench'))

    options['max_q_time'] = max(q_times)
    options['avg_q_time'] = sum(q_times) / len(q_times)
    options['MAP'] = metrics.map(q_results_labeled)
    options['precision'] = metrics.precision(q_results_labeled)
    options['precision@5'] = metrics.precision(q_results_labeled.groupby('query').head(5))
    options['precision@10'] = metrics.precision(q_results_labeled.groupby('query').head(10))
    options['precision@50'] = metrics.precision(q_results_labeled.groupby('query').head(50))
    options['recall'] = metrics.recall(q_results_labeled, q2n_relevant)
    save_to_csv(options)
def _run_evaluation_on_selected_users(self, recommender_object, usersToEvaluate): start_time = time.time() start_time_print = time.time() results_dict = {} for cutoff in self.cutoff_list: results_dict[cutoff] = create_empty_metrics_dict( self.n_items, self.n_users, recommender_object.URM_train, self.ignore_items_ID, self.ignore_users_ID, cutoff, self.diversity_object) n_users_evaluated = 0 for test_user in usersToEvaluate: # Being the URM CSR, the indices are the non-zero column indexes relevant_items = self.get_user_relevant_items(test_user) n_users_evaluated += 1 recommended_items = recommender_object.recommend( test_user, remove_seen_flag=self.exclude_seen, cutoff=self.max_cutoff, remove_top_pop_flag=False, remove_CustomItems_flag=self.ignore_items_flag) is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True) for cutoff in self.cutoff_list: results_current_cutoff = results_dict[cutoff] is_relevant_current_cutoff = is_relevant[0:cutoff] recommended_items_current_cutoff = recommended_items[0:cutoff] results_current_cutoff[ EvaluatorMetrics.ROC_AUC.value] += roc_auc( is_relevant_current_cutoff) results_current_cutoff[ EvaluatorMetrics.PRECISION.value] += precision( is_relevant_current_cutoff, len(relevant_items)) results_current_cutoff[ EvaluatorMetrics.RECALL.value] += recall( is_relevant_current_cutoff, relevant_items) results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN. value] += recall_min_test_len( is_relevant_current_cutoff, relevant_items) results_current_cutoff[EvaluatorMetrics.MAP.value] += map( is_relevant_current_cutoff, relevant_items) results_current_cutoff[EvaluatorMetrics.MRR.value] += rr( is_relevant_current_cutoff) results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg( recommended_items_current_cutoff, relevant_items, relevance=self.get_user_test_ratings(test_user), at=cutoff) results_current_cutoff[ EvaluatorMetrics.HIT_RATE. value] += is_relevant_current_cutoff.sum() results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr( is_relevant_current_cutoff) results_current_cutoff[ EvaluatorMetrics.NOVELTY.value].add_recommendations( recommended_items_current_cutoff) results_current_cutoff[ EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations( recommended_items_current_cutoff) results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY. value].add_recommendations( recommended_items_current_cutoff) results_current_cutoff[ EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations( recommended_items_current_cutoff) results_current_cutoff[ EvaluatorMetrics.COVERAGE_USER.value].add_recommendations( recommended_items_current_cutoff, test_user) results_current_cutoff[ EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST. value].add_recommendations( recommended_items_current_cutoff) results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL. value].add_recommendations( recommended_items_current_cutoff) if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff: results_current_cutoff[ EvaluatorMetrics.DIVERSITY_SIMILARITY. value].add_recommendations( recommended_items_current_cutoff) if time.time() - start_time_print > 30 or n_users_evaluated == len( self.usersToEvaluate): print( "SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}" .format( n_users_evaluated, 100.0 * float(n_users_evaluated) / len(self.usersToEvaluate), time.time() - start_time, float(n_users_evaluated) / (time.time() - start_time))) sys.stdout.flush() sys.stderr.flush() start_time_print = time.time() return results_dict, n_users_evaluated
                                     bench_lbls, on=['query', 'tweet'],
                                     how='inner', suffixes=('_result', '_bench'))
        # q_results_labeled.rename(columns={'y_true': 'label'})
        zero_recall_qs = [q_id for q_id, rel in q2n_relevant.items()
                          if metrics.recall_single(q_results_labeled, rel, q_id) == 0]
        if len(zero_recall_qs) > 0:
            logging.warning(
                f"{engine_module}'s recall for the following queries was zero {zero_recall_qs}.")

    if q_results_labeled is not None:
        # test that MAP > 0
        results_map = metrics.map(q_results_labeled)
        logging.debug(f"{engine_module} results have MAP value of {results_map}.")
        if results_map <= 0 or results_map > 1:
            logging.error(f'{engine_module} results MAP value is out of range (0,1).')

        # test that the average across queries of precision,
        # precision@5, precision@10, precision@50, and recall
        # is in [0,1].
        prec, p5, p10, p50, recall = \
            metrics.precision(q_results_labeled), \
            metrics.precision(q_results_labeled.groupby('query').head(5)), \
            metrics.precision(q_results_labeled.groupby('query').head(10)), \
        relevant_items = test[test_user].indices
        if len(relevant_items) > 0:
            neval += 1
            #
            # TODO: Here you can write to file the recommendations for each user in the test split.
            # WARNING: there is a catch with the item idx!
            #
            # this will rank *all* items
            recommended_items = recommender.recommend(user_profile, exclude_seen=True)
            # use this to have the *top-k* recommended items (warning: this can underestimate ROC-AUC for small k)
            # recommended_items = recommender.recommend(user_profile, k=at, exclude_seen=True)
            roc_auc_ += roc_auc(recommended_items, relevant_items)
            precision_ += precision(recommended_items, relevant_items, at=at)
            recall_ += recall(recommended_items, relevant_items, at=at)
            map_ += map(recommended_items, relevant_items, at=at)
            mrr_ += rr(recommended_items, relevant_items, at=at)
            ndcg_ += ndcg(recommended_items, relevant_items, relevance=test[test_user].data, at=at)

    roc_auc_ /= neval
    precision_ /= neval
    recall_ /= neval
    map_ /= neval
    mrr_ /= neval
    ndcg_ /= neval

    logger.info('Ranking quality')
    logger.info('ROC-AUC: {:.4f}'.format(roc_auc_))
    logger.info('Precision@{}: {:.4f}'.format(at, precision_))
def stats_cv(path=".", format="trec", prefix="svm", th=50, verbose=False): mrrs_se = [] mrrs_svm = [] abs_mrrs = [] rel_mrrs = [] maps_se = [] maps_svm = [] abs_maps = [] rel_maps = [] recalls1_se = [] recalls1_svm = [] abs_recalls = [] rel_recalls = [] oracle_mrrs = [] oracle_maps = [] oracle_recs1 = [] num_folds = 0 print "%13s %5s %7s %7s" %("IR", "SVM", "(abs)", "(rel)") for fold in sorted(os.listdir(path)): currentFold = os.path.join(path, fold) if not os.path.isdir(currentFold): continue if not fold.startswith("fold"): logging.warn("Directories containing CV folds should start with 'fold'") continue print fold # Relevancy file res_fname = os.path.join(currentFold, "%s.test.res" % prefix) if not os.path.exists(res_fname): logging.error("Relevancy file not found: %s", res_fname) sys.exit(1) # Predictions file pred_fname = os.path.join(currentFold, "%s.pred" % prefix) if not os.path.exists(pred_fname): logging.error("SVM prediction file not found: %s", pred_fname) sys.exit(1) try: ir, svm = read_res_pred_files(res_fname, pred_fname, format, verbose) except: logging.error("Failed to process input files: %s %s", res_fname, pred_fname) logging.error("Check that the input file format is correct") sys.exit(1) # MRR mrr_se = metrics.mrr(ir, th) or 1 mrr_svm = metrics.mrr(svm, th) mrrs_se.append(mrr_se) mrrs_svm.append(mrr_svm) # improvement abs_mrr_diff = mrr_svm - mrr_se rel_mrr_diff = (mrr_svm - mrr_se)*100/mrr_se abs_mrrs.append(abs_mrr_diff) rel_mrrs.append(rel_mrr_diff) print "MRR: %5.2f %5.2f %+6.2f%% %+6.2f%%" % (mrr_se, mrr_svm, abs_mrr_diff, rel_mrr_diff) # MAP map_se = metrics.map(ir) or 1 map_svm = metrics.map(svm) maps_se.append(map_se) maps_svm.append(map_svm) # improvement abs_map_diff = map_svm - map_se rel_map_diff = (map_svm - map_se)*100/map_se abs_maps.append(abs_map_diff) rel_maps.append(rel_map_diff) print "MAP: %5.2f %5.2f %+6.2f%% %+6.2f%%" % (map_se, map_svm, abs_map_diff, rel_map_diff) # Recall-of-1@1 rec_se = metrics.recall_of_1(ir, th)[0] or 1 rec_svm = metrics.recall_of_1(svm, th)[0] recalls1_se.append(rec_se) recalls1_svm.append(rec_svm) # improvement abs_rec_diff = rec_svm - rec_se rel_rec_diff = (rec_svm - rec_se)*100/rec_se abs_recalls.append(abs_rec_diff) rel_recalls.append(rel_rec_diff) print "P@1: %5.2f %5.2f %+6.2f%% %+6.2f%%" % (rec_se, rec_svm, abs_rec_diff, rel_rec_diff) num_folds += 1 ''' mrr_oracle = metrics.oracle_mrr(ir, th) map_oracle = metrics.oracle_map(ir) prec_oracle = metrics.oracle_precision(ir, th)[0] rec1_oracle = metrics.oracle_recall_of_1(ir, th)[0] oracle_mrrs.append(mrr_oracle) oracle_maps.append(map_oracle) oracle_recs1.append(rec1_oracle) print "Oracle MRR: %5.2f, Oracle MAP: %5.2f, Oracle prec: %5.2f, Oracle rec@1: %5.2f" % (mrr_oracle, map_oracle, prec_oracle, rec1_oracle) ''' # mrrs avg_mrr_se, std_mrr_se = mean_and_std(mrrs_se) avg_mrr_svm, std_mrr_svm = mean_and_std(mrrs_svm) avg_abs_impr_mrr, std_abs_impr_mrr = mean_and_std(abs_mrrs) avg_rel_impr_mrr, std_rel_impr_mrr = mean_and_std(rel_mrrs) #oracle_avg_mrr, std_oracle_avg_mrr = mean_and_std(oracle_mrrs) # maps avg_map_se, std_map_se = mean_and_std(maps_se) avg_map_svm, std_map_svm = mean_and_std(maps_svm) avg_abs_impr_map, std_abs_impr_map = mean_and_std(abs_maps) avg_rel_impr_map, std_rel_impr_map = mean_and_std(rel_maps) #oracle_avg_map, std_oracle_avg_map = mean_and_std(oracle_maps) # recall avg_rec1_se, std_rec1_se = mean_and_std(recalls1_se) # se avg_rec1_svm, std_rec1_svm = mean_and_std(recalls1_svm) # svm avg_abs_impr_rec1, std_abs_impr_rec1 = mean_and_std(abs_recalls) # 
absolute avg_rel_impr_rec1, std_rel_impr_rec1 = mean_and_std(rel_recalls) # relative #oracle_avg_rec1, std_oracle_avg_rec1 = mean_and_std(oracle_recs1) FMT = u"%3s: %5.2f \u00B1 %4.2f %5.2f \u00B1 %4.2f %+6.2f%% \u00B1 %4.2f %+6.2f%% \u00B1 %4.2f" #ORACLE_FMT = u"Oracle MRR: %5.2f \u00B1 %4.2f, Oracle MAP: %5.2f \u00B1 %4.2f, Oracle P@1: %5.2f \u00B1 %4.2f" print print "Averaged over %s folds" % num_folds print "%17s %12s %14s %14s" %("IR", "SVM", "(abs)", "(rel)") print FMT % ("MRR", avg_mrr_se, std_mrr_se, avg_mrr_svm, std_mrr_svm, avg_abs_impr_mrr, std_abs_impr_mrr, avg_rel_impr_mrr, std_rel_impr_mrr) print FMT % ("MAP", avg_map_se, std_map_se, avg_map_svm, std_map_svm, avg_abs_impr_map, std_abs_impr_map, avg_rel_impr_map, std_rel_impr_map) print FMT % ("P@1", avg_rec1_se, std_rec1_se, avg_rec1_svm, std_rec1_svm, avg_abs_impr_rec1, std_abs_impr_rec1, avg_rel_impr_rec1, std_rel_impr_rec1)
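# stats_cv() aggregates per-fold scores with mean_and_std(), which is defined elsewhere in the
# scorer. A minimal sketch of the assumed behaviour; whether the original uses the population
# or sample standard deviation is a guess (population is used here).
def _mean_and_std_sketch(values):
    n = len(values)
    mean = sum(values) / float(n)
    variance = sum((v - mean) ** 2 for v in values) / float(n)
    return mean, variance ** 0.5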
def stats_cv(path=".", format="trec", prefix="svm", th=50, suf="", verbose=False, truth_file=None, ignore_noanswer=False, cut_truth_map_at_N=None): mrrs_se = [] mrrs_svm = [] abs_mrrs = [] rel_mrrs = [] maps_se = [] maps_svm = [] abs_maps = [] rel_maps = [] recalls1_se = [] recalls1_svm = [] abs_recalls = [] rel_recalls = [] num_folds = 0 truth = read_truth_file(truth_file, format, cut_truth_map_at_N) print "%13s %5s %7s %7s" % ("IR", "SVM", "(abs)", "(rel)") for fold in sorted(os.listdir(path)): currentFold = os.path.join(path, fold) if not os.path.isdir(currentFold): continue if not fold.startswith("fold"): logging.warn( "Directories containing CV folds should start with 'fold'") continue print fold # Relevancy file res_fname = os.path.join(currentFold, "%s.relevancy" % prefix) if not os.path.exists(res_fname): logging.error("Relevancy file not found: %s", res_fname) sys.exit(1) # Predictions file pred_fname = os.path.join(currentFold, "%s.pred" % (prefix + suf)) if not os.path.exists(pred_fname): logging.error("SVM prediction file not found: %s", pred_fname) sys.exit(1) try: ir, svm = read_res_pred_files(res_fname, pred_fname, format, verbose, ignore_noanswer=ignore_noanswer, truth_map=truth) except: logging.error("Failed to process input files: %s %s", res_fname, pred_fname) logging.error("Check that the input file format is correct") sys.exit(1) # MRR mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) mrrs_se.append(mrr_se) mrrs_svm.append(mrr_svm) # improvement abs_mrr_diff = mrr_svm - mrr_se rel_mrr_diff = (mrr_svm - mrr_se) * 100 / mrr_se abs_mrrs.append(abs_mrr_diff) rel_mrrs.append(rel_mrr_diff) print "MRR: %5.2f %5.2f %+6.2f%% %+6.2f%%" % ( mrr_se, mrr_svm, abs_mrr_diff, rel_mrr_diff) # MAP map_se = metrics.map(ir) map_svm = metrics.map(svm) maps_se.append(map_se) maps_svm.append(map_svm) # improvement abs_map_diff = map_svm - map_se rel_map_diff = (map_svm - map_se) * 100 / map_se abs_maps.append(abs_map_diff) rel_maps.append(rel_map_diff) print "MAP: %5.2f %5.2f %+6.2f%% %+6.2f%%" % ( map_se * 100, map_svm * 100, abs_map_diff, rel_map_diff) # Recall-of-1@1 rec_se = metrics.recall_of_1(ir, th)[0] rec_svm = metrics.recall_of_1(svm, th)[0] recalls1_se.append(rec_se) recalls1_svm.append(rec_svm) # improvement abs_rec_diff = rec_svm - rec_se rel_rec_diff = (rec_svm - rec_se) * 100 / rec_se abs_recalls.append(abs_rec_diff) rel_recalls.append(rel_rec_diff) print "P@1: %5.2f %5.2f %+6.2f%% %+6.2f%%" % ( rec_se, rec_svm, abs_rec_diff, rel_rec_diff) num_folds += 1 # mrrs avg_mrr_se, std_mrr_se = mean_and_std(mrrs_se) avg_mrr_svm, std_mrr_svm = mean_and_std(mrrs_svm) avg_abs_impr_mrr, std_abs_impr_mrr = mean_and_std(abs_mrrs) avg_rel_impr_mrr, std_rel_impr_mrr = mean_and_std(rel_mrrs) # maps avg_map_se, std_map_se = mean_and_std(maps_se) avg_map_svm, std_map_svm = mean_and_std(maps_svm) avg_abs_impr_map, std_abs_impr_map = mean_and_std(abs_maps) avg_rel_impr_map, std_rel_impr_map = mean_and_std(rel_maps) # recall avg_rec1_se, std_rec1_se = mean_and_std(recalls1_se) # se avg_rec1_svm, std_rec1_svm = mean_and_std(recalls1_svm) # svm avg_abs_impr_rec1, std_abs_impr_rec1 = mean_and_std( abs_recalls) # absolute avg_rel_impr_rec1, std_rel_impr_rec1 = mean_and_std( rel_recalls) # relative FMT = u"%3s: %5.2f \u00B1 %4.2f %5.2f \u00B1 %4.2f %+6.2f%% \u00B1 %4.2f %+6.2f%% \u00B1 %4.2f" print print "Averaged over %s folds" % num_folds print "%17s %12s %14s %14s" % ("IR", "SVM", "(abs)", "(rel)") print FMT % ("MRR", avg_mrr_se, std_mrr_se, avg_mrr_svm, std_mrr_svm, avg_abs_impr_mrr, 
std_abs_impr_mrr, avg_rel_impr_mrr, std_rel_impr_mrr) print FMT % ("MAP", avg_map_se * 100, std_map_se, avg_map_svm * 100, std_map_svm, avg_abs_impr_map, std_abs_impr_map, avg_rel_impr_map, std_rel_impr_map) print FMT % ("P@1", avg_rec1_se, std_rec1_se, avg_rec1_svm, std_rec1_svm, avg_abs_impr_rec1, std_abs_impr_rec1, avg_rel_impr_rec1, std_rel_impr_rec1) print "Table view" print " MRR MAP P@1" print u"IR %5.2f\u00B1%4.2f %5.2f\u00B1%4.2f %5.2f\u00B1%4.2f" % ( avg_mrr_se, std_mrr_se, avg_map_se * 100, std_map_se * 100, avg_rec1_se, std_rec1_se) print u"SVM %5.2f\u00B1%4.2f %5.2f\u00B1%4.2f %5.2f\u00B1%4.2f" % ( avg_mrr_svm, std_mrr_svm, avg_map_svm * 100, std_map_svm * 100, avg_rec1_svm, std_rec1_svm)
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=10, verbose=False, reranking_th=0.0, ignore_noanswer=False): ir, svm, conf_matrix = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer) # Calculate standard P, R, F1, Acc acc = 1.0 * ( conf_matrix['true']['true'] + conf_matrix['false']['false']) / ( conf_matrix['true']['true'] + conf_matrix['false']['false'] + conf_matrix['true']['false'] + conf_matrix['false']['true']) p = 0 if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0: p = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['false']['true']) r = 0 if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0: r = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['true']['false']) f1 = 0 if (p + r) > 0: f1 = 2.0 * p * r / (p + r) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir, th) map_svm = metrics.map(svm, th) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) print("") print("*** Official score (MAP for SYS): %5.4f" % (map_svm)) print("") print("") print("******************************") print("*** Classification results ***") print("******************************") print("") print("Acc = %5.4f" % (acc)) print("P = %5.4f" % (p)) print("R = %5.4f" % (r)) print("F1 = %5.4f" % (f1)) print("") print("") print("********************************") print("*** Detailed ranking results ***") print("********************************") print("") print("IR -- Score for the output of the IR system (baseline).") print("SYS -- Score for the output of the tested system.") print("") print("%13s %5s" % ("IR", "SYS")) print("MAP : %5.4f %5.4f" % (map_se, map_svm)) print("AvgRec: %5.4f %5.4f" % (avg_acc1_ir, avg_acc1_svm)) print("MRR : %6.2f %6.2f" % (mrr_se, mrr_svm)) print("%16s %6s %14s %6s %14s %6s %12s %4s" % ("IR", "SYS", "IR", "SYS", "IR", "SYS", "IR", "SYS")) for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2, a_svm2) in enumerate( zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1, acc_se2, acc_svm2), 1): print( "REC-1@%02d: %6.2f %6.2f ACC@%02d: %6.2f %6.2f AC1@%02d: %6.2f %6.2f AC2@%02d: %4.0f %4.0f" % (i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2, a_svm2)) print() print( "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks where questions have at most one correct answer)" ) print( "ACC - accuracy, i.e., number of correct answers retrieved at rank @X normalized by the rank and the total number of questions" ) print( "AC1 - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)" ) print("AC2 - the absolute number of correct answers at @X")
def _run_evaluation_on_selected_users(self, recommender_object, usersToEvaluate, block_size=1000): start_time = time.time() start_time_print = time.time() results_dict = {} for cutoff in self.cutoff_list: results_dict[cutoff] = create_empty_metrics_dict( self.n_items, self.n_users, recommender_object.get_URM_train(), self.ignore_items_ID, self.ignore_users_ID, cutoff, self.diversity_object) n_users_evaluated = 0 # Start from -block_size to ensure it to be 0 at the first block user_batch_start = 0 user_batch_end = 0 while user_batch_start < len(self.usersToEvaluate): user_batch_end = user_batch_start + block_size user_batch_end = min(user_batch_end, len(usersToEvaluate)) test_user_batch_array = np.array( usersToEvaluate[user_batch_start:user_batch_end]) user_batch_start = user_batch_end # Compute predictions for a batch of users using vectorization, much more efficient than computing it one at a time recommended_items_batch_list = recommender_object.recommend( test_user_batch_array, remove_seen_flag=self.exclude_seen, cutoff=self.max_cutoff, remove_top_pop_flag=False, remove_CustomItems_flag=self.ignore_items_flag) # Compute recommendation quality for each user in batch for batch_user_index in range(len(recommended_items_batch_list)): user_id = test_user_batch_array[batch_user_index] recommended_items = recommended_items_batch_list[ batch_user_index] # Being the URM CSR, the indices are the non-zero column indexes relevant_items = self.get_user_relevant_items(user_id) is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True) n_users_evaluated += 1 for cutoff in self.cutoff_list: results_current_cutoff = results_dict[cutoff] is_relevant_current_cutoff = is_relevant[0:cutoff] recommended_items_current_cutoff = recommended_items[ 0:cutoff] results_current_cutoff[ EvaluatorMetrics.ROC_AUC.value] += roc_auc( is_relevant_current_cutoff) results_current_cutoff[ EvaluatorMetrics.PRECISION.value] += precision( is_relevant_current_cutoff, len(relevant_items)) results_current_cutoff[ EvaluatorMetrics.RECALL.value] += recall( is_relevant_current_cutoff, relevant_items) results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN. value] += recall_min_test_len( is_relevant_current_cutoff, relevant_items) results_current_cutoff[EvaluatorMetrics.MAP.value] += map( is_relevant_current_cutoff, relevant_items) results_current_cutoff[EvaluatorMetrics.MRR.value] += rr( is_relevant_current_cutoff) results_current_cutoff[ EvaluatorMetrics.NDCG.value] += ndcg( recommended_items_current_cutoff, relevant_items, relevance=self.get_user_test_ratings(user_id), at=cutoff) results_current_cutoff[ EvaluatorMetrics.HIT_RATE. value] += is_relevant_current_cutoff.sum() results_current_cutoff[ EvaluatorMetrics.ARHR.value] += arhr( is_relevant_current_cutoff) results_current_cutoff[ EvaluatorMetrics.NOVELTY.value].add_recommendations( recommended_items_current_cutoff) results_current_cutoff[ EvaluatorMetrics.DIVERSITY_GINI. value].add_recommendations( recommended_items_current_cutoff) results_current_cutoff[ EvaluatorMetrics.SHANNON_ENTROPY. value].add_recommendations( recommended_items_current_cutoff) results_current_cutoff[ EvaluatorMetrics.COVERAGE_ITEM. value].add_recommendations( recommended_items_current_cutoff) results_current_cutoff[ EvaluatorMetrics.COVERAGE_USER. value].add_recommendations( recommended_items_current_cutoff, user_id) results_current_cutoff[ EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST. 
value].add_recommendations( recommended_items_current_cutoff) results_current_cutoff[ EvaluatorMetrics.DIVERSITY_HERFINDAHL. value].add_recommendations( recommended_items_current_cutoff) if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff: results_current_cutoff[ EvaluatorMetrics.DIVERSITY_SIMILARITY. value].add_recommendations( recommended_items_current_cutoff) if time.time( ) - start_time_print > 30 or n_users_evaluated == len( self.usersToEvaluate): print( "SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}" .format( n_users_evaluated, 100.0 * float(n_users_evaluated) / len(self.usersToEvaluate), time.time() - start_time, float(n_users_evaluated) / (time.time() - start_time))) sys.stdout.flush() sys.stderr.flush() start_time_print = time.time() return results_dict, n_users_evaluated
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=10, verbose=False, reranking_th=0.0, ignore_noanswer=False): ir, svm, conf_matrix = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer) # Calculate standard P, R, F1, Acc acc = 1.0 * ( conf_matrix['true']['true'] + conf_matrix['false']['false']) / ( conf_matrix['true']['true'] + conf_matrix['false']['false'] + conf_matrix['true']['false'] + conf_matrix['false']['true']) p = 0 if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0: p = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['false']['true']) r = 0 if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0: r = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['true']['false']) f1 = 0 if (p + r) > 0: f1 = 2.0 * p * r / (p + r) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir, th) map_svm = metrics.map(svm, th) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) #print "" #print "*** Official score (MAP for SYS): %5.4f" %(map_svm) #print "" #print "" #print "******************************" #print "*** Classification results ***" #print "******************************" #print "" #print "Acc = %5.4f" %(acc) #print "P = %5.4f" %(p) #print "R = %5.4f" %(r) #print "F1 = %5.4f" %(f1) #print "" #print "" #print "********************************" #print "*** Detailed ranking results ***" #print "********************************" #print "" #print "IR -- Score for the output of the IR system (baseline)." #print "SYS -- Score for the output of the tested system." #print "" #print "%13s %5s" %("IR", "SYS") #print "MAP : %5.4f %5.4f" %(map_se, map_svm) #print "AvgRec: %5.4f %5.4f" %(avg_acc1_ir, avg_acc1_svm) #print "MRR : %6.2f %6.2f" %(mrr_se, mrr_svm) print "MAP : %5.4f\tMRR : %5.4f\tAvgRec: %5.4f" % (map_svm, mrr_svm, avg_acc1_svm) #print "Acc : %5.4f" %(acc) #print "P : %5.4f" %(p) #print "R : %5.4f" %(r) #print "F1 : %5.4f" %(f1) """
def evaluateRecommender(self, recommender_object): """ :param recommender_object: the trained recommender object, a Recommender subclass :param URM_test_list: list of URMs to test the recommender against, or a single URM object :param cutoff_list: list of cutoffs to be use to report the scores, or a single cutoff """ results_dict = {} for cutoff in self.cutoff_list: results_dict[cutoff] = create_empty_metrics_dict( self.n_items, self.n_users, recommender_object.URM_train, self.ignore_items_ID, self.ignore_users_ID, cutoff, self.diversity_object) start_time = time.time() start_time_print = time.time() n_eval = 0 self.__all_items = np.arange(0, self.n_items, dtype=np.int) self.__all_items = set(self.__all_items) if self.ignore_items_flag: recommender_object.set_items_to_ignore(self.ignore_items_ID) for test_user in self.usersToEvaluate: # Being the URM CSR, the indices are the non-zero column indexes relevant_items = self.get_user_relevant_items(test_user) n_eval += 1 self.user_specific_remove_items(recommender_object, test_user) # recommended_items = recommender_object.recommend(np.array(test_user), remove_seen_flag=self.exclude_seen, # cutoff = self.max_cutoff, remove_top_pop_flag=False, remove_CustomItems_flag=self.ignore_items_flag) recommended_items = recommender_object.recommend( np.atleast_1d(test_user), remove_seen_flag=self.exclude_seen, cutoff=self.max_cutoff, remove_top_pop_flag=False, remove_CustomItems_flag=self.ignore_items_flag) recommended_items = np.array(recommended_items[0]) recommender_object.reset_items_to_ignore() is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True) for cutoff in self.cutoff_list: results_current_cutoff = results_dict[cutoff] is_relevant_current_cutoff = is_relevant[0:cutoff] recommended_items_current_cutoff = recommended_items[0:cutoff] results_current_cutoff[ EvaluatorMetrics.ROC_AUC.value] += roc_auc( is_relevant_current_cutoff) results_current_cutoff[ EvaluatorMetrics.PRECISION.value] += precision( is_relevant_current_cutoff, len(relevant_items)) results_current_cutoff[ EvaluatorMetrics.RECALL.value] += recall( is_relevant_current_cutoff, relevant_items) results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN. value] += recall_min_test_len( is_relevant_current_cutoff, relevant_items) results_current_cutoff[EvaluatorMetrics.MAP.value] += map( is_relevant_current_cutoff, relevant_items) results_current_cutoff[EvaluatorMetrics.MRR.value] += rr( is_relevant_current_cutoff) results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg( recommended_items_current_cutoff, relevant_items, relevance=self.get_user_test_ratings(test_user), at=cutoff) results_current_cutoff[ EvaluatorMetrics.HIT_RATE. value] += is_relevant_current_cutoff.sum() results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr( is_relevant_current_cutoff) results_current_cutoff[ EvaluatorMetrics.NOVELTY.value].add_recommendations( recommended_items_current_cutoff) results_current_cutoff[ EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations( recommended_items_current_cutoff) results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY. value].add_recommendations( recommended_items_current_cutoff) results_current_cutoff[ EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations( recommended_items_current_cutoff) results_current_cutoff[ EvaluatorMetrics.COVERAGE_USER.value].add_recommendations( recommended_items_current_cutoff, test_user) results_current_cutoff[ EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST. 
value].add_recommendations( recommended_items_current_cutoff) results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL. value].add_recommendations( recommended_items_current_cutoff) if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff: results_current_cutoff[ EvaluatorMetrics.DIVERSITY_SIMILARITY. value].add_recommendations( recommended_items_current_cutoff) if time.time() - start_time_print > 30 or n_eval == len( self.usersToEvaluate): print( "SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}" .format(n_eval, 100.0 * float(n_eval) / len(self.usersToEvaluate), time.time() - start_time, float(n_eval) / (time.time() - start_time))) sys.stdout.flush() sys.stderr.flush() start_time_print = time.time() if (n_eval > 0): for cutoff in self.cutoff_list: results_current_cutoff = results_dict[cutoff] for key in results_current_cutoff.keys(): value = results_current_cutoff[key] if isinstance(value, Metrics_Object): results_current_cutoff[key] = value.get_metric_value() else: results_current_cutoff[key] = value / n_eval precision_ = results_current_cutoff[ EvaluatorMetrics.PRECISION.value] recall_ = results_current_cutoff[EvaluatorMetrics.RECALL.value] if precision_ + recall_ != 0: results_current_cutoff[EvaluatorMetrics.F1.value] = 2 * ( precision_ * recall_) / (precision_ + recall_) else: print( "WARNING: No users had a sufficient number of relevant items") if self.ignore_items_flag: recommender_object.reset_items_to_ignore() results_run_string = self.get_result_string(results_dict) return (results_dict, results_run_string)
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=10, verbose=False, reranking_th=0.0, ignore_noanswer=False): ir, svm, conf_matrix = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer) # Calculate standard P, R, F1, Acc acc = 1.0 * (conf_matrix['true']['true'] + conf_matrix['false']['false']) / (conf_matrix['true']['true'] + conf_matrix['false']['false'] + conf_matrix['true']['false'] + conf_matrix['false']['true']) p = 0 if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0: p = 1.0 * (conf_matrix['true']['true']) / (conf_matrix['true']['true'] + conf_matrix['false']['true']) r = 0 if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0: r = 1.0 * (conf_matrix['true']['true']) / (conf_matrix['true']['true'] + conf_matrix['true']['false']) f1 = 0 if (p + r) > 0: f1 = 2.0 * p * r / (p + r) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir, th) map_svm = metrics.map(svm, th) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) print ("") print ("*** Official score (MAP for SYS): %5.4f" %(map_svm)) print ("") print ("") print( "******************************") print( "*** Classification results ***") print( "******************************") print( "") print( "Acc = %5.4f" %(acc)) print( "P = %5.4f" %(p)) print( "R = %5.4f" %(r)) print( "F1 = %5.4f" %(f1)) print( "") print( "") print( "********************************") print( "*** Detailed ranking results ***") print( "********************************") print( "") print( "IR -- Score for the output of the IR system (baseline).") print( "SYS -- Score for the output of the tested system.") print( "") print( "%13s %5s" %("IR", "SYS")) print( "MAP : %5.4f %5.4f" %(map_se, map_svm)) print( "AvgRec: %5.4f %5.4f" %(avg_acc1_ir, avg_acc1_svm)) print( "MRR : %6.2f %6.2f" %(mrr_se, mrr_svm)) print( "%16s %6s %14s %6s %14s %6s %12s %4s" % ("IR", "SYS", "IR", "SYS", "IR", "SYS", "IR", "SYS")) for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2, a_svm2) in enumerate(zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1, acc_se2, acc_svm2), 1): print( "REC-1@%02d: %6.2f %6.2f ACC@%02d: %6.2f %6.2f AC1@%02d: %6.2f %6.2f AC2@%02d: %4.0f %4.0f" %(i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2, a_svm2)) print( "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks where questions have at most one correct answer)") print( "ACC - accuracy, i.e., number of correct answers retrieved at rank @X normalized by the rank and the total number of questions") print( "AC1 - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)") print( "AC2 - the absolute number of correct answers at @X") return map_svm
def calcQueryEvaluation(q_res, num_q):
    queries_results = pd.DataFrame(q_res, columns=['query', 'tweet'])
    if bench_lbls is not None and len(queries_results) > 0:
        q_results_labeled = pd.merge(queries_results, bench_lbls,
                                     on=['query', 'tweet'], how='inner',
                                     suffixes=('_result', '_bench'))
        # q_results_labeled.rename(columns={'y_true': 'label'})
        zero_recall_qs = [q_id for q_id, rel in q2n_relevant.items()
                          if metrics.recall_single(q_results_labeled, rel, q_id) == 0]
        if len(zero_recall_qs) > 0:
            logging.warning(
                f"{engine_module}'s recall for the following queries was zero {zero_recall_qs}.")

        if q_results_labeled is not None:
            # test that MAP > 0
            results_map = metrics.map(q_results_labeled)
            logging.debug(f"{engine_module} results have MAP value of {results_map}.")
            if results_map <= 0 or results_map > 1:
                logging.error(f'{engine_module} results MAP value is out of range (0,1).')

            prec, p5, p10, p50, recall = \
                metrics.precision(q_results_labeled), \
                metrics.precision(q_results_labeled.groupby('query').head(5)), \
                metrics.precision(q_results_labeled.groupby('query').head(10)), \
                metrics.precision(q_results_labeled.groupby('query').head(50)), \
                metrics.recall_single(q_results_labeled, q2n_relevant[num_q], num_q)

            # logging.debug(f"{engine_module} on query {num_q} results produced average precision of {prec}.")
            # logging.debug(f"{engine_module} on query {num_q} results produced average precision@5 of {p5}.")
            # logging.debug(f"{engine_module} on query {num_q} results produced average precision@10 of {p10}.")
            # logging.debug(f"{engine_module} on query {num_q} results produced average precision@50 of {p50}.")
            # logging.debug(f"{engine_module} on query {num_q} results produced average recall of {recall}.")

            if prec < 0 or prec > 1:
                logging.error(f"The average precision for {engine_module} is out of range [0,1].")
            if p5 < 0 or p5 > 1:
                logging.error(f"The average precision@5 for {engine_module} is out of range [0,1].")
            if p10 < 0 or p10 > 1:
                logging.error(f"The average precision@10 for {engine_module} is out of range [0,1].")
            if p50 < 0 or p50 > 1:
                logging.error(f"The average precision@50 for {engine_module} is out of range [0,1].")
            if recall < 0 or recall > 1:
                logging.error(f"The average recall for {engine_module} is out of range [0,1].")

            precision_list.append(prec)
            precisiion_5.append(p5)
            precisiom_10.append(p10)
            presicion_50.append(p50)
            recall_list.append(recall)
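# calcQueryEvaluation() and the test harness above call metrics.recall_single(df, n_relevant, q_id),
# apparently the fraction of a query's relevant documents that appear among its labeled results.
# That helper is not shown here; a pandas sketch under that assumption:
def _recall_single_sketch(q_results_labeled, n_relevant, q_id):
    found = q_results_labeled[(q_results_labeled['query'] == q_id) &
                              (q_results_labeled['y_true'] == 1)]
    return len(found) / float(n_relevant) if n_relevant else 0.0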