def ndcg(self, hit_list, total_known_DTI):
    """NDCG of a binary hit list against the number of known DTIs."""
    if total_known_DTI == 0:
        return float('nan')
    # Ideal ranking: all known interactions first, then zeros.
    if total_known_DTI >= len(hit_list):
        ideal_list = [1] * len(hit_list)
    else:
        ideal_list = ([1] * total_known_DTI
                      + [0] * (len(hit_list) - total_known_DTI))
    return (rank.dcg_at_k(hit_list, len(hit_list), 1)
            / rank.dcg_at_k(ideal_list, len(hit_list), 1))
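# The snippets in this section lean on the widely circulated rank_metrics
# helpers (imported variously as `rank`, `rank_metrics`, or `rm`). A minimal
# sketch of dcg_at_k, assuming the canonical implementation with its two
# weighting methods; the module actually used here may differ in detail:
import numpy as np

def dcg_at_k(r, k, method=0):
    """DCG of relevance list r at cutoff k.
    method=0 leaves the first result unweighted; method=1 discounts
    every position i (0-based) by log2(i + 2)."""
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        raise ValueError('method must be 0 or 1.')
    return 0.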
def generate_reward(gold_index_list, answer_index_list):
    reward = 0
    ap = 0
    reciprocal_rank = 0
    answer_list = list(answer_index_list)
    size = len(answer_index_list)
    true = sum(gold_index_list > 0)
    # Build the relevance vector in answer order: inp[i] is the gold
    # relevance of the i-th returned answer.
    inp = np.zeros(size)
    for rank, val in enumerate(gold_index_list):
        if val and rank in answer_list:
            inp[answer_list.index(rank)] = val
    maxk = sum(inp > 0)
    if true:
        # Scale AP by the fraction of relevant items actually retrieved.
        ap = average_precision(inp) * (maxk / true)
        reciprocal_rank = mean_reciprocal_rank([inp])
    ndcg = ndcg_at_k(inp, min(10, size))
    dcg_five = dcg_at_k(inp, 5)
    reward = (ap + reciprocal_rank + ndcg + dcg_five) / 4
    ranks = [1, 3, 5, 10]
    reward_tuple = [reward, ap, reciprocal_rank, ndcg, dcg_five]
    for r in ranks:
        reward_tuple.append(precision_at_k(inp, min(r, len(inp))))
    for r in ranks:
        reward_tuple.append(ndcg_at_k(inp, min(r, len(inp))))
    return reward_tuple
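# A hedged usage sketch; both arrays below are made up for illustration.
# gold_index_list holds relevance per item index, answer_index_list the
# ranked item indices a model returned:
# gold = np.array([1, 0, 2, 0, 1])     # items 0, 2, 4 are relevant
# answers = [2, 1, 0, 4, 3]            # model's ranking of item ids
# scores = generate_reward(gold, answers)
# scores[0] is the averaged reward; scores[5:9] are precision@{1,3,5,10}.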
def evaluate_results(qids_rs, Y, k):
    values = defaultdict(list)
    for qid, r in qids_rs:
        gold = harvest(Y, qid)
        gold_topk = gold[argtopk(gold, k)]
        R = np.count_nonzero(gold_topk)
        # Real NDCG against the ideal top-k of the gold standard.
        idcg = rm.dcg_at_k(gold_topk, k)
        ndcg = rm.dcg_at_k(r, k) / idcg
        values["ndcg"].append(ndcg)  # Verified
        # MAP@k
        ap = rm.average_precision(r)
        values["MAP"].append(ap)
        # MRR, computed by hand from the first nonzero entry.
        ind = np.asarray(r).nonzero()[0]
        mrr = (1. / (ind[0] + 1)) if ind.size else 0.
        values["MRR"].append(mrr)
        # R-precision: the remainder is chopped off before computing R,
        # so no extra clamping of R to k is needed.
        recall = rm.recall(r, R)
        values["recall"].append(recall)
        precision = rm.precision(r)
        values["precision"].append(precision)
        f1 = f1_score(precision, recall)
        values["f1_score"].append(f1)
        # Safe variant does not fail if len(r) < k.
        p_at_5 = rm.safe_precision_at_k(r, 5)
        values["precision@5"].append(p_at_5)
        p_at_10 = rm.safe_precision_at_k(r, 10)
        values["precision@10"].append(p_at_10)
    return values
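# The f1_score called above is assumed to be the usual harmonic mean with
# a zero guard; a minimal sketch (name and guard behaviour are assumptions):
def f1_score(precision, recall):
    """Harmonic mean of precision and recall; 0 when both are 0."""
    if precision + recall == 0:
        return 0.
    return 2 * precision * recall / (precision + recall)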
def generate_reward(gold_index_list, answer_index_list, reward_type):
    reward = 0
    ap = 0.
    reciprocal_rank = 0
    answer_list = list(answer_index_list)
    size = len(answer_index_list)
    true = sum(gold_index_list > 0)
    # Relevance vector in answer order, as above.
    inp = np.zeros(size)
    for rank, val in enumerate(gold_index_list):
        if val and rank in answer_list:
            inp[answer_list.index(rank)] = val
    maxk = sum(inp > 0)
    if true:
        ap = average_precision(inp) * (maxk / true)
        reciprocal_rank = mean_reciprocal_rank([inp])
    ndcg = ndcg_at_k(inp, min(10, size))
    dcg_five = dcg_at_k(inp, 5)
    # Dispatch to one of the module-level reward functions (1-indexed).
    reward = rewards[reward_type - 1](inp, ap, reciprocal_rank, ndcg, dcg_five)
    return reward, ap, reciprocal_rank, ndcg, dcg_five
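# Example call, assuming the module-level `rewards` dispatch table sketched
# after rew4 further below (argument values are illustrative):
# reward, ap, rr, ndcg10, dcg5 = generate_reward(
#     np.array([0, 1, 0, 1]), [1, 3, 0, 2], reward_type=4)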
def summarize(self):
    """Give summary statistics about the tournament."""
    res = self.run()
    # The champion should be undefeated.
    champ = list(np.where(res.strength == max(res.strength))[0])
    copeland = (res.wins[champ] == self.n_rounds)
    # Top-k ranks by true strength vs. observed wins.
    ranks = pd.DataFrame(data=np.transpose([
        res.strength.rank(ascending=False),
        res.wins.rank(ascending=False),
        res.wins
    ]), columns=["str_rank", "win_rank", "wins"])
    ranks['relevant'] = ranks['str_rank'] <= self.k
    borda = (ranks.win_rank[champ] == ranks.win_rank.min())
    top_k_df = ranks.loc[ranks['str_rank'] <= self.k]
    top_k = sum(top_k_df['wins'] >= self.n_rounds - 2) / self.k
    tau, k_p = scipy.stats.kendalltau(ranks.str_rank, ranks.win_rank)
    rho, sp_p = scipy.stats.spearmanr(ranks.str_rank, ranks.win_rank)
    # sort_values returns a copy, so assign it back; the rank metrics
    # below need the relevance vector ordered by win rank.
    ranks = ranks.sort_values(by="win_rank")
    # Using rank_metrics on the win-ordered relevance vector.
    rel_vec = ranks.relevant.values
    prec = rank_metrics.r_precision(rel_vec)
    prec_at_k = rank_metrics.precision_at_k(rel_vec, self.k)
    avg_prec = rank_metrics.average_precision(rel_vec)
    dcg = rank_metrics.dcg_at_k(rel_vec, self.k)
    ndcg = rank_metrics.ndcg_at_k(rel_vec, self.k)
    df = pd.DataFrame(data=[[
        int(copeland), int(borda), float(top_k), prec, prec_at_k,
        avg_prec, dcg, ndcg, float(tau), float(rho)
    ]], columns=[
        'undef_champ', 'top_champ', 'top_k_found', 'precision',
        'precision_at_k', 'avg_prec', 'dcg', 'ndcg', 'tau', 'rho'
    ])
    return df
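# Why the sort above matters: rank metrics read the relevance vector in
# rank order, so the same relevant items score differently by position.
# A quick check, assuming the canonical rank_metrics precision_at_k:
import rank_metrics
assert rank_metrics.precision_at_k([1, 1, 0, 0], 2) == 1.0
assert rank_metrics.precision_at_k([0, 0, 1, 1], 2) == 0.0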
def generate_reward(gold_index_list, answer_index_list, reward_type=1):
    reward = 0
    ap = 0
    reciprocal_rank = 0
    answer_list = list(deepcopy(answer_index_list))
    size = len(answer_index_list)
    true = sum(gold_index_list)
    # Binary gold labels; matched answers get a fixed relevance of 2.
    inp = np.zeros(size)
    for rank, val in enumerate(gold_index_list):
        if val and rank in answer_list:
            inp[answer_list.index(rank)] = 2
    if true:
        ap = average_precision(inp) * (sum(inp > 0) / true)
        reciprocal_rank = mean_reciprocal_rank([inp])
    # reward_type 1: mean of AP and MRR; reward_type 2: full-length DCG.
    rewards = [(ap + reciprocal_rank) / 2, dcg_at_k(inp, size)]
    return rewards[reward_type - 1], ap, reciprocal_rank, (inp[0] > 0)
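# Usage sketch (argument values are illustrative): the last return value
# flags a hit at rank 1.
# r, ap, rr, hit_at_1 = generate_reward([0, 1, 1], [1, 0, 2], reward_type=2)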
import sys

import numpy as np
import rank_metrics

# Per-query binary relevance vectors, one row per query.
relevanceVector = np.loadtxt(
    open(sys.argv[1] + "/rv/relevanceVector_" + sys.argv[2]), delimiter=" ")
f = open(sys.argv[1] + '/em/evalMetrics_' + sys.argv[2], 'w')
for k in range(1, 16):
    # Sums over all queries (not averages) at each cutoff k.
    total_precision_k = 0
    total_dcg_k = 0
    total_ndcg_k = 0
    for row in relevanceVector:
        total_precision_k += rank_metrics.precision_at_k(row, k)
        total_dcg_k += rank_metrics.dcg_at_k(row, k, 0)
        total_ndcg_k += rank_metrics.ndcg_at_k(row, k, 0)
    f.write("precision@" + str(k) + ": " + str(total_precision_k) + "\n")
    f.write("dcg@" + str(k) + ": " + str(total_dcg_k) + "\n")
    f.write("ndcg@" + str(k) + ": " + str(total_ndcg_k) + "\n")
mrr = rank_metrics.mean_reciprocal_rank(relevanceVector)
f.write("Mean Reciprocal Rank: " + str(mrr) + "\n")
maP = rank_metrics.mean_average_precision(relevanceVector)
f.write("Mean Average Precision: " + str(maP) + "\n")
f.close()
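# Expected invocation, inferred from the paths above (the script name is
# a guess):
# python evalMetrics.py <base_dir> <run_id>
# reads  <base_dir>/rv/relevanceVector_<run_id>  (space-separated rows)
# writes <base_dir>/em/evalMetrics_<run_id>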
def test_dcg(self):
    r = [3, 2, 3, 0, 1, 2]
    self.assertEqual(dcg_at_k(r, 1, method=0), 3)
    # 6.861 is the log2-discounted sum of all six entries, i.e. the
    # method=1 weighting; method=0 would give about 8.097 here.
    self.assertAlmostEqual(dcg_at_k(r, 6, method=1), 6.861, places=3)
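# Hand verification of the expected value, assuming the canonical
# rank_metrics method=1 weighting rel_i / log2(i + 2):
import math
r = [3, 2, 3, 0, 1, 2]
dcg = sum(val / math.log2(i + 2) for i, val in enumerate(r))
assert abs(dcg - 6.861) < 1e-3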
def test_two_dcg_for_binary_relevance(self):
    # This assumes an implementation whose two methods differ only in the
    # gain function (rel vs. 2**rel - 1); with 0/1 relevance those gains
    # coincide, so both DCG variants should agree.
    r = [1, 1, 0, 0, 1, 1, 0, 1]
    self.assertEqual(dcg_at_k(r, 5, method=0), dcg_at_k(r, 5, method=1))
    self.assertEqual(dcg_at_k(r, 4, method=0), dcg_at_k(r, 4, method=1))
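# The identity behind the test: for binary relevance the exponential gain
# reduces to the linear one.
for rel in (0, 1):
    assert 2 ** rel - 1 == rel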
def evaluate(self, X, Y, k=20, verbose=0, replacement=0, n_jobs=1):
    """
    :X: [(qid, str)] query id, query string pairs
    :Y: pandas data series with (qid, docid) index, or [dict]
    :k: Limit the result for all metrics to this value; the models are
    also given a hint of how many they should return.
    :replacement: 0 means that (query, doc) pairs not prevalent in Y will
    not be considered relevant; None means that those are not considered
    (skipped).
    """
    # if n_jobs > 1:
    #     return process_and_evaluate(self, X, Y, k, n_jobs)
    values = defaultdict(list)
    for qid, query in X:
        # Execute the query.
        if verbose > 0:
            print(qid, ":", query)
        t0 = timer()
        # If replacement is None, we need to drop after querying.
        result = self.query(query, k=(None if replacement is None else k))
        values["time_per_query"].append(timer() - t0)
        # Soak the generator.
        scored_result = [harvest(Y, qid, docid, replacement)
                         for docid in result]
        if replacement is None:
            scored_result, notfound = filter_none(scored_result)
            values["gold_not_found"].append(notfound)
        if k is not None:
            # Don't let the models cheat by returning more than k.
            r = scored_result[:k]
        else:
            # If k is None, consider all.
            r = scored_result
        gold = harvest(Y, qid)
        topk_indices = argtopk(gold, k)
        # print(topk_indices, file=sys.stderr)  # debug print, disabled
        gold_topk = gold[topk_indices]
        R = np.count_nonzero(gold_topk)
        if verbose > 0:
            print("Retrieved {} relevant out of {} possible."
                  .format(np.count_nonzero(r), R))
        # Real NDCG against the ideal top-k of the gold standard.
        idcg = rm.dcg_at_k(gold_topk, k)
        ndcg = rm.dcg_at_k(scored_result, k) / idcg
        values["ndcg"].append(ndcg)  # Verified
        # MAP@k
        ap = rm.average_precision(r)
        values["MAP"].append(ap)
        # MRR, computed by hand from the first nonzero entry.
        ind = np.asarray(r).nonzero()[0]
        mrr = (1. / (ind[0] + 1)) if ind.size else 0.
        values["MRR"].append(mrr)
        # R-precision: the remainder is chopped off before computing R,
        # so no extra clamping of R to k is needed.
        recall = rm.recall(r, R)
        values["recall"].append(recall)
        precision = rm.precision(r)
        values["precision"].append(precision)
        f1 = f1_score(precision, recall)
        values["f1_score"].append(f1)
        # Safe variant does not fail if len(r) < k.
        p_at_5 = rm.safe_precision_at_k(r, 5)
        values["precision@5"].append(p_at_5)
        p_at_10 = rm.safe_precision_at_k(r, 10)
        values["precision@10"].append(p_at_10)
        if verbose > 0:
            print("AP: {:.4f}".format(ap))
            print("RR: {:.4f}".format(mrr))
            print("NDCG: {:.4f}".format(ndcg))
    return values
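# Usage sketch (names are assumptions; any retrieval object exposing
# query() alongside this evaluate() would do):
# X = [(0, "kinase inhibitor"), (1, "dna binding")]
# Y = {...}                      # gold relevance keyed by (qid, docid)
# values = model.evaluate(X, Y, k=20, verbose=1)
# print(np.mean(values["MAP"]), np.mean(values["ndcg"]))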
def rew4(inp, ap, reciprocal_rank, ndcg, dcg_five):
    # Mean of the DCGs at cutoffs 5, 3 and 1.
    return (dcg_five + dcg_at_k(inp, 3) + dcg_at_k(inp, 1)) / 3
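# The `rewards` table that generate_reward indexes with reward_type - 1.
# rew4 is the only member shown in this section; the other entries are
# hypothetical stand-ins patterned on the reward formulas used above:
def rew1(inp, ap, reciprocal_rank, ndcg, dcg_five):
    return (ap + reciprocal_rank) / 2                     # assumption

def rew2(inp, ap, reciprocal_rank, ndcg, dcg_five):
    return (ap + reciprocal_rank + ndcg + dcg_five) / 4   # assumption

def rew3(inp, ap, reciprocal_rank, ndcg, dcg_five):
    return ndcg                                           # assumption

rewards = [rew1, rew2, rew3, rew4]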