def generate_reward(gold_index_list, answer_index_list):
    reward = 0
    ap = 0
    reciprocal_rank = 0
    answer_list = list(answer_index_list)
    size = len(answer_index_list)
    true = sum(gold_index_list > 0)
    # Re-order the gold relevance grades into the rank order of the answers.
    inp = np.zeros(size)
    for rank, val in enumerate(gold_index_list):
        if val and rank in answer_list:
            inp[answer_list.index(rank)] = val
    maxk = sum(inp > 0)
    if true:
        ap = average_precision(inp) * (maxk / true)
        reciprocal_rank = mean_reciprocal_rank([inp])
    ndcg = ndcg_at_k(inp, min(10, size))
    dcg_five = dcg_at_k(inp, 5)
    reward = (ap + reciprocal_rank + ndcg + dcg_five) / 4
    ranks = [1, 3, 5, 10]
    reward_tuple = [reward, ap, reciprocal_rank, ndcg, dcg_five]
    for r in ranks:
        reward_tuple.append(precision_at_k(inp, min(r, len(inp))))
    for r in ranks:
        reward_tuple.append(ndcg_at_k(inp, min(r, len(inp))))
    return reward_tuple
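# Illustrative usage (not from the original source): a minimal sketch of calling
# generate_reward, assuming numpy is imported as np and the rank_metrics helpers
# used above (average_precision, mean_reciprocal_rank, ndcg_at_k, dcg_at_k,
# precision_at_k) are in scope; the gold grades and ranked answer indices are made up.
gold = np.array([2, 0, 1, 0])   # candidates 0 and 2 are relevant (grades 2 and 1)
answer = [2, 0, 3, 1]           # system ranking over candidate indices
reward_tuple = generate_reward(gold, answer)
# reward_tuple holds [reward, AP, MRR, NDCG@min(10, n), DCG@5] followed by
# P@{1,3,5,10} and NDCG@{1,3,5,10}.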
def evaluate(model, data, logdir, epoch, out_f, gpu):
    get_embedding(model, data, logdir, gpu, test=False)
    img_embeddings, img_fns, gel_embeddings, gel_fns = get_embedding(
        model, data, logdir, gpu, test=True)
    precision = get_score(img_embeddings, img_fns, gel_embeddings, gel_fns)
    return precision
    # NOTE: the early return above makes the remainder of this function unreachable.
    nb_img = len(img_embeddings)
    nb_gel = len(gel_embeddings)
    distance_matrix = np.zeros((nb_gel, nb_img))
    img_embeddings = np.array(img_embeddings)
    gel_embeddings = np.array(gel_embeddings)
    dim_embedding = img_embeddings.shape[-1]
    img_embeddings = img_embeddings.reshape((nb_img, dim_embedding))
    gel_embeddings = gel_embeddings.reshape((nb_gel, dim_embedding))
    scores = []
    for i in range(nb_gel):
        # Mean squared difference between this gel embedding and every image embedding.
        distance_matrix[i, :] = np.mean(
            np.square(img_embeddings - gel_embeddings[i, :]), axis=1).T
        # Binary relevance: 1 where the image and the gel share the same id.
        r = []
        for j in range(nb_img):
            if get_gel_id(img_fns[j]) == get_gel_id(gel_fns[i]):
                r.append(1)
            else:
                r.append(0)
        # Sort relevances by increasing distance and score the induced ranking.
        d = distance_matrix[i, :].tolist()
        a = zip(d, r)
        a = sorted(a, key=lambda pair: pair[0])
        r = [x[1] for x in a]
        ndcg = [rank_metrics.ndcg_at_k(r, k) for k in [10, 20, 30]]
        precision = [rank_metrics.precision_at_k(r, k) for k in [10, 20, 30]]
        scores.append(ndcg + precision)
    scores = np.array(scores)
    scores = np.mean(scores, axis=0)
    print "ndcg & precision", scores
    print >> out_f, "ndcg & precision", scores
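# Illustrative only (not from the original source): the sort-then-score step used
# in the unreachable block above, on made-up distances and relevance labels,
# assuming the same rank_metrics module is importable.
import rank_metrics

d = [0.90, 0.10, 0.55, 0.30]   # distances from one gel embedding to each image
r = [0, 1, 0, 1]               # 1 where the image shares the gel id
ranked_rel = [rel for _, rel in sorted(zip(d, r), key=lambda pair: pair[0])]
# ranked_rel == [1, 1, 0, 0]: the two matching images are the nearest ones.
ndcg_2 = rank_metrics.ndcg_at_k(ranked_rel, 2)        # 1.0 for this toy ranking
prec_2 = rank_metrics.precision_at_k(ranked_rel, 2)   # 1.0 for this toy ranking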
def evaluate_retrieval(query_dct, corpus_dct, inverted_index, method_type):
    '''
    Given a query dictionary and a corpus dictionary, go through each query and
    determine the NDCG for its retrieval with the disease labels as relevance
    measures.
    '''
    metric_dct = {}
    for query_key in query_dct:
        doc_score_dct = {}
        q_disease_list, q_symptom_list, q_herb_list = query_dct[query_key]
        for doc_key in corpus_dct:
            d_disease_list, d_symptom_list, d_herb_list = corpus_dct[doc_key]
            # With no query expansion, our document is just the set of symptoms.
            document = d_symptom_list[:]
            if 'mixed' in method_type or 'synonym' in method_type:
                document += d_herb_list
            # If expanded, q_symptom_list might also contain herbs.
            doc_score = okapi_bm25(q_symptom_list, document, inverted_index,
                                   len(corpus_dct))
            # Compute the relevance judgement.
            relevance = get_rel_score(q_disease_list, d_disease_list)
            doc_score_dct[(doc_key, relevance)] = doc_score
        sorted_scores = sorted(doc_score_dct.items(),
                               key=operator.itemgetter(1), reverse=True)
        # Get the relevance rankings.
        rel_list = [pair[0][1] for pair in sorted_scores]
        # Compute different rank metrics for different values of k.
        for k in k_list:
            if k not in metric_dct:
                metric_dct[k] = []
            if rank_metric == 'ndcg':
                metric_dct[k] += [ndcg_at_k(rel_list, k)]
            elif rank_metric == 'precision':
                metric_dct[k] += [precision_at_k(rel_list, k)]
    return metric_dct
def summarize(self):
    """Give summary statistics about the tournament."""
    res = self.run()
    # res = self.results
    # champ should be undefeated
    champ = list(np.where(res.strength == max(res.strength))[0])
    copeland = (res.wins[champ] == self.n_rounds)
    # top-k
    ranks = pd.DataFrame(data=np.transpose([
        res.strength.rank(ascending=False),
        res.wins.rank(ascending=False),
        res.wins
    ]), columns=["str_rank", "win_rank", "wins"])
    ranks['relevant'] = ranks['str_rank'] <= self.k
    borda = (ranks.win_rank[champ] == ranks.win_rank.min())
    top_k_df = ranks.loc[ranks['str_rank'] <= self.k]
    top_k = sum(top_k_df['wins'] >= self.n_rounds - 2) / self.k
    tau, k_p = scipy.stats.kendalltau(ranks.str_rank, ranks.win_rank)
    rho, sp_p = scipy.stats.spearmanr(ranks.str_rank, ranks.win_rank)
    # sort_values returns a new frame, so keep the result; otherwise the
    # relevance vector below is not actually in win-rank order.
    ranks = ranks.sort_values(by="win_rank")
    # using rank_metrics
    rel_vec = ranks.relevant.values
    prec = rank_metrics.r_precision(rel_vec)
    prec_at_k = rank_metrics.precision_at_k(rel_vec, self.k)
    avg_prec = rank_metrics.average_precision(rel_vec)
    dcg = rank_metrics.dcg_at_k(rel_vec, self.k)
    ndcg = rank_metrics.ndcg_at_k(rel_vec, self.k)
    df = pd.DataFrame(data=[[
        int(copeland), int(borda), float(top_k), prec, prec_at_k,
        avg_prec, dcg, ndcg, float(tau), float(rho)
    ]], columns=[
        'undef_champ', 'top_champ', 'top_k_found', 'precision',
        'precision_at_k', 'avg_prec', 'dcg', 'ndcg', 'tau', 'rho'
    ])
    return df
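# Illustrative only (not from the original source): what the rank_metrics calls
# in summarize() compute for a hypothetical win-rank ordering with self.k = 2.
# The values in the comments assume the widely used rank_metrics reference
# implementation with its default log2 discounting.
import rank_metrics

rel_vec = [1, 0, 0, 1, 0]                          # 'relevant' flags in win-rank order
r_prec = rank_metrics.r_precision(rel_vec)         # 0.5
p_at_2 = rank_metrics.precision_at_k(rel_vec, 2)   # 0.5
avg_p = rank_metrics.average_precision(rel_vec)    # (1/1 + 2/4) / 2 = 0.75
dcg_2 = rank_metrics.dcg_at_k(rel_vec, 2)          # 1.0
ndcg_2 = rank_metrics.ndcg_at_k(rel_vec, 2)        # 1.0 / 2.0 = 0.5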
def compute_metrics(ranked_judgements, pr_atk, threshold_grade):
    """
    Given the ranked judgements compute the metrics for a query.
    :param ranked_judgements: list(int); graded or binary relevances in rank order.
    :param pr_atk: int; the @K value to use for computing precision and recall.
    :param threshold_grade: int; Assuming 0-3 graded relevances, threshold at some
        point and convert graded to binary relevance.
    :return:
    """
    graded_judgements = ranked_judgements
    ranked_judgements = [1 if rel >= threshold_grade else 0
                         for rel in graded_judgements]
    # Use the full set of candidates, not the pr_atk.
    ndcg = rm.ndcg_at_k(graded_judgements, len(ranked_judgements))
    ndcg_pr = rm.ndcg_at_k(graded_judgements, int(0.20 * len(ranked_judgements)))
    ndcg_20 = rm.ndcg_at_k(graded_judgements, 20)
    max_total_relevant = sum(ranked_judgements)
    recall = recall_at_k(ranked_rel=ranked_judgements, atk=pr_atk,
                         max_total_relevant=max_total_relevant)
    precision = rm.precision_at_k(r=ranked_judgements, k=pr_atk)
    r_precision = rm.r_precision(r=ranked_judgements)
    f1 = (2 * precision * recall / (precision + recall)
          if (precision + recall) > 0 else 0.0)
    av_precision = rm.average_precision(r=ranked_judgements)
    reciprocal_rank = rm.mean_reciprocal_rank(rs=[ranked_judgements])
    metrics = {
        'recall': float(recall),
        'precision': float(precision),
        'f1': float(f1),
        'r_precision': float(r_precision),
        'av_precision': float(av_precision),
        'reciprocal_rank': float(reciprocal_rank),
        'ndcg': ndcg,
        'ndcg@20': ndcg_20,
        'ndcg%20': ndcg_pr
    }
    return metrics
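# Illustrative only (not from the original source): a hypothetical call to
# compute_metrics, assuming `rm` is the rank_metrics module and recall_at_k is
# the local helper referenced above.
ranked_judgements = [3, 0, 2, 1, 0, 3]   # 0-3 graded relevances in rank order
metrics = compute_metrics(ranked_judgements, pr_atk=5, threshold_grade=2)
# NDCG is computed on the graded values; precision, recall, F1, MRR and AP use
# the thresholded binary list [1, 0, 1, 0, 0, 1].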
# In[36]:

import numpy as np
import rank_metrics
import sys

relevanceVector = np.loadtxt(
    open(sys.argv[1] + "/rv/relevanceVector_" + sys.argv[2]), delimiter=" ")
f = open(sys.argv[1] + '/em/evalMetrics_' + sys.argv[2], 'w')
for k in range(1, 16):
    total_precision_k = 0
    total_dcg_k = 0
    total_ndcg_k = 0
    for row in relevanceVector:
        precision_k = rank_metrics.precision_at_k(row, k)
        dcg_k = rank_metrics.dcg_at_k(row, k, 0)
        ndcg_k = rank_metrics.ndcg_at_k(row, k, 0)
        total_precision_k = total_precision_k + precision_k
        total_dcg_k = total_dcg_k + dcg_k
        total_ndcg_k = total_ndcg_k + ndcg_k
    f.write("precision@" + str(k) + ": " + str(total_precision_k) + "\n")
    f.write("dcg@" + str(k) + ": " + str(total_dcg_k) + "\n")
    f.write("ndcg@" + str(k) + ": " + str(total_ndcg_k) + "\n")

mrr = rank_metrics.mean_reciprocal_rank(relevanceVector)
f.write("Mean Reciprocal Rank: " + str(mrr) + "\n")
maP = rank_metrics.mean_average_precision(relevanceVector)
f.write("Mean Average Precision: " + str(maP) + "\n")
f.close()
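# Illustrative only (not from the original source): a tiny in-memory stand-in for
# the relevanceVector file the script above loads -- one whitespace-separated row
# of relevances per query -- using the numpy and rank_metrics imports above.
toy_relevance = np.array([[1, 0, 0, 1, 0],
                          [0, 1, 0, 0, 0]])
mrr_toy = rank_metrics.mean_reciprocal_rank(toy_relevance)    # (1/1 + 1/2) / 2 = 0.75
map_toy = rank_metrics.mean_average_precision(toy_relevance)  # mean of the per-row APs
p3_total = sum(rank_metrics.precision_at_k(row, 3)
               for row in toy_relevance)                      # summed over rows, as above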
def evaluate_retrieval(query_dct, corpus_dct):
    '''
    Given a query dictionary and a corpus dictionary, go through each query and
    determine the NDCG for its retrieval with the disease labels as relevance
    measures.
    '''
    # Map each symptom and herb to the number of patient visits it appears in.
    inverted_index, avg_doc_len = get_inverted_index(corpus_dct)
    corpus_size = len(corpus_dct)
    metric_dct = {}
    for query_key in query_dct:
        doc_score_dct = {}
        # Ignore the query herb set. q_disease is label, q_symptom is query.
        q_disease_set, q_symptom_set, q_herb_set = query_dct[query_key]
        for doc_key in corpus_dct:
            d_disease_set, d_symptom_set, d_herb_set = corpus_dct[doc_key]
            # With no query expansion, our document is just the set of symptoms.
            document = d_symptom_set
            # If synonym or herbs/mixed expansions, add herb list into document.
            if args.method == 'synonym' or args.term_type in ['herbs', 'mixed']:
                document = document.union(d_herb_set)
            # Get the score between the query and the document.
            doc_score = okapi_bm25(q_symptom_set, document, inverted_index,
                                   corpus_size, avg_doc_len)
            # Compute the relevance judgement.
            relevance = get_rel_score(q_disease_set, d_disease_set)
            doc_score_dct[(doc_key, relevance)] = doc_score
        sorted_scores = sorted(doc_score_dct.items(),
                               key=operator.itemgetter(1), reverse=True)
        # Get the relevance rankings.
        rel_list = [pair[0][1] for pair in sorted_scores]
        # Compute different rank metrics for different values of k.
        for k in k_list:
            if k not in metric_dct:
                metric_dct[k] = []
            if args.rank_metric == 'ndcg':
                metric_dct[k] += [ndcg_at_k(rel_list, k)]
            elif args.rank_metric == 'precision':
                # metric_dct[k] += [precision_at_k(rel_list, k)]
                metric_dct[k] += [sum(rel_list[:k]) / float(k)]
            elif args.rank_metric == 'recall':
                metric_dct[k] += [sum(rel_list[:k]) / float(sum(rel_list))]
            elif args.rank_metric == 'f1':
                precision = sum(rel_list[:k]) / float(k)
                recall = sum(rel_list[:k]) / float(sum(rel_list))
                if precision == 0:
                    metric_dct[k] += [0]
                else:
                    metric_dct[k] += [2 * precision * recall /
                                      (precision + recall)]
            elif args.rank_metric == 'map':
                r = np.asarray(rel_list[:k]) != 0
                out = [precision_at_k(r, i + 1) for i in range(r.size) if r[i]]
                if not out:
                    metric_dct[k] += [0.0]
                else:
                    metric_dct[k] += [sum(out) / sum(rel_list)]
    return metric_dct
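# Illustrative only (not from the original source): the inlined precision /
# recall / F1 arithmetic from the branches above, on a made-up relevance ranking.
rel_list = [1, 0, 1, 0, 0, 1]
k = 3
precision = sum(rel_list[:k]) / float(k)             # 2/3
recall = sum(rel_list[:k]) / float(sum(rel_list))    # 2/3
f1 = 2 * precision * recall / (precision + recall)   # 2/3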
def rew6(inp, ap, reciprocal_rank, ndcg, dcg_five):
    # Average AP with P@{3,5} and NDCG@{3,5}; the reciprocal_rank, ndcg and
    # dcg_five arguments are unused in this variant.
    return (ap + precision_at_k(inp, 3) + precision_at_k(inp, 5) +
            ndcg_at_k(inp, 3) + ndcg_at_k(inp, 5)) / 5