def test_run_rbo():
    with open('./example/rpd_b.txt') as _base_file, open(
            './example/rpd_a.txt') as _adv_file:
        _base_run = pytrec_eval.parse_run(_base_file)
        _adv_run = pytrec_eval.parse_run(_adv_file)

    _rbo = rpd_eval.rbo(run_b_rep=_base_run, run_a_rep=_adv_run)
    assert 'baseline' in _rbo.keys()
    assert rbo_base == _rbo.get('baseline')
    assert 'advanced' in _rbo.keys()
    assert rbo_adv == _rbo.get('advanced')
def test_run_ktu():
    with open('./example/rpd_b.txt') as _base_file, open(
            './example/rpd_a.txt') as _adv_file:
        _base_run = pytrec_eval.parse_run(_base_file)
        _adv_run = pytrec_eval.parse_run(_adv_file)

    _ktu = rpd_eval.ktau_union(run_b_rep=_base_run, run_a_rep=_adv_run)
    assert 'baseline' in _ktu.keys()
    assert ktu_base == _ktu.get('baseline')
    assert 'advanced' in _ktu.keys()
    assert ktu_adv == _ktu.get('advanced')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('qrel')
    parser.add_argument('run')
    parser.add_argument('measure')

    args = parser.parse_args()

    assert os.path.exists(args.qrel)
    assert os.path.exists(args.run)

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(args.run, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, {args.measure})

    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        # scope == query_id == topic_id
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    # collect the per-query values of the requested measure
    per_query_values = []
    for query_id, query_measures in results.items():
        for measure, value in sorted(query_measures.items()):
            per_query_values.append(value)
            print_line(measure, query_id, value)

    print('avg of {} {:f}'.format(args.measure, mean(per_query_values)))
def get_metric(self,
               qrels: str,
               trec: str,
               metric: str = 'ndcg_cut_10',
               split: dict = None,
               split_idx: int = -1) -> float:
    with open(qrels, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(trec, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    # partial evaluation: keep only the query ids that belong to this split
    if split is not None and split_idx >= 0:
        for qid in list(run):
            if qid not in split[split_idx]:
                run.pop(qid)

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)

    # every per-query dict carries the same measure names, so sampling one
    # is enough to enumerate them
    query_measures = next(iter(results.values()))
    mes = {}
    for measure in sorted(query_measures.keys()):
        mes[measure] = pytrec_eval.compute_aggregated_measure(
            measure, [qm[measure] for qm in results.values()])
    return mes[metric]
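# A minimal usage sketch for the split-based partial evaluation above; the
# split dict, query ids, file paths and the enclosing `scorer` object are
# hypothetical, not taken from any snippet in this collection.
split = {0: {'q1', 'q2'}, 1: {'q3', 'q4'}}
# with split_idx=1, only q3 and q4 are kept in the run before evaluation
score = scorer.get_metric('data/test.qrel', 'runs/model.trec',
                          metric='ndcg_cut_10', split=split, split_idx=1)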
def read_ranking(bow_model_path):
    """Return BoW ranking as dict of dicts {qid: {doc_id: score, ...}, ...}"""
    print('read BoW ranking')
    with open(bow_model_path, 'r') as f:
        # pytrec_eval loads the ranking as a dict of dicts
        run = pytrec_eval.parse_run(f)
    return run
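# For reference, parse_run expects the standard six-column TREC run format
# (qid, 'Q0', docid, rank, score, tag) and returns the nested dict described
# in the docstring above; a self-contained sketch with made-up ids and scores:
import io

import pytrec_eval

run_file = io.StringIO('q1 Q0 doc_a 1 2.5 bow\n'
                       'q1 Q0 doc_b 2 1.7 bow\n')
run = pytrec_eval.parse_run(run_file)
# run == {'q1': {'doc_a': 2.5, 'doc_b': 1.7}}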
def qrel_metrics(qrel_file, run_file, metrics=('ndcg', 'map')):
    """Get metrics (ndcg and map by default) for a run compared to a qrel file.

    Arguments:
        qrel_file -- qrel file with ground truth data
        run_file -- predictions from the run
        metrics -- which metrics to evaluate on; any valid measure
                   accepted by the trec_eval tool can be used

    Returns:
        metric_values -- dictionary of metric values (out of 100),
                         rounded to two decimal places
    """
    with open(qrel_file, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(run_file, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, set(metrics))
    results = evaluator.evaluate(run)

    metric_values = {}
    for measure in sorted(metrics):
        res = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])
        metric_values[measure] = np.round(100 * res, 2)
    return metric_values
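# Hypothetical call of qrel_metrics; both paths are placeholders. parse_qrel
# expects four-column qrel lines (qid, iteration, docid, relevance).
scores = qrel_metrics('data/test.qrel', 'runs/bm25.run', metrics=('ndcg', 'map'))
# scores maps each measure name to a percentage rounded to two decimals,
# e.g. {'map': ..., 'ndcg': ...}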
def cal_ndcg(qrels, trec, k):
    with open(qrels, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(trec, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)

    # every per-query dict carries the same measure names, so sampling one
    # is enough to enumerate them
    query_measures = next(iter(results.values()))
    mes = {}
    for measure in sorted(query_measures.keys()):
        mes[measure] = pytrec_eval.compute_aggregated_measure(
            measure, [qm[measure] for qm in results.values()])

    metric = 'ndcg_cut_%d' % k
    if metric not in mes:
        print('Depth of NDCG not available.')
        exit()

    return mes[metric]
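# Hypothetical usage of cal_ndcg; the paths are placeholders. k has to be one
# of trec_eval's default ndcg_cut depths (5, 10, 15, 20, 30, 100, 200, 500,
# 1000), otherwise the guard above prints a warning and exits.
ndcg_at_10 = cal_ndcg('data/test.qrel', 'runs/bm25.trec', 10)
print('ndcg_cut_10 = %.4f' % ndcg_at_10)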
def evaluate(eval_path, qrel_path, res_path):
    measures = {"map", "ndcg_cut", "recall", "P"}

    with open(qrel_path, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

    with open(res_path, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)
    all_metrics = evaluator.evaluate(run)

    metrics = {'P_5': 0,
               'P_10': 0,
               'P_20': 0,
               'ndcg_cut_5': 0,
               'ndcg_cut_10': 0,
               'ndcg_cut_20': 0,
               'ndcg_cut_100': 0,
               'map': 0,
               'recall_100': 0}

    # average each metric over all evaluated queries
    nb_queries = len(all_metrics)
    for key, values in all_metrics.items():
        for metric in metrics:
            metrics[metric] += values[metric] / nb_queries

    with open(eval_path, 'w') as f:
        json.dump(metrics, f)
def main():
    rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                            run_b_orig_path=ORIG_B,
                            run_a_orig_path=ORIG_A,
                            run_b_rep_path=None,
                            run_a_rep_path=None)

    rpd_eval.trim()
    rpd_eval.evaluate()

    for run_name, info in runs_rpd.items():
        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            trim(info['run'])
            info['scores'] = rpd_eval.evaluate(info['run'])
            info['rmse'] = rpd_eval.rmse(run_b_score=info['scores'])

    baseline_runs = ['rpd_wcr04_tf_1', 'rpd_wcr04_tf_2', 'rpd_wcr04_tf_3',
                     'rpd_wcr04_tf_4', 'rpd_wcr04_tf_5']
    advanced_runs = ['rpd_wcr0405_tf_1', 'rpd_wcr0405_tf_2',
                     'rpd_wcr0405_tf_3', 'rpd_wcr0405_tf_4',
                     'rpd_wcr0405_tf_5']
    cutoffs = ['5', '10', '15', '20', '30', '100', '200', '500', '1000']

    df_content = {}
    for run_name in baseline_runs:
        df_content[run_name] = [
            runs_rpd[run_name]['rmse']['baseline']['ndcg_cut_' + co]
            for co in cutoffs
        ]

    df = pd.DataFrame(df_content, index=cutoffs)
    ax = df.plot.line(style='o-')
    ax.set_xlabel('Cut-off values')
    ax.set_ylabel('RMSE')
    ax.get_figure().savefig('data/plots/rpd_b_rmse.pdf',
                            format='pdf',
                            bbox_inches='tight')
    plt.show()

    df_content = {}
    for run_name in advanced_runs:
        df_content[run_name] = [
            runs_rpd[run_name]['rmse']['baseline']['ndcg_cut_' + co]
            for co in cutoffs
        ]

    df = pd.DataFrame(df_content, index=cutoffs)
    ax = df.plot.line(style='o-')
    ax.set_xlabel('Cut-off values')
    ax.set_ylabel('RMSE')
    ax.get_figure().savefig('data/plots/rpd_a_rmse.pdf',
                            format='pdf',
                            bbox_inches='tight')
    plt.show()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('qrel')
    parser.add_argument('run', nargs=2)
    # A bit too strict, as it does not allow for parametrized measures,
    # but sufficient for the example.
    parser.add_argument(
        '--measure',
        # choices=pytrec_eval.supported_measures,
        required=True)

    args = parser.parse_args()

    assert os.path.exists(args.qrel)
    assert all(map(os.path.exists, args.run))

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(args.run[0], 'r') as f_run:
        first_run = pytrec_eval.parse_run(f_run)

    with open(args.run[1], 'r') as f_run:
        second_run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, {args.measure})

    first_results = evaluator.evaluate(first_run)
    second_results = evaluator.evaluate(second_run)

    # restrict the paired test to queries scored in both runs
    query_ids = list(set(first_results.keys()) & set(second_results.keys()))

    first_scores = [
        first_results[query_id][args.measure] for query_id in query_ids
    ]
    second_scores = [
        second_results[query_id][args.measure] for query_id in query_ids
    ]

    print(scipy.stats.ttest_rel(first_scores, second_scores))
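# The final line above prints the raw (statistic, pvalue) pair; a hedged
# variant of that tail which makes the decision explicit (alpha = 0.05 is an
# assumption, not something the original snippet fixes):
statistic, pvalue = scipy.stats.ttest_rel(first_scores, second_scores)
if pvalue < 0.05:
    print('paired difference in {} is significant'.format(args.measure))
else:
    print('no significant paired difference in {}'.format(args.measure))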
def pytrec_evaluation(runfile, qrelfile, measures=pytrec_eval.supported_measures):
    """Run trec_eval with `measures` from the Python interface."""
    with open(runfile, "r") as f_run:
        run = pytrec_eval.parse_run(f_run)
    with open(qrelfile, "r") as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)
    return evaluator.evaluate(run)
def main():
    rpl_eval = RplEvaluator(qrel_orig_path=QREL,
                            run_b_orig_path=ORIG_B,
                            run_a_orig_path=ORIG_A,
                            run_b_rep_path=None,
                            run_a_rep_path=None,
                            qrel_rpd_path=QREL_RPL)

    rpl_eval.trim()
    rpl_eval.evaluate()

    for run_name, info in runs_rpl.items():
        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            trim(info['run'])
            info['scores'] = rpl_eval.evaluate(info['run'])

    pairs = [('rpl_wcr04_tf_1', 'rpl_wcr0405_tf_1'),
             ('rpl_wcr04_tf_2', 'rpl_wcr0405_tf_2'),
             ('rpl_wcr04_tf_3', 'rpl_wcr0405_tf_3'),
             ('rpl_wcr04_tf_4', 'rpl_wcr0405_tf_4'),
             ('rpl_wcr04_tf_5', 'rpl_wcr0405_tf_5')]

    # compute the Effect Ratio once per pair, then index the measures
    er = [rpl_eval.er(run_b_score=runs_rpl[b]['scores'],
                      run_a_score=runs_rpl[a]['scores'])
          for b, a in pairs]

    df_content = {
        'P_10': [e['P_10'] for e in er],
        'ndcg': [e['ndcg'] for e in er],
        'map': [e['map'] for e in er],
    }

    df = pd.DataFrame(df_content,
                      index=['tf_1', 'tf_2', 'tf_3', 'tf_4', 'tf_5'])

    orig_val = 1
    ax = df.plot.bar(rot=0)
    ax.hlines(orig_val, -.5, 5.5, linestyles='dashed', color='black')
    ax.annotate(' ', (3, orig_val), color='black')
    ax.set_xlabel("Replicated Run")
    ax.set_ylabel("Effect Ratio (ER)")
    ax.get_figure().savefig('data/plots/rpl_er.pdf',
                            format='pdf',
                            bbox_inches='tight')
    plt.show()
def compute_metrics(coll_path, Collection, queries_index, qrel, results,
                    model_name, save_res=False):
    """Save the retrieval results (the top_k documents by score for the model
    identified by model_name), then compute different IR metrics using the
    pytrec_eval package."""
    # HR
    Collection.save_results(queries_index, results, model_name, top_k=1000)
    with open(model_name, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)
    if not save_res:
        os.remove(model_name)

    # all four measure families are needed below; restricting the set to
    # {"ndcg_cut"} alone would raise a KeyError for P_*, map and recall_1000
    measures = {"map", "ndcg_cut", "recall", "P"}
    evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)
    all_metrics = evaluator.evaluate(run)

    metrics = {'P_5': 0,
               'P_10': 0,
               'P_20': 0,
               'ndcg_cut_5': 0,
               'ndcg_cut_10': 0,
               'ndcg_cut_20': 0,
               'ndcg_cut_1000': 0,
               'map': 0,
               'recall_1000': 0}

    # average each metric over all evaluated queries
    nb_queries = len(all_metrics)
    for key, values in all_metrics.items():
        for metric in metrics:
            metrics[metric] += values[metric] / nb_queries
    return metrics
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('qrel')
    parser.add_argument('run')

    args = parser.parse_args()

    assert os.path.exists(args.qrel)
    assert os.path.exists(args.run)

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(args.run, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, pytrec_eval.supported_measures)

    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    for query_id, query_measures in sorted(results.items()):
        for measure, value in sorted(query_measures.items()):
            print_line(measure, query_id, value)

    # Scope hack: use query_measures of last item in previous loop to
    # figure out all unique measure names.
    #
    # TODO(cvangysel): add member to RelevanceEvaluator
    #                  with a list of measure names.
    for measure in sorted(query_measures.keys()):
        print_line(
            measure,
            'all',
            pytrec_eval.compute_aggregated_measure(
                measure,
                [query_measures[measure]
                 for query_measures in results.values()]))
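# The "scope hack" flagged in the TODO above can be avoided; a minimal
# alternative (the helper name aggregate_measures is mine, not part of
# pytrec_eval):
import pytrec_eval

def aggregate_measures(results):
    """Aggregate each measure over all queries; every per-query dict carries
    the same measure names, so sampling any one of them is enough."""
    measure_names = sorted(next(iter(results.values())).keys())
    return {
        measure: pytrec_eval.compute_aggregated_measure(
            measure, [qm[measure] for qm in results.values()])
        for measure in measure_names
    }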
def main():
    rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                            run_b_orig_path=ORIG_B,
                            run_a_orig_path=ORIG_A,
                            run_b_rep_path=None,
                            run_a_rep_path=None)

    rpd_eval.trim()
    rpd_eval.evaluate()

    for run_name, info in runs_rpd.items():
        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            trim(info['run'])
            info['scores'] = rpd_eval.evaluate(info['run'])

    average_retrieval_performance(
        rpd_eval.run_b_orig_score,
        {
            'tf_1': runs_rpd.get('rpd_wcr04_tf_1').get('scores'),
            'tf_2': runs_rpd.get('rpd_wcr04_tf_2').get('scores'),
            'tf_3': runs_rpd.get('rpd_wcr04_tf_3').get('scores'),
            'tf_4': runs_rpd.get('rpd_wcr04_tf_4').get('scores'),
            'tf_5': runs_rpd.get('rpd_wcr04_tf_5').get('scores'),
        },
        measures=['P_10', 'ndcg', 'bpref', 'map'],
        xlabel='Reproduced run (wcr04)',
        ylabel='Score',
        outfile='data/plots/rpd_b_arp.pdf')

    average_retrieval_performance(
        rpd_eval.run_a_orig_score,
        {
            'tf_1': runs_rpd.get('rpd_wcr0405_tf_1').get('scores'),
            'tf_2': runs_rpd.get('rpd_wcr0405_tf_2').get('scores'),
            'tf_3': runs_rpd.get('rpd_wcr0405_tf_3').get('scores'),
            'tf_4': runs_rpd.get('rpd_wcr0405_tf_4').get('scores'),
            'tf_5': runs_rpd.get('rpd_wcr0405_tf_5').get('scores'),
        },
        measures=['P_10', 'ndcg', 'bpref', 'map'],
        xlabel='Reproduced run (wcr0405)',
        ylabel='Score',
        outfile='data/plots/rpd_a_arp.pdf')
def get_metric(self, qrels: str, trec: str,
               metric: str = 'ndcg_cut_10') -> float:
    with open(qrels, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    with open(trec, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)

    # every per-query dict carries the same measure names, so sampling one
    # is enough to enumerate them
    query_measures = next(iter(results.values()))
    mes = {}
    for measure in sorted(query_measures.keys()):
        mes[measure] = pytrec_eval.compute_aggregated_measure(
            measure, [qm[measure] for qm in results.values()])
    return mes[metric]
def __test(self):
    with open(os.path.join(TREC_EVAL_TEST_DIR, ground_truth_filename)) as \
            f_trec_eval:
        trec_eval_output = parse_trec_eval(f_trec_eval)

    # map every measure reported by trec_eval back to a supported measure;
    # parametrized names (e.g. ndcg_cut_10) are resolved by prefix matching
    measures = set(
        measure if measure in pytrec_eval.supported_measures
        else prefix_match(measure, pytrec_eval.supported_measures)
        for measure in trec_eval_output['all'].keys())

    with open(os.path.join(TREC_EVAL_TEST_DIR, qrel_filename)) as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(os.path.join(TREC_EVAL_TEST_DIR, run_filename)) as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures, **kwargs)
    results = evaluator.evaluate(run)

    expected_measures = trec_eval_output['all']

    for measure in expected_measures:
        agg_measure_value = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measure_values[measure]
             for query_measure_values in results.values()])

        ground_truth_agg_measure_value = trec_eval_output['all'][measure]

        self.assertAlmostEqual(agg_measure_value,
                               ground_truth_agg_measure_value,
                               places=3,
                               msg=measure)
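# prefix_match is defined elsewhere in the test module; a rough sketch of the
# idea (matching a parametrized name such as 'ndcg_cut_10' to its longest
# supported prefix), not the project's actual helper:
def prefix_match(measure, supported_measures):
    candidates = [s for s in supported_measures if measure.startswith(s)]
    return max(candidates, key=len)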
def main():
    os.chdir(os.path.dirname(os.path.realpath('__file__')))

    # set folders
    corpus_folder = 'corpus/' + FLAGS.corpus_name + '/' + FLAGS.corpus_name
    index_folder = 'corpus/' + FLAGS.corpus_name + '/index'
    # model_folder = 'corpus/' + FLAGS.corpus_name + '/models/' + FLAGS.model_name
    data_folder = 'corpus/' + FLAGS.corpus_name + '/data'
    query_folder = 'corpus/' + FLAGS.corpus_name + '/queries'
    qrels_folder = 'corpus/' + FLAGS.corpus_name + '/qrels'
    rankings_folder = 'corpus/' + FLAGS.corpus_name + '/rankings/' + FLAGS.model_name

    # create folders
    if not os.path.exists(rankings_folder):
        os.makedirs(rankings_folder)
    # if not os.path.exists(model_folder):
    #     os.makedirs(model_folder)
    if not os.path.exists(query_folder) or not os.path.exists(qrels_folder):
        print('folders containing queries and qrels are required - please add them')
        return False

    # set random seed - enable reproducibility
    np.random.seed(FLAGS.seed)

    # establish connection with UMLS db
    umls_lookup = umls.UMLSLookup()

    # load required data
    print('load processed data required to retrofit word vectors and perform retrieval tasks')
    with open(data_folder + '/docs.json', 'r') as df:
        corpus = json.load(df)
    with open(data_folder + '/idfs.json', 'r') as wf:
        idfs = json.load(wf)
    with open(data_folder + '/cfs.json', 'r') as cff:
        cfs = json.load(cff)
    with open(data_folder + '/word_dict.json', 'r') as wdf:
        word_dict = json.load(wdf)

    # compute reverse word dict
    reverse_word_dict = dict(zip(word_dict.values(), word_dict.keys()))

    # store docnos and docs as separate lists
    docnos = list(corpus.keys())
    docs = list(corpus.values())
    del corpus  # free memory space

    # pre-process relational data
    if not os.path.exists(data_folder + '/term2cui.json'):
        # map terms to cuis using QuickUMLS
        term2cui = tf_utils.get_term2cui(word_dict,
                                         data_folder,
                                         threshold=FLAGS.threshold,
                                         stypes_fname=FLAGS.stypes_fname)
    else:
        # load (term, cui) pairs
        print('load (term, cui) pairs')
        with open(data_folder + '/term2cui.json', 'r') as tcf:
            term2cui = json.load(tcf)

    """
    SEMANTIC PROCESSING
    """
    # load semantic model
    print('load semantic model')
    with tf.Session() as sess:
        # restore model and get required tensors
        saver = tf.train.import_meta_graph(FLAGS.semantic_model + '.ckpt.meta')
        saver.restore(sess, FLAGS.semantic_model + '.ckpt')
        word_embs = sess.run(tf.get_default_graph().get_tensor_by_name(
            'embeddings/word_embs:0'))

    """
    RETROFITTING
    """
    if FLAGS.retrofit:
        # get synonyms for each word within vocabulary
        print('get synonyms')
        syns = tf_utils.get_syns(term2cui, word_dict, umls_lookup)
        if FLAGS.syn_weights:
            # convert collection frequencies from list to dict
            cfs = dict(cfs)
        else:
            cfs = None
        # retrofit word vectors
        print('retrofit word vectors for {} iterations'.format(FLAGS.iterations))
        word_embs = retrofit(word_embs,
                             syns,
                             reverse_word_dict,
                             FLAGS.iterations,
                             alpha=1.0,
                             beta=FLAGS.beta,
                             cfs=cfs)

    # compute doc embeddings
    print('compute document vectors w/ retrofitted word vectors')
    doc_embs, filt_ids = tf_utils.compute_doc_embs(docs, word_dict, word_embs, idfs)

    if not FLAGS.reranking:
        """
        RETRIEVAL
        """
        print('perform retrieval over the entire collection')
        # load queries
        q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
        # set query embs and ids
        q_embs = []
        q_ids = []
        # loop over queries and generate rankings
        for qid, qtext in q.items():
            # prepare queries for semantic matching
            q_proj = tf_utils.prepare_query(qtext[FLAGS.qfield], word_dict, word_embs)
            if q_proj is None:
                print('query {} does not contain known terms'.format(qid))
            else:
                q_embs.append(q_proj)
                q_ids.append(qid)
        q_embs = np.array(q_embs)
        # perform search and evaluate model effectiveness
        tf_utils.semantic_search(docnos, doc_embs, q_ids, q_embs,
                                 rankings_folder, FLAGS.model_name)
        scores = tf_utils.evaluate(['Rprec', 'P_5', 'P_10', 'P_20', 'ndcg', 'map'],
                                   rankings_folder, FLAGS.model_name,
                                   qrels_folder, FLAGS.qrels_fname)
    else:
        """
        RE-RANKING
        """
        print('perform re-ranking over top 1000 documents from a baseline run')
        # parse and store qrels
        with open(qrels_folder + '/' + FLAGS.qrels_fname + '.txt', 'r') as qrelf:
            qrels = pytrec_eval.parse_qrel(qrelf)
        # initialize evaluator over qrels
        evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'P'})  # evaluate on Precision
        # parse input run
        print('parse input run')
        with open(FLAGS.run_path, 'r') as runf:
            run = pytrec_eval.parse_run(runf)
        # load queries
        q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
        # get query ids
        qids = list(q.keys())
        # shuffle query ids
        np.random.shuffle(qids)

        if FLAGS.fixed_gamma:
            # perform re-ranking based on a fixed value of gamma
            print('perform re-ranking w/ gamma=%.2f' % (FLAGS.fixed_gamma))
            # initialize combined (output) run
            crun = trec_utils.OnlineTRECRun(FLAGS.model_name + '_gamma_' +
                                            str(FLAGS.fixed_gamma))
            # combine rankings using fixed gamma
            comb_run = tf_utils.compute_combined_run(
                run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
                SCORE_NORMALIZERS[FLAGS.normalizer], FLAGS.fixed_gamma)
            # store test ranking in combined run
            for qid, doc_ids_and_scores in comb_run.items():
                crun.add_ranking(qid, [(score, docno) for docno, score
                                       in doc_ids_and_scores.items()])
            # close and store run
            crun.close_and_write(out_path=rankings_folder + '/' +
                                 FLAGS.model_name + '_gamma_' +
                                 str(FLAGS.fixed_gamma) + '.txt',
                                 overwrite=True)
            print('combined run stored in {}'.format(rankings_folder))
            # evaluate combined run
            print('evaluate run combined w/ gamma=%.2f' % (FLAGS.fixed_gamma))
            tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                              FLAGS.model_name + '_gamma_' + str(FLAGS.fixed_gamma),
                              qrels_folder, FLAGS.qrels_fname)
        else:
            # learn optimal weight to combine runs
            print("learn optimal weight to combine runs with sweep: {}".format(FLAGS.sweep))
            # set variable to store scores and weights
            scores_and_weights = []
            # initialize kfold with FLAGS.num_folds
            kfold = sklearn.model_selection.KFold(n_splits=FLAGS.num_folds)
            for fold, (train_qids, test_qids) in enumerate(kfold.split(qids)):
                print('fold n. {}'.format(fold))
                # restrict queries to train_qids and test_qids
                qtrain = {qids[ix]: q[qids[ix]] for ix in train_qids}
                qtest = {qids[ix]: q[qids[ix]] for ix in test_qids}
                # obtain best combination on training queries
                train_score, best_train_weight = max(
                    tf_utils.perform_reranking(run, FLAGS.qfield, qtrain,
                                               docnos, doc_embs, word_dict,
                                               word_embs, FLAGS.sweep,
                                               SCORE_NORMALIZERS[FLAGS.normalizer],
                                               FLAGS.ref_measure, evaluator))
                print('fold %d: best_train_weight=%.2f, %s =%.4f' %
                      (fold, best_train_weight, FLAGS.ref_measure, train_score))
                # compute combined run with best combination on test queries
                test_crun = tf_utils.compute_combined_run(
                    run, FLAGS.qfield, qtest, docnos, doc_embs, word_dict,
                    word_embs, SCORE_NORMALIZERS[FLAGS.normalizer],
                    best_train_weight)
                # evaluate test run
                test_res = evaluator.evaluate(test_crun)
                # compute aggregated measure score for test queries
                test_score = pytrec_eval.compute_aggregated_measure(
                    FLAGS.ref_measure,
                    [qscore[FLAGS.ref_measure] for qscore in test_res.values()])
                # store averaged scores w/ best weights
                scores_and_weights.append(
                    (np.mean([train_score, test_score]), best_train_weight))
            # get (best) weight that produces the highest averaged score
            best_score, best_weight = max(scores_and_weights)
            print('found best weight=%.2f' % (best_weight))
            # initialize combined (output) run
            # note: use the cross-validated best_weight, not a FLAGS value
            crun = trec_utils.OnlineTRECRun(FLAGS.model_name + '_best_weight_' +
                                            str(best_weight))
            # compute combined run based on best weight
            comb_run = tf_utils.compute_combined_run(
                run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
                SCORE_NORMALIZERS[FLAGS.normalizer], best_weight)
            # store ranking in crun
            for qid, doc_ids_and_scores in comb_run.items():
                crun.add_ranking(qid, [(score, doc_id) for doc_id, score
                                       in doc_ids_and_scores.items()])
            # close and store run
            crun.close_and_write(out_path=rankings_folder + '/' +
                                 FLAGS.model_name + '_best_weight_' +
                                 str(best_weight) + '.txt',
                                 overwrite=True)
            print('combined run stored in {}'.format(rankings_folder))
            # evaluate combined run
            print('evaluate run combined w/ {}-fold cross validation and best weight={}'
                  .format(FLAGS.num_folds, best_weight))
            tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                              FLAGS.model_name + '_best_weight_' + str(best_weight),
                              qrels_folder, FLAGS.qrels_fname)
with open(qrel_path, 'r') as f_qrel:
    qrel = pytrec_eval.parse_qrel(f_qrel)

evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'ndcg_cut'})

# parse redirect triples, skipping comment lines
redirects = {}
with open(redirects_path) as f:
    for line in f:
        if not line.startswith('#'):
            subj, pred, obj = line.split(maxsplit=2)
            obj = obj[:obj.rfind('.')].strip()
            redirects[subj] = obj

with open(ir_run_path, "r") as ir_run_file:
    ir_run = pytrec_eval.parse_run(ir_run_file)

model = Word2Vec.load(args.model)

# entity vectors concatenate the output and input embeddings, hence 2x size
entityv = KeyedVectors(model.vector_size * 2)
entityv_entities = []
entityv_weights = []

wordv = KeyedVectors(model.vector_size * 2)
wordv_entities = []
wordv_weights = []

for entity, vocab in model.wv.vocab.items():
    if entity.startswith('<'):
        entityv_entities.append(entity)
        entityv_weights.append(
            np.concatenate((model.syn1neg[vocab.index],
                            model.wv.syn0[vocab.index])))
def main():
    cutoffs = [1000, 100, 50, 20, 10, 5]

    # BASELINE
    for run_name, info in zip(list(runs_rpd.keys())[::2],
                              list(runs_rpd.values())[::2]):
        rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                                run_b_orig_path=ORIG_B,
                                run_a_orig_path=ORIG_A,
                                run_b_rep_path=None,
                                run_a_rep_path=None)
        rpd_eval.trim()
        rpd_eval.evaluate()

        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)

        for cutoff in cutoffs:
            rpd_eval.trim(cutoff)
            rpd_eval.trim(cutoff, info['run'])
            info['ktu_' + str(cutoff)] = arp(
                rpd_eval.ktau_union(info['run'])['baseline'])

    df_content = {}
    for run_name, info in zip(list(runs_rpd.keys())[::2],
                              list(runs_rpd.values())[::2]):
        df_content[run_name] = [
            info.get('ktu_' + str(cutoff)) for cutoff in cutoffs[::-1]
        ]

    ax = pd.DataFrame(data=df_content,
                      index=[str(cutoff)
                             for cutoff in cutoffs[::-1]]).plot(style='-*')
    ax.set_xlabel('Cut-off values')
    ax.set_ylabel(r"Kendall's $\tau$")
    ax.get_figure().savefig('data/plots/rpd_b_ktu.pdf',
                            format='pdf',
                            bbox_inches='tight')
    plt.show()

    # ADVANCED
    for run_name, info in zip(list(runs_rpd.keys())[1::2],
                              list(runs_rpd.values())[1::2]):
        rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                                run_b_orig_path=ORIG_B,
                                run_a_orig_path=ORIG_A,
                                run_b_rep_path=None,
                                run_a_rep_path=None)
        rpd_eval.trim()
        rpd_eval.evaluate()

        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)

        for cutoff in cutoffs:
            rpd_eval.trim(cutoff)
            rpd_eval.trim(cutoff, info['run'])
            info['ktu_' + str(cutoff)] = arp(
                rpd_eval.ktau_union(info['run'])['baseline'])

    df_content = {}
    for run_name, info in zip(list(runs_rpd.keys())[1::2],
                              list(runs_rpd.values())[1::2]):
        df_content[run_name] = [
            info.get('ktu_' + str(cutoff)) for cutoff in cutoffs[::-1]
        ]

    ax = pd.DataFrame(data=df_content,
                      index=[str(cutoff)
                             for cutoff in cutoffs[::-1]]).plot(style='-*')
    ax.set_xlabel('Cut-off values')
    ax.set_ylabel(r"Kendall's $\tau$")
    ax.get_figure().savefig('data/plots/rpd_a_ktu.pdf',
                            format='pdf',
                            bbox_inches='tight')
    plt.show()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', nargs="?", type=str)
    args = parser.parse_args()
    config = json.load(open(args.config, 'r'))

    IR_models = [mz.models.list_available()[i]
                 for i in config["index_mz_models"]]

    with open(config["collection_path"] + '/test/qrels', 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)
    evaluator = pytrec_eval.RelevanceEvaluator(qrel, set(config["measures"]))

    bm25_res = json.load(
        open(config["collection_path"] + '/test/' + 'BM25.metrics.json', 'r'))
    with open(config["collection_path"] + '/test/' + 'BM25.res', 'r') as f_run:
        bm25_run = pytrec_eval.parse_run(f_run)
    bm25_results = evaluator.evaluate(bm25_run)

    row = ""
    for key, value in bm25_res.items():
        if key in config["print_measures"]:
            row += str(value)[:6] + " & "
    print('BM25 & ' + row[:-2] + '\\\\')

    all_res = dict()
    for model_class in IR_models:
        validation_path = config["collection_path"] + '/validation/' + model_class.__name__
        test_path = config["collection_path"] + '/test/' + model_class.__name__
        if os.path.exists(validation_path) and os.path.exists(test_path):
            # pick the checkpoint with the best validation score
            best_model = ""
            best_metric = 0
            for file in os.listdir(validation_path):
                if '.json' in file:
                    val_res = json.load(open(validation_path + '/' + file, 'r'))
                    if val_res[config["optim_measure"]] > best_metric:
                        best_model = file
                        best_metric = val_res[config["optim_measure"]]
            if best_model != "" and os.path.exists(test_path + '/' + best_model):
                test_res = json.load(open(test_path + '/' + best_model, 'r'))
                all_res[model_class.__name__] = [best_model, test_res]
                with open(config["collection_path"] + '/test/' +
                          model_class.__name__ + '/' + best_model[:-12] + 'res',
                          'r') as f_run:
                    run = pytrec_eval.parse_run(f_run)
                results = evaluator.evaluate(run)
                query_ids = list(set(bm25_results.keys()) & set(results.keys()))
                row = ""
                for key, value in test_res.items():
                    if key in config["print_measures"]:
                        bm25_scores = [bm25_results[query_id][key]
                                       for query_id in query_ids]
                        scores = [results[query_id][key]
                                  for query_id in query_ids]
                        # paired t-test against BM25, Bonferroni-corrected
                        test = scipy.stats.ttest_rel(bm25_scores, scores)
                        row += str(value)[:6]
                        if test[0] < 0:
                            if test[1] < 0.01 / len(config["print_measures"]):
                                row += "\\textsuperscript{\\textbf{++}}"
                            elif test[1] < 0.05 / len(config["print_measures"]):
                                row += "\\textsuperscript{\\textbf{+}}"
                        else:
                            if test[1] < 0.01 / len(config["print_measures"]):
                                row += "\\textsuperscript{\\textbf{-\,-}}"
                            elif test[1] < 0.05 / len(config["print_measures"]):
                                row += "\\textsuperscript{\\textbf{-}}"
                        row += " & "
                print(model_class.__name__ + ' & ' + row[:-2] + '\\\\')
def main():
    rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                            run_b_orig_path=ORIG_B,
                            run_a_orig_path=ORIG_A,
                            run_b_rep_path=None,
                            run_a_rep_path=None)

    rpd_eval.trim()
    rpd_eval.evaluate()

    for run_name, info in runs_rpd.items():
        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            trim(info['run'])
            info['scores'] = rpd_eval.evaluate(info['run'])

    # Effect Ratio and Delta Relative Improvement for each tf variant
    dri_er = {
        'wcr_tf_{}'.format(i): {
            'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_{}'.format(i)]['scores'],
                              runs_rpd['rpd_wcr0405_tf_{}'.format(i)]['scores']),
            'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_{}'.format(i)]['scores'],
                                runs_rpd['rpd_wcr0405_tf_{}'.format(i)]['scores']),
        }
        for i in range(1, 6)
    }

    measures = ['P_10', 'map', 'ndcg']
    marker_color = [('o', 'b'), ('^', 'g'), ('v', 'r')]

    fig, ax1 = plt.subplots()
    ax1.set_xlabel('Effect Ratio (ER)')
    ax1.set_ylabel(u'Delta Relative Improvement (ΔRI)')
    for measure, mk in zip(measures, marker_color):
        ax1.plot([dri_er[r]['er'][measure] for r in dri_er.keys()],
                 [dri_er[r]['dri'][measure] for r in dri_er.keys()],
                 marker=mk[0],
                 color=mk[1],
                 linestyle='None',
                 label=measure)
    ax1.tick_params(axis='y', labelcolor='k')
    fig.tight_layout()
    plt.axhline(0, color='grey')
    plt.axvline(1, color='grey')
    plt.legend()
    plt.title('Reproducibility')
    plt.savefig('data/plots/rpd_dri_vs_er.pdf',
                format='pdf',
                bbox_inches='tight')
    plt.show()
def main():
    os.chdir(os.path.dirname(os.path.realpath('__file__')))

    # set folders
    corpus_folder = 'corpus/' + FLAGS.corpus_name + '/' + FLAGS.corpus_name
    data_folder = 'corpus/' + FLAGS.corpus_name + '/data'
    query_folder = 'corpus/' + FLAGS.corpus_name + '/queries'
    qrels_folder = 'corpus/' + FLAGS.corpus_name + '/qrels'
    rankings_folder = 'corpus/' + FLAGS.corpus_name + '/rankings/' + FLAGS.model_name

    # create folders
    if not os.path.exists(rankings_folder):
        os.makedirs(rankings_folder)
    if not os.path.exists(query_folder) or not os.path.exists(qrels_folder):
        print('folders containing queries and qrels are required - please add them')
        return False

    # parse and store qrels
    if FLAGS.qrels_fname:
        with open(qrels_folder + '/' + FLAGS.qrels_fname + '.txt', 'r') as qrelf:
            qrels = pytrec_eval.parse_qrel(qrelf)
        # initialize evaluator over qrels
        evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'P'})  # evaluate on Precision
    else:
        print("please provide qrels filename")
        return False

    """
    LEXICAL PREPROCESSING
    """
    # parse input run
    print('parse input run')
    with open(FLAGS.run_path, 'r') as runf:
        run = pytrec_eval.parse_run(runf)

    """
    SEMANTIC PREPROCESSING
    """
    # load required data
    print('load processed data required to perform re-ranking over lexical model w/ semantic model')
    with open(data_folder + '/docs.json', 'r') as cf:
        corpus = json.load(cf)
    with open(data_folder + '/idfs.json', 'r') as wf:
        idfs = json.load(wf)
    with open(data_folder + '/cfs.json', 'r') as cff:
        cfs = json.load(cff)
    with open(data_folder + '/word_dict.json', 'r') as wdf:
        word_dict = json.load(wdf)

    # compute reverse word dictionary
    reverse_word_dict = dict(zip(word_dict.values(), word_dict.keys()))

    # store docnos and docs as separate lists
    docnos = list(corpus.keys())
    docs = list(corpus.values())
    del corpus  # free memory space

    # load semantic model
    print('load semantic model')
    with tf.Session() as sess:
        # restore model and get required tensors
        saver = tf.train.import_meta_graph(FLAGS.semantic_model + '.ckpt.meta')
        saver.restore(sess, FLAGS.semantic_model + '.ckpt')
        word_embs = sess.run(tf.get_default_graph().get_tensor_by_name(
            'embeddings/word_embs:0'))

    # compute doc embeddings
    doc_embs, filt_ids = tf_utils.compute_doc_embs(docs, word_dict, word_embs, idfs)

    """
    COMPUTE RE-RANKING
    """
    # set random seed
    np.random.seed(FLAGS.seed)

    # load queries
    q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
    # get query ids
    qids = list(q.keys())
    # shuffle query ids
    np.random.shuffle(qids)

    if FLAGS.fixed_gamma:
        # perform re-ranking based on a fixed value of gamma
        print('perform re-ranking w/ gamma=%.2f' % (FLAGS.fixed_gamma))
        # initialize combined (output) run
        crun = trec_utils.OnlineTRECRun(FLAGS.model_name + '_gamma_' +
                                        str(FLAGS.fixed_gamma))
        # combine rankings using fixed gamma
        comb_run = tf_utils.compute_combined_run(
            run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
            SCORE_NORMALIZERS[FLAGS.normalizer], FLAGS.fixed_gamma)
        # store test ranking in combined run
        for qid, doc_ids_and_scores in comb_run.items():
            crun.add_ranking(qid, [(score, docno) for docno, score
                                   in doc_ids_and_scores.items()])
        # close and store run
        crun.close_and_write(out_path=rankings_folder + '/' + FLAGS.model_name +
                             '_gamma_' + str(FLAGS.fixed_gamma) + '.txt',
                             overwrite=True)
        print('combined run stored in {}'.format(rankings_folder))
        # evaluate combined run
        print('evaluate run combined w/ gamma=%.2f' % (FLAGS.fixed_gamma))
        tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                          FLAGS.model_name + '_gamma_' + str(FLAGS.fixed_gamma),
                          qrels_folder, FLAGS.qrels_fname)
    else:
        # learn optimal weight to combine runs
        print("learn optimal weight to combine runs with sweep: {}".format(FLAGS.sweep))
        # set variable to store scores and weights
        scores_and_weights = []
        # initialize kfold with FLAGS.num_folds
        kfold = sklearn.model_selection.KFold(n_splits=FLAGS.num_folds)
        for fold, (train_qids, test_qids) in enumerate(kfold.split(qids)):
            print('fold n. {}'.format(fold))
            # restrict queries to train_qids and test_qids
            qtrain = {qids[ix]: q[qids[ix]] for ix in train_qids}
            qtest = {qids[ix]: q[qids[ix]] for ix in test_qids}
            # obtain best combination on training queries
            train_score, best_train_weight = max(
                tf_utils.perform_reranking(run, FLAGS.qfield, qtrain, docnos,
                                           doc_embs, word_dict, word_embs,
                                           FLAGS.sweep,
                                           SCORE_NORMALIZERS[FLAGS.normalizer],
                                           FLAGS.ref_measure, evaluator))
            print('fold %d: best_train_weight=%.2f, %s =%.4f' %
                  (fold, best_train_weight, FLAGS.ref_measure, train_score))
            # compute combined run with best combination on test queries
            test_crun = tf_utils.compute_combined_run(
                run, FLAGS.qfield, qtest, docnos, doc_embs, word_dict,
                word_embs, SCORE_NORMALIZERS[FLAGS.normalizer],
                best_train_weight)
            # evaluate test run
            test_res = evaluator.evaluate(test_crun)
            # compute aggregated measure score for test queries
            test_score = pytrec_eval.compute_aggregated_measure(
                FLAGS.ref_measure,
                [qscore[FLAGS.ref_measure] for qscore in test_res.values()])
            # store averaged scores w/ best weights
            scores_and_weights.append(
                (np.mean([train_score, test_score]), best_train_weight))
        # get (best) weight that produces the highest averaged score
        best_score, best_weight = max(scores_and_weights)
        print('found best weight=%.2f' % (best_weight))
        # initialize combined (output) run
        # note: use the cross-validated best_weight, not a FLAGS value
        crun = trec_utils.OnlineTRECRun(FLAGS.model_name + '_best_weight_' +
                                        str(best_weight))
        # compute combined run based on best weight
        comb_run = tf_utils.compute_combined_run(
            run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
            SCORE_NORMALIZERS[FLAGS.normalizer], best_weight)
        # store ranking in crun
        for qid, doc_ids_and_scores in comb_run.items():
            crun.add_ranking(qid, [(score, doc_id) for doc_id, score
                                   in doc_ids_and_scores.items()])
        # close and store run
        crun.close_and_write(out_path=rankings_folder + '/' + FLAGS.model_name +
                             '_best_weight_' + str(best_weight) + '.txt',
                             overwrite=True)
        print('combined run stored in {}'.format(rankings_folder))
        # evaluate combined run
        print('evaluate run combined w/ {}-fold cross validation and best weight={}'
              .format(FLAGS.num_folds, best_weight))
        tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                          FLAGS.model_name + '_best_weight_' + str(best_weight),
                          qrels_folder, FLAGS.qrels_fname)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--qrel',
        default='/Users/woffee/www/emse-apiqa/QA2021/data/QA2021_stackoverflow4_qrel.txt')
    parser.add_argument(
        '--run',
        default='/Users/woffee/www/emse-apiqa/QA2021/data/pyltr_pred.txt')
    args = parser.parse_args()

    print("args.qrel:", args.qrel)
    print("args.run:", args.run)

    assert os.path.exists(args.qrel)
    assert os.path.exists(args.run)

    final_auc, final_accuracy = calc_auc(args.qrel, args.run)

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(args.run, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    print("==========")
    selected_measures = ['map', 'recip_rank', 'P_5', 'P_10', 'P_15', 'P_20',
                         'recall_5', 'recall_10', 'recall_15', 'recall_20',
                         'ndcg']

    # aggregate each selected measure over all evaluated queries
    eva_values = {}
    for measure in selected_measures:
        eva_values[measure] = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])

    for measure in selected_measures:
        print_line(measure, 'all', eva_values[measure])

    # single tab-separated summary line
    print("%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f" %
          (final_auc, final_accuracy, eva_values['map'],
           eva_values['recip_rank'], eva_values['P_5'], eva_values['P_10'],
           eva_values['P_15'], eva_values['P_20'], eva_values['recall_5'],
           eva_values['recall_10'], eva_values['recall_15'],
           eva_values['recall_20'], eva_values['ndcg']))