def eval(qrel_file_path, run_file_path):
    """Evaluate a run file against a qrel file.

    Arguments:
        qrel_file_path {string} -- path of the qrel file, usually located in the source language folder
        run_file_path {string} -- path of the run file, usually located in the results folder of a language

    Returns:
        tuple -- precision@5, precision@10, precision@20, MAP and R-precision, each rounded to four digits
    """
    r1 = TrecRun(run_file_path)
    qrels = TrecQrel(qrel_file_path)

    te = TrecEval(r1, qrels)
    p5 = te.get_precision(depth=5)
    p10 = te.get_precision(depth=10)
    p20 = te.get_precision(depth=20)
    map = te.get_map()
    rprec = te.get_rprec()

    run_object = r1.evaluate_run(qrels, per_query=True)  # per-query results (currently unused)

    return round(p5, 4), round(p10, 4), round(p20, 4), round(map, 4), round(rprec, 4)
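A minimal usage sketch for the function above; the qrel and run file paths are hypothetical placeholders:

p5, p10, p20, map_score, rprec = eval("source/english.qrel", "results/english/bm25.run")
print(p5, p10, p20, map_score, rprec)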
def report_run_per_query(qrels, run_file_name, remove_docs_with_zero_score=False):
    run = TrecRun(run_file_name)
    system = run.run_data['system'][0]
    if remove_docs_with_zero_score:
        run.run_data = run.run_data[run.run_data['score'] > 0]

    trec_eval = TrecEval(run, qrels)
    bpref = trec_eval.getBpref(per_query=True)
    ndcg_10 = trec_eval.getNDCG(depth=10, per_query='query')
    ndcg = trec_eval.getNDCG(per_query='query')

    ret = bpref.join(ndcg_10, on='query')
    ret = ret.join(ndcg, on='query')

    for query, r in ret.iterrows():
        yield json.dumps({
            'corpus': extract_corpus(run_file_name),
            'topic': query,
            'tag': system,
            "bpref": r['Bpref@1000'],
            "pseudoNDCG@10": r['NDCG@10'],
            "pseudoNDCG": r['NDCG@1000']
        })
def test_documents_to_remove_for_leave_one_out_with_single_topic_and_multiple_runs_reverse(cls):
    run_01 = TrecRun('test/resources/sample-run-file-01')
    run_02 = TrecRun('test/resources/sample-run-file-02')

    actual = identify_judgments_to_remove_for_leave_one_out([run_02, run_01])

    verify(actual.to_csv(header=False))
def run_file_to_jsonl(input_file, output_file):
    from trectools import TrecRun
    import json

    with open(output_file, 'w') as out:
        queries = TrecRun(input_file).run_data.groupby('query')
        for query in queries.groups:
            out.write(json.dumps([i[1].to_dict() for i in queries.get_group(query).iterrows()]) + '\n')
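A usage sketch, with placeholder file names; each output line holds one query's ranked documents serialized as a JSON array:

run_file_to_jsonl("runs/bm25.run", "runs/bm25.jsonl")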
def setUp(self):
    run1 = TrecRun("./files/r4.run")
    qrels1 = TrecQrel("./files/qrel1.txt")

    run2 = TrecRun("./files/input.uic0301")
    qrels2 = TrecQrel("./files/robust03_cs_qrels.txt")

    # Contains the first 30 documents for the first 10 topics in input.uic0301
    run3 = TrecRun("./files/input.uic0301_top30")

    self.commontopics = [303, 307, 310, 314, 320, 322, 325, 330, 336, 341]
    self.teval1 = TrecEval(run1, qrels1)
    self.teval2 = TrecEval(run2, qrels2)
    self.teval3 = TrecEval(run3, qrels2)
def compute_map(valid_codes, pred, gs_out_path=None):
    """
    Custom function to compute MAP evaluation metric.
    Code adapted from https://github.com/TeMU-BSC/CodiEsp-Evaluation-Script/blob/master/codiespD_P_evaluation.py
    """
    # Input args default values
    if gs_out_path is None:
        gs_out_path = './intermediate_gs_file.txt'

    pred_out_path = './intermediate_predictions_file.txt'

    ###### 2. Format predictions as TrecRun format: ######
    format_predictions(pred, pred_out_path, valid_codes)

    ###### 3. Calculate MAP ######
    # Load GS from qrel file
    qrels = TrecQrel(gs_out_path)

    # Load pred from run file
    run = TrecRun(pred_out_path)

    # Calculate MAP
    te = TrecEval(run, qrels)
    MAP = te.get_map(trec_eval=False)  # With this option False, rank order is taken from the given document order

    ###### 4. Return results ######
    return MAP
def run(self, index, topics, debug=True, model="PL2", ndocs=1000, result_dir=None, result_file="trec_terrier.run",
        terrierc=None, qexp=False, expTerms=5, expDocs=3, expModel="Bo1", showoutput=False):
    if result_dir is None:
        # Current dir is used if result_dir is not set
        result_dir = os.getcwd()

    cmd = "%s batchretrieve -t %s -w %s -Dtrec.results=%s -o %s" % (self.bin_path, topics, model, result_dir, result_file)
    cmd += " -Dmatching.retrieved_set_size=%d -Dtrec.output.format.length=%d " % (ndocs, ndocs)

    if terrierc is not None:
        cmd += " -c c:%d " % (terrierc)

    if qexp:
        cmd += " -q -Dexpansion.terms=%d -Dexpansion.documents=%d -c qemodel:%s" % (expTerms, expDocs, expModel)

    if not showoutput:
        cmd += (" > %s 2> %s" % (os.devnull, os.devnull))

    if debug:
        print("Running: %s " % (cmd))

    r = sarge.run(cmd).returncode

    if r == 0:
        return TrecRun(os.path.join(result_dir, result_file))
    else:
        print("ERROR with command %s" % (cmd))
        return None
def run(self, index, metadata, documents_vector, terms_vector, topics, topic_format="TREC",
        index_type="block_simdbp", algorithm="block_max_wand", result_dir=None,
        result_file="trec_pisa.run", ndocs=1000, showerrors=True, debug=True):
    if result_dir is None:
        # Current dir is used if result_dir is not set
        result_dir = os.getcwd()

    outpath = ""
    if result_dir is not None and result_file is not None:
        outpath = os.path.join(result_dir, result_file)
    elif result_file is not None:
        outpath = result_file

    self.extract_topics(topics, topic_format)

    cmd = "%s/evaluate_queries -t %s -a %s -i %s -w %s --documents %s --terms %s -k %s -q topics.title" % (
        self.bin_path, index_type, algorithm, index, metadata, documents_vector, terms_vector, ndocs)

    if showerrors:
        cmd += (" > %s " % (outpath))
    else:
        cmd += (" 2> %s > %s " % (os.devnull, outpath))

    if debug:
        print("Running: %s " % (cmd))

    r = sarge.run(cmd).returncode

    if r == 0:
        return TrecRun(os.path.join(result_dir, result_file))
    else:
        print("ERROR with command %s" % (cmd))
        return None
def make_pool_from_files(filenames, strategy="topX", topX=10, rbp_strategy="sum", rbp_p=0.80, rrf_den=60):
    """
    Creates a pool object (TrecPool) from a list of filenames.

    strategy = (topX, rbp, rrf). Default: topX

    * TOP X options:
        topX = Integer value. The number of documents per query to make the pool.

    * RBP options:
        topX = Integer value. The number of documents per query to make the pool. Default: 10.
        rbp_strategy = (max, sum). Only in case strategy=rbp. Default: "sum"
        rbp_p = A float value for RBP's p. Only in case strategy=rbp. Default: 0.80

    * RRF options:
        rrf_den = Value for the Reciprocal Rank Fusion denominator. Default: 60
    """
    runs = []
    for fname in filenames:
        runs.append(TrecRun(fname))

    return make_pool(runs, strategy, topX=topX, rbp_p=rbp_p, rbp_strategy=rbp_strategy, rrf_den=rrf_den)
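A usage sketch for the pooling helper above; the run file names are placeholders, and make_pool is assumed to be available in the same module (it is not defined in this snippet):

pool = make_pool_from_files(["runs/system_a.txt", "runs/system_b.txt"], strategy="topX", topX=10)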
def fuse_runs(run1, run2):
    # https://dl.acm.org/doi/10.1145/1571941.1572114
    r1 = TrecRun(f"runs/{run1}")
    r2 = TrecRun(f"runs/{run2}")

    # Perform reciprocal rank fusion.
    fused_run = fusion.reciprocal_rank_fusion([r1, r2], max_docs=100)

    # Clean up names.
    name1 = run1.replace(".txt", "")
    name2 = run2.replace(".txt", "")

    # Save fused run to disk.
    fused_run.print_subset(f"runs/fuse_{name1}_{name2}.txt", topics=fused_run.topics())
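A usage sketch, assuming both input runs live in a runs/ directory; the file names are placeholders:

fuse_runs("system_a.txt", "system_b.txt")
# writes runs/fuse_system_a_system_b.txt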
def list_of_runs_from_path(path, suffix="*"):
    runs = []
    for r in glob(os.path.join(path, suffix)):
        tr = TrecRun(r)
        runs.append(tr)
    print("Found %s runs in path %s" % (len(runs), path))
    return runs
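A usage sketch with a placeholder directory; the function globs the path and wraps every matching file in a TrecRun:

runs = list_of_runs_from_path("runs/", "*.txt")
for run in runs:
    print(run.get_filename())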
def report_run(qrels, run_file_name, remove_docs_with_zero_score=False):
    run = TrecRun(run_file_name)
    system = run.run_data['system'][0]
    if remove_docs_with_zero_score:
        run.run_data = run.run_data[run.run_data['score'] > 0]

    trec_eval = TrecEval(run, qrels)
    ret = {
        'corpus': extract_corpus(run_file_name),
        'topics': extract_topics(run_file_name),
        'tag': system,
        "bpref": trec_eval.getBpref(),
        "pseudoNDCG@10": trec_eval.getNDCG(depth=10),
        "pseudoNDCG": trec_eval.getNDCG()
    }

    return json.dumps(ret)
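A hedged usage sketch for the function above; extract_corpus and extract_topics are helpers defined elsewhere in the same module, and the qrel and run paths are placeholders:

qrels = TrecQrel("qrels/judgments.txt")
print(report_run(qrels, "runs/bm25-run.txt"))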
def trec_eval_ndcg(run_name, data_path='./data/', depths=[5, 10, 15, 20, 30, 100, 200, 500, 1000]):
    qrel_name = os.path.join(data_path, '2019qrels-pass.txt')
    qrel = TrecQrel(qrel_name)
    res = TrecRun(run_name)
    for depth in depths:
        score = TrecEval(res, qrel).get_ndcg(depth=depth)
        print('ndcg_cur_%d \t all \t %.4f' % (depth, score))
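A usage sketch; the run path is a placeholder, and the qrel file 2019qrels-pass.txt is assumed to exist in the default data directory:

trec_eval_ndcg("runs/my_system.run", data_path="./data/")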
def run(self, index, topics, model="LM", server=None, stopper=None, result_dir=None,
        result_file="trec_indri.run", ndocs=1000, qexp=False, expTerms=5, expDocs=3,
        showerrors=True, debug=True, queryOffset=1):
    if result_dir is None:
        # Current dir is used if result_dir is not set
        result_dir = os.getcwd()

    outpath = ""
    if result_dir is not None and result_file is not None:
        outpath = os.path.join(result_dir, result_file)
    elif result_file is not None:
        outpath = result_file

    cmd = "%s/IndriRunQuery %s -index=%s -trecFormat=true -queryOffset=%d " % (
        self.bin_path, topics, index, queryOffset)

    # Specify number of documents to retrieve
    cmd += " -count=%d " % (ndocs)

    if server is not None:
        cmd += " -server=%s " % (server)

    if stopper is not None:
        cmd += " -stopper.word=%s " % (stopper)

    if qexp:
        cmd += " -fbDocs=%d -fbTerms=%d " % (expDocs, expTerms)

    if showerrors:
        cmd += (" > %s " % (outpath))
    else:
        cmd += (" 2> %s > %s " % (os.devnull, outpath))

    if debug:
        print("Running: %s " % (cmd))

    r = sarge.run(cmd).returncode

    if r == 0:
        return TrecRun(os.path.join(result_dir, result_file))
    else:
        print("ERROR with command %s" % (cmd))
        return None
def trec_eval(file):
    r1 = TrecRun(file)
    qrels = TrecQrel("./dataset/.txt")

    results = TrecEval(r1, qrels)
    p5 = results.get_precision(5)
    p10 = results.get_precision(10)
    p15 = results.get_precision(15)

    print(p5)
    print(p10)
    print(p15)
def collect(qrelsFilePath, baseDir):
    qrels = TrecQrel(qrelsFilePath)

    result = {}
    for i, [topicPath, topicNum] in enumerate(
            sorted(_getDirectoryContent(baseDir, directory=True), key=lambda a_b: int(a_b[1]))):
        for modelPath, modelName in _getDirectoryContent(topicPath, directory=True):
            modelName = modelName[:-4]
            if modelName not in result:
                result[modelName] = {}

            for filePath, fileName in _getDirectoryContent(modelPath, file=True):
                score = 0
                # Only evaluate non-empty files
                if os.path.getsize(filePath) > 0:
                    run = TrecRun(filePath)
                    runResult = run.evaluate_run(qrels, True)
                    rs = list(runResult.get_results_for_metric('P_10').values())
                    score = np.mean(rs)

                if fileName not in result[modelName]:
                    result[modelName][fileName] = [score]
                else:
                    result[modelName][fileName].append(score)

            print("Finished processing model {} of topic {}".format(modelName, topicNum))
        print("Finished processing topic: ", topicNum)

    # Calculate average over all topics
    for modelName in result:
        for comparisonName in result[modelName]:
            result[modelName][comparisonName] = sum(result[modelName][comparisonName]) / len(result[modelName][comparisonName])

    return result
def main(gs_path, pred_path, codes_path):
    '''
    Load GS, predictions and valid codes; format GS and predictions
    according to TREC specifications; compute MAP and print it.

    Parameters
    ----------
    gs_path : str
        Path to the Gold Standard TSV with 2 columns: filename, code.
        It has no header row.
    pred_path : str
        Path to the predictions TSV with 2 columns: filename, code.
        It has no header row.
    codes_path : str
        Path to the TSV file with valid codes.
        It has no header row.

    Returns
    -------
    None.
    '''
    ###### 0. Load valid codes lists: ######
    valid_codes = set(pd.read_csv(codes_path, sep='\t', header=None, usecols=[0])[0].tolist())
    valid_codes = set([x.lower() for x in valid_codes])

    ###### 1. Format GS as TrecQrel format: ######
    qid_gs = format_gs(gs_path, './intermediate_gs_file.txt')

    ###### 2. Format predictions as TrecRun format: ######
    format_predictions(pred_path, './intermediate_predictions_file.txt', valid_codes, qid_gs)

    ###### 3. Calculate MAP ######
    # Load GS from qrel file
    qrels = TrecQrel('./intermediate_gs_file.txt')

    # Load pred from run file
    run = TrecRun('./intermediate_predictions_file.txt')

    # Calculate MAP
    te = TrecEval(run, qrels)
    MAP = te.get_map(trec_eval=False)  # With this option False, rank order is taken from the given document order

    ###### 4. Show results ######
    print('\nMAP estimate: {}\n'.format(round(MAP, 3)))
    #print('\n{}'.format(round(MAP, 3)))
    print('{}|{}'.format(pred_path, round(MAP, 3)))
def trec_eval(runs_file_path: Path or str, qrels_file_path: Path or str):
    metrics = dict()

    r1 = TrecRun(str(runs_file_path.absolute()))
    qrels = TrecQrel(str(qrels_file_path.absolute()))
    results = TrecEval(r1, qrels)

    metrics["P@5"] = results.get_precision(5)
    metrics["P@10"] = results.get_precision(10)
    metrics["P@15"] = results.get_precision(15)
    metrics["bpref"] = results.get_bpref()
    metrics["map"] = results.get_map()

    metrics = {k: round(v, 4) for k, v in metrics.items()}

    return metrics
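A usage sketch; the function calls .absolute() on its arguments, so it needs pathlib.Path objects rather than plain strings, and the paths below are placeholders:

from pathlib import Path

metrics = trec_eval(Path("runs/bm25.run"), Path("qrels/qrels.txt"))
print(metrics)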
def reciprocal_rank_fusion(trec_runs, k=60, max_docs=1000, output=sys.stdout):
    """
    Implements reciprocal rank fusion as defined in
    ``Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods``
    by Cormack, Clarke and Buettcher.

    Parameters:
        k: term to avoid vanishing importance of lower-ranked documents. Default value is 60 (default value used in their paper).
        output: a file pointer to write the results. sys.stdout is the default.
    """
    outputRun = TrecRun()
    rows = []
    topics = trec_runs[0].topics()

    for topic in sorted(topics):
        doc_scores = {}
        for r in trec_runs:
            docs_for_run = r.get_top_documents(topic, n=1000)

            for pos, docid in enumerate(docs_for_run, start=1):
                doc_scores[docid] = doc_scores.get(docid, 0.0) + 1.0 / (k + pos)

        # Writes out information for this topic
        for rank, (docid, score) in enumerate(sorted(iter(doc_scores.items()), key=lambda x: (-x[1], x[0]))[:max_docs], start=1):
            # output.write("%s Q0 %s %d %f reciprocal_rank_fusion_k=%d\n" % (str(topic), docid, rank, score, k))
            rows.append((topic, "Q0", docid, rank, score, "reciprocal_rank_fusion_k=%d" % k))

    df = pd.DataFrame(rows)
    df.columns = ["query", "q0", "docid", "rank", "score", "system"]
    df["q0"] = df["q0"].astype(str)  # np.str was removed in recent NumPy releases

    outputRun.run_data = df.copy()
    return outputRun
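A usage sketch of the fusion function above, mirroring the other examples in this collection; the run file paths are placeholders:

r1 = TrecRun("runs/system_a.txt")
r2 = TrecRun("runs/system_b.txt")
fused = reciprocal_rank_fusion([r1, r2], k=60, max_docs=1000)
fused.print_subset("runs/fused.txt", topics=fused.topics())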
class TestTrecRun(unittest.TestCase):

    def setUp(self):
        self.run = TrecRun("./files/r1.run")

    def tearDown(self):
        pass

    def test_topics(self):
        topics = self.run.topics()
        self.assertCountEqual(topics, [1, 2])

    def test_get_filename(self):
        self.assertEqual(self.run.get_filename(), "r1.run")

    def test_get_full_filename_path(self):
        fullname = self.run.get_full_filename_path()
        self.assertTrue("/files/r1.run" in fullname)

    def test_topics_intersection_with(self):
        another_run = TrecRun("./files/r2.run")
        intersection = self.run.topics_intersection_with(another_run)
        self.assertCountEqual(intersection, [1])

    def test_get_top_documents(self):
        topic1_top2 = self.run.get_top_documents(1, n=2)
        topic2_top2 = self.run.get_top_documents(2, n=2)
        self.assertCountEqual(topic1_top2, ["doc1_1", "doc1_2"])
        self.assertCountEqual(topic2_top2, ["doc2_1", "doc2_3"])

    def test_get_mean_coverage(self):
        #trecqrel = TrecQrel("./files/qrel1.txt")
        #print(self.run.get_mean_coverage(trecqrel))
        pass

    def test_check_qrel_coverage(self):
        #self.run.check_qrel_coverage(self, trecqrel, topX=10)
        pass
def main(args):
    gold_labels = TrecQrel(args.gold_labels)
    prediction = TrecRun(args.scores)

    results = TrecEval(prediction, gold_labels)
    metrics = extract_metrics(results, args.metrics)

    metrics.loc[:, '@depth'] = metrics.loc[:, '@depth'].astype(str)
    metrics.loc[:, '@depth'] = metrics.loc[:, '@depth'].replace(str(MAX_DEPTH), 'all')

    if args.output:
        metrics.to_csv(args.output, sep="\t", index=False)
        logger.info(f"Saved results to {args.output}")
    else:
        print(metrics.to_string(index=False))
def report_run(qrels, corpus, topics, run_file_name):
    run = TrecRun(run_file_name)
    trec_eval = TrecEval(run, qrels)

    ret = {
        'corpus': corpus,
        'topics': topics,
        'tag': run.run_data['system'][0],
        "bpref": trec_eval.getBpref(),
        "pseudoNDCG@10": trec_eval.getNDCG(depth=10, removeUnjudged=True),
        "pseudoNDCG": trec_eval.getNDCG(removeUnjudged=True),
    }

    return json.dumps(ret)
class TestTrecRun(unittest.TestCase):

    def setUp(self):
        self.run = TrecRun("./files/r1.run")

    def tearDown(self):
        pass

    def test_topics(self):
        topics = self.run.topics()
        self.assertListEqual(topics, [1, 2])

    def test_get_filename(self):
        self.assertEqual(self.run.get_filename(), "r1.run")

    def test_get_full_filename_path(self):
        fullname = self.run.get_full_filename_path()
        self.assertTrue("/files/r1.run" in fullname)

    def test_topics_intersection_with(self):
        another_run = TrecRun("./files/r2.run")
        intersection = self.run.topics_intersection_with(another_run)
        self.assertSetEqual(intersection, set([1]))

    def test_get_top_documents(self):
        topic1_top2 = self.run.get_top_documents(1, n=2)
        topic2_top2 = self.run.get_top_documents(2, n=2)
        self.assertListEqual(topic1_top2, ["doc1_1", "doc1_2"])
        self.assertListEqual(topic2_top2, ["doc2_1", "doc2_3"])

    def test_get_mean_coverage(self):
        #trecqrel = TrecQrel("./files/qrel1.txt")
        #print(self.run.get_mean_coverage(trecqrel))
        pass

    def test_check_qrel_coverage(self):
        #self.run.check_qrel_coverage(self, trecqrel, topX=10)
        pass
def main(args):
    format_check_passed = run_checks(args.scores)
    if not format_check_passed:
        return

    gold_labels = TrecQrel(args.gold_labels)
    prediction = TrecRun(args.scores)

    results = TrecEval(prediction, gold_labels)
    metrics = extract_metrics(results, args.metrics, args.depths)

    metrics.loc[:, '@depth'] = metrics.loc[:, '@depth'].astype(str)
    metrics.loc[:, '@depth'] = metrics.loc[:, '@depth'].replace(str(MAX_DEPTH), 'all')

    if args.output:
        metrics.to_csv(args.output, sep='\t', index=False)
        logger.info(f'Saved results to file: {args.output}')
    else:
        print(metrics.to_string(index=False))
def evaluate(qrels, runs_file, topics, model):
    runs = TrecRun(runs_file)
    ev = TrecEval(runs, qrels)
    path_to_csv = os.path.join("eval", model, "results.csv")
    n_topics = len(topics)

    # Calculate various metrics for each query considering the runs/judgment files provided
    print("Calculating metrics...")
    res = ev.evaluate_all(per_query=True)

    # Write results of evaluation to csv file
    res.printresults(path_to_csv, "csv", perquery=True)

    # Calculate NDCG@100 for each query, since the previous metrics don't include it,
    # and append it to each line of the new csv file
    ndcgs = ev.get_ndcg(depth=100, per_query=True)
    values = [row['NDCG@100'] for i, row in ndcgs.iterrows()]  # 'NDCG@100' is the column name in the pandas DataFrame

    with open(path_to_csv, 'r') as f:
        lines = [line[:-1] for line in f]  # Remove '\n' from the end of each line

    lines[0] += ",ndcg@100\n"  # Add new column to header

    # Lines 1 to n contain metric values for each of the n queries
    for i in range(1, n_topics + 1):
        lines[i] += "," + str(values[i - 1]) + "\n"  # Line i holds the value at index i-1, since arrays start at 0

    global_ndcg = ev.get_ndcg(depth=100, per_query=False)  # Calculate global NDCG
    lines[n_topics + 1] += "," + str(global_ndcg) + "\n"  # Append global NDCG to last line

    with open(path_to_csv, 'w') as f:
        f.writelines(lines)  # Overwrite csv file with new content
def setUp(self):
    self.run = TrecRun("./files/r1.run")
def test_topics_intersection_with(self):
    another_run = TrecRun("./files/r2.run")
    intersection = self.run.topics_intersection_with(another_run)
    self.assertSetEqual(intersection, set([1]))
from trectools import TrecRun, TrecQrel
from trectools import procedures
import glob
import os

task1_run_filepath = "../runs_t1/"
qrels_top = "../qrels/task1.qrels"

filepath = glob.glob(os.path.join(task1_run_filepath, "*.txt"))
topqrels = TrecQrel(qrels_top)

results = []
for filename in filepath:
    r = TrecRun(filename)
    res = r.evaluate_run(topqrels)
    results.append(res)

p10 = procedures.get_results(results, "P_10")
procedures.plot_system_rank("task1_p10.jpg", p10, "P@10")

bpref = procedures.get_results(results, "bpref")
procedures.plot_system_rank("task1_bpref.jpg", bpref, "BPREF")

map_ = procedures.get_results(results, "map")
procedures.plot_system_rank("task1_map.jpg", map_, "MAP")
def createCLEFTrecRun(r):
    tr = TrecRun(r)
    tr.__class__ = CLEFTrecRun
    tr.modify_query_ids()
    return tr
from trectools import TrecRun, TrecEval, TrecQrel, fusion

r1 = TrecRun("/storage/proj/petra/projects/podcasts/experiments/experiment5/test_output.5")
r2 = TrecRun("/storage/proj/petra/projects/podcasts/experiments/experiment5/test_output.6")

# Easy way to create new baselines by fusing existing runs:
#fused_run = fusion.reciprocal_rank_fusion([r1, r2])
fused_run = fusion.combos([r1, r2], strategy="mnz")
print(fused_run)

qrels_file = "/storage/proj/petra/projects/podcasts/podcasts_2020_train.1-8.qrels"
qrels = TrecQrel(qrels_file)

r1_p10 = TrecEval(r1, qrels).get_precision(depth=10)
r2_p10 = TrecEval(r2, qrels).get_precision(depth=10)
fused_run_p10 = TrecEval(fused_run, qrels).get_precision(depth=10)

r1_map = TrecEval(r1, qrels).get_map()
r2_map = TrecEval(r2, qrels).get_map()
fused_run_map = TrecEval(fused_run, qrels).get_map()

r1_ndcg = TrecEval(r1, qrels).get_ndcg()
r2_ndcg = TrecEval(r2, qrels).get_ndcg()
fused_run_ndcg = TrecEval(fused_run, qrels).get_ndcg()

print("NDCG -- Run 1: %.3f, Run 2: %.3f, Fusion Run: %.3f" % (r1_ndcg, r2_ndcg, fused_run_ndcg))
elif retrieval_approach == 'lr':
    mypath = "../../data/runs/iterative_lr_run_dir/clean"
else:
    mypath = "../../data/runs/iterative_lr_ir_run_dir/"

run_files = [join(mypath, f) for f in listdir(mypath) if isfile(join(mypath, f)) and f.endswith('.run')]
print(run_files)

p_10 = np.zeros(30)
p_20 = np.zeros(30)
count = 0
for run_file in run_files:
    run = TrecRun(run_file)
    #print 'run loaded'
    res = run.evaluate_run(myQrel)
    #print 'run evaluated'

    keys = [item for item in res.get_results_for_metric("P_20").keys()]
    keys = sorted(keys, key=int)
    values_p20 = [res.get_results_for_metric("P_20")[i] for i in keys]
    values_p20 = np.asarray(values_p20)
    p_20 += values_p20

    keys = [item for item in res.get_results_for_metric("P_10").keys()]
    keys = sorted(keys, key=int)
    values_p10 = [res.get_results_for_metric("P_10")[i] for i in keys]
    values_p10 = np.asarray(values_p10)
    p_10 += values_p10

    count += 1
def load_trec_runs(paths: List[str]) -> List[TrecRun]:
    print(f'Loading {len(paths)} runs')
    return [TrecRun(path) for path in paths]
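A usage sketch with placeholder run paths:

runs = load_trec_runs(["runs/system_a.txt", "runs/system_b.txt"])
print(len(runs))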
def map(self):
    qrels_file = TrecQrel("./Data/qrel.txt")
    path_to_runs = TrecRun("./Data/run.txt")
    te = TrecEval(path_to_runs, qrels_file)
    dic = {"map": te.get_map(), "ndcg": te.get_ndcg()}
    return dic