def test_msmarco_doc_ance_bf_otf(self):
    output_file = 'test_run.msmarco-doc.passage.ance-maxp.otf.txt'
    self.temp_files.append(output_file)
    cmd1 = f'python -m pyserini.dsearch --topics msmarco-doc-dev \
                    --index msmarco-doc-ance-maxp-bf \
                    --encoder castorini/ance-msmarco-doc-maxp \
                    --output {output_file} \
                    --hits 1000 \
                    --max-passage \
                    --max-passage-hits 100 \
                    --msmarco \
                    --batch-size {self.batch_size} \
                    --threads {self.threads}'
    cmd2 = f'python -m pyserini.eval.msmarco_doc_eval --judgments msmarco-doc-dev --run {output_file}'
    status = os.system(cmd1)
    stdout, stderr = run_command(cmd2)
    score = parse_score(stdout, "MRR @100")
    self.assertEqual(status, 0)
    # We get a small difference, 0.3794, on macOS.
    self.assertAlmostEqual(score, 0.3797, delta=0.0003)
def test_core17_lr(self):
    pyserini_topics = 'core17'
    run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} ' \
                   + f'--topics {pyserini_topics} --output {self.tmp}/core17_lr.txt ' \
                   + f'--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.7'
    status = os.system(run_file_cmd)
    self.assertEqual(status, 0)

    score_cmd = f'{self.anserini_root}/tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30 ' \
                + f'{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt ' \
                + f'{self.tmp}/core17_lr.txt'
    status = os.system(score_cmd)
    stdout, stderr = run_command(score_cmd)
    score = parse_score(stdout, "map")
    self.assertEqual(status, 0)
    self.assertEqual(stderr, '')
    self.assertAlmostEqual(score, 0.2473, delta=0.0001)
def test_msmarco_doc_tct_colbert_bf_otf(self):
    output_file = 'test_run.msmarco-doc.passage.tct_colbert.txt'
    self.temp_files.append(output_file)
    cmd1 = f'python -m pyserini.dsearch --topics msmarco-doc-dev \
                    --index msmarco-doc-tct_colbert-bf \
                    --encoder castorini/tct_colbert-msmarco \
                    --output {output_file} \
                    --hits 1000 \
                    --max-passage \
                    --max-passage-hits 100 \
                    --msmarco \
                    --batch-size {self.batch_size} \
                    --threads {self.threads}'
    cmd2 = f'python -m pyserini.eval.msmarco_doc_eval --judgments msmarco-doc-dev --run {output_file}'
    status = os.system(cmd1)
    stdout, stderr = run_command(cmd2)
    score = parse_score(stdout, "MRR @100")
    self.assertEqual(status, 0)
    self.assertEqual(stderr, '')
    self.assertAlmostEqual(score, 0.3323, places=4)
def test_sum_aggregation(self):
    os.system('python -m pyserini.search.lucene.irst \
                --qrels tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt \
                --tran_path irst_test/ibm_model_1_bert_tok_20211117/ \
                --query_path irst_test/queries.dev.small.json \
                --index msmarco-passage-ltr \
                --output irst_test/regression_test_sum.txt \
                --alpha 0.1')
    score_cmd = f'{self.pyserini_root}/tools/eval/trec_eval.9.0.4/trec_eval \
                  -c -M1000 -m map -m ndcg_cut.20 {self.qrels_path} irst_test/regression_test_sum.txt'
    status = os.system(score_cmd)
    stdout, stderr = run_command(score_cmd)
    map_score = parse_score(stdout, "map")
    ndcg_score = parse_score(stdout, "ndcg")
    self.assertEqual(status, 0)
    self.assertEqual(stderr, '')
    self.assertEqual(map_score, 0.2294)
    self.assertEqual(ndcg_score, 0.2997)
def test_msmarco_v2_doc_unicoil_noexp_otf(self):
    output_file = 'test_run.msmarco-v2-doc.unicoil-noexp.0shot.otf.tsv'
    self.temp_files.append(output_file)
    cmd1 = f'python -m pyserini.search.lucene --topics msmarco-v2-doc-dev \
                    --encoder castorini/unicoil-noexp-msmarco-passage \
                    --index msmarco-v2-doc-per-passage-unicoil-noexp-0shot \
                    --output {output_file} \
                    --impact \
                    --hits 10000 \
                    --batch {self.batch_size} \
                    --threads {self.threads} \
                    --max-passage-hits 1000 \
                    --max-passage \
                    --min-idf 1'
    cmd2 = f'python -m pyserini.eval.trec_eval -c -M 100 -m map -m recip_rank msmarco-v2-doc-dev {output_file}'
    status = os.system(cmd1)
    stdout, stderr = run_command(cmd2)
    score = parse_score(stdout, "recip_rank")
    self.assertEqual(status, 0)
    self.assertAlmostEqual(score, 0.2032, delta=0.0001)
def test_dpr_trivia_test_bf(self):
    output_file = 'test_run.dpr.trivia-test.multi.bf.trec'
    retrieval_file = 'test_run.dpr.trivia-test.multi.bf.json'
    self.temp_files.extend([output_file, retrieval_file])
    cmd1 = f'python -m pyserini.dsearch --topics dpr-trivia-test \
                    --index wikipedia-dpr-multi-bf \
                    --output {output_file} \
                    --batch-size {self.batch_size} --threads {self.threads}'
    cmd2 = f'python scripts/dpr/convert_trec_run_to_retrieval_json.py --topics dpr-trivia-test \
                    --index wikipedia-dpr \
                    --input {output_file} \
                    --output {retrieval_file}'
    cmd3 = f'python tools/scripts/dpr/evaluate_retrieval.py --retrieval {retrieval_file} --topk 20'
    status1 = os.system(cmd1)
    status2 = os.system(cmd2)
    stdout, stderr = run_command(cmd3)
    score = parse_score(stdout, "Top20")
    self.assertEqual(status1, 0)
    self.assertEqual(status2, 0)
    self.assertAlmostEqual(score, 0.7887, places=4)
def test_msmarco_passage_tct_colbert_bf(self):
    output_file = 'test_run.msmarco-passage.tct_colbert.bf.tsv'
    self.temp_files.append(output_file)
    cmd1 = f'python -m pyserini.dsearch --topics msmarco-passage-dev-subset \
                    --index msmarco-passage-tct_colbert-bf \
                    --batch-size {self.batch_size} \
                    --threads {self.threads} \
                    --output {output_file} \
                    --msmarco'
    cmd2 = f'python tools/scripts/msmarco/msmarco_passage_eval.py \
                    tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt \
                    {output_file}'
    status = os.system(cmd1)
    stdout, stderr = run_command(cmd2)
    score = parse_score(stdout, "MRR @10")
    self.assertEqual(status, 0)
    self.assertEqual(stderr, '')
    # We get a small difference in scores on macOS (vs. Linux):
    if platform.system() == 'Darwin':
        self.assertAlmostEqual(score, 0.3349, places=4)
    else:
        self.assertAlmostEqual(score, 0.3350, places=4)
def test_nq_test_ance_bf_otf(self):
    output_file = 'test_run.ance.nq-test.multi.bf.otf.trec'
    retrieval_file = 'test_run.ance.nq-test.multi.bf.otf.json'
    self.temp_files.extend([output_file, retrieval_file])
    cmd1 = f'python -m pyserini.dsearch --topics dpr-nq-test \
                    --index wikipedia-ance-multi-bf \
                    --encoder castorini/ance-dpr-question-multi \
                    --output {output_file} \
                    --batch-size {self.batch_size} --threads {self.threads}'
    cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-nq-test \
                    --index wikipedia-dpr \
                    --input {output_file} \
                    --output {retrieval_file}'
    cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
    status1 = os.system(cmd1)
    status2 = os.system(cmd2)
    stdout, stderr = run_command(cmd3)
    score = parse_score(stdout, "Top20")
    self.assertEqual(status1, 0)
    self.assertEqual(status2, 0)
    self.assertAlmostEqual(score, 0.8224, places=4)
def test_msmarco_passage_ance_rocchio_prf_otf(self):
    output_file = 'test_run.dl2019.ance.rocchio-prf.otf.trec'
    self.temp_files.append(output_file)
    cmd1 = f'python -m pyserini.search.faiss --topics dl19-passage \
                    --index msmarco-passage-ance-bf \
                    --encoder castorini/ance-msmarco-passage \
                    --batch-size {self.batch_size} \
                    --threads {self.threads} \
                    --output {output_file} \
                    --prf-depth 5 \
                    --prf-method rocchio \
                    --rocchio-alpha {self.rocchio_alpha} \
                    --rocchio-beta {self.rocchio_beta}'
    cmd2 = f'python -m pyserini.eval.trec_eval -l 2 -m map dl19-passage {output_file}'
    print(cmd1)
    print(cmd2)
    status = os.system(cmd1)
    stdout, stderr = run_command(cmd2)
    score = parse_score(stdout, "map")
    self.assertEqual(status, 0)
    self.assertAlmostEqual(score, 0.4211, delta=0.0001)
def test_dpr_curated_test_bf_otf(self):
    output_file = 'test_run.dpr.curated-test.multi.bf.otf.trec'
    retrieval_file = 'test_run.dpr.curated-test.multi.bf.otf.json'
    self.temp_files.extend([output_file, retrieval_file])
    cmd1 = f'python -m pyserini.dsearch --topics dpr-curated-test \
                    --index wikipedia-dpr-multi-bf \
                    --encoder facebook/dpr-question_encoder-multiset-base \
                    --output {output_file} \
                    --batch-size {self.batch_size} --threads {self.threads}'
    cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-curated-test \
                    --index wikipedia-dpr \
                    --input {output_file} \
                    --output {retrieval_file}'
    cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20 --regex'
    status1 = os.system(cmd1)
    status2 = os.system(cmd2)
    stdout, stderr = run_command(cmd3)
    score = parse_score(stdout, "Top20")
    self.assertEqual(status1, 0)
    self.assertEqual(status2, 0)
    self.assertAlmostEqual(score, 0.8876, places=4)
def run(self, runtag: str, pyserini_extras: str, actualscore: float):
    print('-------------------------')
    print(f'Running {runtag}:')
    print('-------------------------')

    # Run retrieval; bail out if the command fails.
    pyserini_output = f'verify.pyserini.{runtag}.txt'
    pyserini_cmd = f'{self.pyserini_base_cmd} --index {self.index_path} ' \
                   + f'--topics {self.pyserini_topics} --output {pyserini_output} {pyserini_extras}'
    status = os.system(pyserini_cmd)
    if status != 0:
        return False

    # Evaluate the run and compare the parsed MAP against the reference score.
    eval_cmd = f'{self.eval_base_cmd} {self.qrels} {pyserini_output}'
    status = os.system(eval_cmd)
    if status != 0:
        return False

    stdout, stderr = run_command(eval_cmd)
    score = parse_score(stdout, "map")
    return score == actualscore
def test_dpr_curated_test_bf_bm25_hybrid(self):
    output_file = 'test_run.dpr.curated-test.multi.bf.bm25.trec'
    retrieval_file = 'test_run.dpr.curated-test.multi.bf.bm25.json'
    self.temp_files.extend([output_file, retrieval_file])
    cmd1 = f'python -m pyserini.hsearch dense --index wikipedia-dpr-multi-bf \
                    sparse --index wikipedia-dpr \
                    fusion --alpha 1.05 \
                    run --topics dpr-curated-test \
                        --batch-size {self.batch_size} --threads {self.threads} \
                        --output {output_file}'
    cmd2 = f'python scripts/dpr/convert_trec_run_to_retrieval_json.py --topics dpr-curated-test \
                    --index wikipedia-dpr \
                    --input {output_file} \
                    --output {retrieval_file}'
    cmd3 = f'python tools/scripts/dpr/evaluate_retrieval.py --retrieval {retrieval_file} --topk 20 --regex'
    status1 = os.system(cmd1)
    status2 = os.system(cmd2)
    stdout, stderr = run_command(cmd3)
    score = parse_score(stdout, "Top20")
    self.assertEqual(status1, 0)
    self.assertEqual(status2, 0)
    self.assertAlmostEqual(score, 0.9006, places=4)