def test_qe(self):
    """The QueryExpansion pipeline should match Terrier's built-in qe control on MAP."""
    if not pt.check_version("5.3"):
        self.skipTest("Requires Terrier 5.3")
    dataset = pt.datasets.get_dataset("vaswani")
    index_ref = dataset.get_index()
    expander = pt.rewrite.QueryExpansion(index_ref)
    retriever = pt.BatchRetrieve(index_ref)

    # expand a single one-term query and inspect the rewritten query string
    topics_in = pd.DataFrame([["1", "compact"]], columns=["qid", "query"])
    first_pass = retriever.transform(topics_in)
    expanded = expander.transform(first_pass)
    self.assertEqual(1, len(expanded))
    expanded_query = expanded.iloc[0]["query"]
    self.assertTrue("compact^1.82230972" in expanded_query)
    self.assertTrue("applypipeline:off " in expanded_query)

    pipeline = retriever >> expander >> retriever
    # lets go faster, we only need 18 topics. qid 16 had a tricky case
    topics = dataset.get_topics().head(18)
    pipeline_res = pipeline.transform(topics)
    map_pipeline = pt.Utils.evaluate(
        pipeline_res, dataset.get_qrels(), metrics=["map"])["map"]

    # Terrier's native query expansion, for comparison
    builtin_qe = pt.BatchRetrieve(index_ref, controls={"qe": "on"})
    map_builtin = pt.Utils.evaluate(
        builtin_qe.transform(topics), dataset.get_qrels(), metrics=["map"])["map"]
    self.assertAlmostEqual(map_builtin, map_pipeline, places=4)
def test_fbr_reranking(self):
    """FeaturesBatchRetrieve reranking (ScoringMatchingWithFat) should yield the
    same feature matrix as the equivalent ** (feature-union) pipeline."""
    if not pt.check_version("5.3"):
        self.skipTest("Requires Terrier 5.3")
    # this test examines the use of ScoringMatchingWithFat
    JIR = pt.autoclass('org.terrier.querying.IndexRef')
    indexref = JIR.of(self.here + "/fixtures/index/data.properties")
    # we only want a candidate set of 2 documents
    firstpass = pt.BatchRetrieve(indexref, wmodel="BM25") % 2
    pipe = firstpass >> pt.FeaturesBatchRetrieve(
        indexref, features=["WMODEL:DPH", "WMODEL:PL2"])

    # renamed from `input` to avoid shadowing the builtin
    topics = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
    result = pipe.transform(topics)
    self.assertTrue("qid" in result.columns)
    self.assertTrue("docno" in result.columns)
    self.assertTrue("score" in result.columns)
    self.assertTrue("features" in result.columns)
    self.assertEqual(2, len(result))
    self.assertEqual(result.iloc[0]["features"].size, 2)

    # the same two features expressed as a feature-union pipeline
    pipe_simple = firstpass >> (pt.BatchRetrieve(indexref, wmodel="DPH")
                                ** pt.BatchRetrieve(indexref, wmodel="PL2"))
    # BUG FIX: previously this called pipe.transform(...) again, so the
    # assertion below compared the FeaturesBatchRetrieve pipeline with itself
    # and pipe_simple was never exercised
    result2 = pipe_simple.transform(topics)
    import numpy as np
    f1 = np.stack(result["features"].values)
    f2 = np.stack(result2["features"].values)
    self.assertTrue(np.array_equal(f1, f2))
def test_threading_selfupgrade(self):
    """A plain indexref should be upgradable to concurrent use when threads > 1."""
    if not pt.check_version("5.5"):
        self.skipTest("Requires Terrier 5.5")
    topics = pt.get_dataset("vaswani").get_topics().head(10)
    # this test ensures we can upgrade the indexref to be concurrent
    index_ref_cls = pt.autoclass('org.terrier.querying.IndexRef')
    index_ref = index_ref_cls.of(self.here + "/fixtures/index/data.properties")
    threaded = pt.BatchRetrieve(index_ref, threads=5)
    result = threaded.transform(topics)
def test_2_docs_title_body_meta(self):
    """Index two sample documents, capturing title and body text into the metaindex."""
    sample_dir = os.path.join(self.here, "fixtures", "sample_docs")
    files = pt.io.find_files(sample_dir)
    meta_spec = {"docno": 20, "filename": 512, "title": 20, "body": 40}
    tag_spec = {"title": "title", "body": "ELSE"}
    indexer = pt.FilesIndexer(self.test_dir, meta=meta_spec, meta_tags=tag_spec)
    index = pt.IndexFactory.of(indexer.index(files))

    # check index size
    self.assertEqual(2, index.getCollectionStatistics().getNumberOfDocuments())
    print(index.getMetaIndex().getAllItems(0))
    print(index.getMetaIndex().getAllItems(1))

    # determine file locations as docids
    html_file = os.path.join(sample_dir, "a.html")
    html_pos = files.index(html_file)
    txt_file = os.path.join(sample_dir, "b.txt")
    txt_pos = files.index(txt_file)
    self.assertTrue(html_pos < len(files))

    # test filename -> docid lookup
    self.assertEqual(
        html_pos, index.getMetaIndex().getDocument("filename", html_file))
    # test docid -> filename lookup
    self.assertEqual(
        html_file, index.getMetaIndex().getItem("filename", html_pos))
    # test title has been recorded in metaindex
    self.assertEqual(
        "test title", index.getMetaIndex().getItem("title", html_pos))

    from pyterrier import check_version
    if not check_version("5.5"):
        return
    # test bodies have been recorded in metaindex
    self.assertEqual(
        "test body", index.getMetaIndex().getItem("body", html_pos))
    self.assertEqual(
        "empty text document", index.getMetaIndex().getItem("body", txt_pos))
def test_num_results(self):
    """num_results should cap the result set; values above 1000 need Terrier 5.5."""
    JIR = pt.autoclass('org.terrier.querying.IndexRef')
    index_ref = JIR.of(self.here + "/fixtures/index/data.properties")

    capped = pt.BatchRetrieve(index_ref, num_results=10)
    queries = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
    self.assertEqual(10, len(capped.transform(queries)))

    # asking for more than 1000 results requires Terrier 5.5
    if not pt.check_version("5.5"):
        return
    large = pt.BatchRetrieve(index_ref, num_results=1001)
    self.assertEqual(1001, len(large.search("results")))
def test_fbr_reranking2(self):
    """Reranking a BM25 candidate set with a PL2 FeaturesBatchRetrieve must
    leave the first-pass scores unchanged and reproduce standalone PL2 scores."""
    if not pt.check_version("5.4"):
        self.skipTest("Requires Terrier 5.4")
    # this test examines the use of ScoringMatchingWithFat, using a particular
    # case known to fail with Terrier 5.3
    # CONSISTENCY FIX: use pt.autoclass, as every other test in this file does,
    # rather than pt.Class
    JIR = pt.autoclass('org.terrier.querying.IndexRef')
    indexref = JIR.of(self.here + "/fixtures/index/data.properties")
    # we only want a candidate set of 3 documents
    firstpass = pt.BatchRetrieve(indexref, wmodel="BM25") % 3
    pipe1 = firstpass >> pt.FeaturesBatchRetrieve(indexref, features=["WMODEL:PL2"])
    pipe2 = firstpass >> pt.BatchRetrieve(indexref, wmodel="PL2")

    topics = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
    result0 = firstpass.transform(topics)
    result1 = pipe1.transform(topics)
    result2 = pipe2.transform(topics)

    # unpack the single PL2 feature into its own column
    result1["feature0"] = result1.apply(lambda row: row["features"][0], axis=1)
    # BM25 score
    result0_map = {row.docno: row.score for row in result0.itertuples()}
    result1S_map = {row.docno: row.score for row in result1.itertuples()}
    # PL2 score
    result1F_map = {row.docno: row.feature0 for row in result1.itertuples()}
    result2_map = {row.docno: row.score for row in result2.itertuples()}
    print(result1F_map)
    print(result2_map)

    # check features scores
    # NB: places can go no less than 4, as two documents have similar PL2 scores
    for rank, row in enumerate(result0.itertuples()):
        docno = row.docno
        # check that score is unchanged
        self.assertAlmostEqual(
            result1S_map[docno],
            result0_map[docno],
            msg="input score mismatch at rank %d for docno %s" % (rank, docno),
            places=4)
        # check that feature score is correct
        self.assertAlmostEqual(
            result1F_map[docno],
            result2_map[docno],
            msg="feature score mismatch at rank %d for docno %s" % (rank, docno),
            places=4)
def test_candidate_set_two_doc(self):
    """Retrieval restricted to a two-document candidate set (second row has no query)."""
    if not pt.check_version("5.3"):
        self.skipTest("Requires Terrier 5.3")
    index_loc = self.here + "/fixtures/index/data.properties"
    # docid 50 == docno 51
    # docid 66 == docno 67
    candidates = pd.DataFrame(
        [["q1", "light", 50], ["q1", None, 66]],
        columns=["qid", "query", "docid"])
    res = pt.BatchRetrieve(index_loc).transform(candidates)
    self.assertTrue("qid" in res.columns)
    self.assertTrue("docno" in res.columns)
    self.assertTrue("score" in res.columns)
    self.assertEqual(2, len(res))
def test_qe(self):
    """The three default-configured expansion transformers (all Bo1) should
    expand queries identically and match Terrier's built-in qe control on MAP."""
    if not pt.check_version("5.3"):
        self.skipTest("Requires Terrier 5.3")
    dataset = pt.datasets.get_dataset("vaswani")
    indexref = dataset.get_index()
    index = pt.IndexFactory.of(indexref)
    # given their defaults, these three expressions are identical, all use Bo1
    qe1 = pt.rewrite.QueryExpansion(index)
    qe2 = pt.rewrite.DFRQueryExpansion(index)
    qe3 = pt.rewrite.Bo1QueryExpansion(index)

    # lets go faster, we only need 18 topics. qid 16 had a tricky case
    t = dataset.get_topics().head(18)
    qrels = dataset.get_qrels()

    # PERF: hoist loop-invariant setup out of the loop — previously the
    # retriever, the single-query first pass, and the built-in-qe MAP baseline
    # were all rebuilt/recomputed on every iteration
    br = pt.BatchRetrieve(index)
    queriesIn = pd.DataFrame([["1", "compact"]], columns=["qid", "query"])
    res = br.transform(queriesIn)
    br_qe = pt.BatchRetrieve(indexref, controls={"qe": "on"})
    map_qe = pt.Utils.evaluate(br_qe.transform(t), qrels, metrics=["map"])["map"]

    for qe in [qe1, qe2, qe3]:
        queriesOut = qe.transform(res)
        self.assertEqual(len(queriesOut), 1)
        self.assertTrue("query_0" in queriesOut.columns)
        self.assertEqual(queriesOut.iloc[0]["query_0"], "compact")
        query = queriesOut.iloc[0]["query"]
        self.assertTrue("compact^1.82230972" in query)
        self.assertTrue("applypipeline:off " in query)

        pipe = br >> qe >> br
        # check the pipe doesnt cause an error
        str(pipe)
        all_qe_res = pipe.transform(t)
        map_pipe = pt.Utils.evaluate(all_qe_res, qrels, metrics=["map"])["map"]
        self.assertAlmostEqual(map_qe, map_pipe, places=4)
def test_threading_manualref(self):
    """Threaded retrieval against an explicitly concurrent indexref, including
    a Python callback weighting model."""
    if not pt.check_version("5.5"):
        self.skipTest("Requires Terrier 5.5")
    topics = pt.get_dataset("vaswani").get_topics().head(10)
    # this test ensures that we operate when the indexref is specified to be concurrent
    index_ref_cls = pt.autoclass('org.terrier.querying.IndexRef')
    concurrent_ref = index_ref_cls.of(
        "concurrent:" + self.here + "/fixtures/index/data.properties")
    result = pt.BatchRetrieve(concurrent_ref, threads=5).transform(topics)

    # check that use of a callback model works under threading
    def tf_model(keyFreq, posting, entryStats, collStats):
        return posting.getFrequency()

    result = pt.BatchRetrieve(
        concurrent_ref, threads=5, wmodel=tf_model).transform(topics)
def test_candidate_set_one_doc(self):
    """Retrieval restricted to a single-document candidate set, exercised via
    both transform() and the __call__ shorthand."""
    if not pt.check_version("5.3"):
        self.skipTest("Requires Terrier 5.3")
    indexloc = self.here + "/fixtures/index/data.properties"
    # docid 50 == docno 51
    input_set = pd.DataFrame([["q1", "light", 50]],
                             columns=["qid", "query", "docid"])
    retr = pt.BatchRetrieve(indexloc)
    # this tests the implementation of __call__() redirecting to transform()
    # BUG FIX: the loop body previously re-assigned
    #   result = retr.transform(input_set)
    # which overwrote the loop variable, so the retr(input_set) result was
    # never actually asserted
    for result in [retr.transform(input_set), retr(input_set)]:
        self.assertTrue("qid" in result.columns)
        self.assertTrue("docno" in result.columns)
        self.assertTrue("score" in result.columns)
        self.assertEqual(1, len(result))
        row = result.iloc[0]
        self.assertEqual("q1", row["qid"])
        self.assertEqual("51", row["docno"])
        self.assertTrue(row["score"] > 0)
def test_sdm(self):
    """Sequential-dependence rewriting (non-indexed variant); needs Terrier 5.3."""
    if pt.check_version("5.3"):
        self._sdm(False)
    else:
        self.skipTest("Requires Terrier 5.3")