示例#1
0
    def test_cache_compose_cache(self):
        """Exercise a ``~``-wrapped pipeline that itself contains a ``~``-wrapped stage.

        The same query is sent twice through one wrapper and once through a
        second, identically constructed wrapper; ``stats()`` is checked after
        each step and ``pt.cache.list_cache()`` is inspected afterwards.
        """
        pt.cache.CACHE_DIR = self.test_dir
        # Reset CACHE_DIR in a ``finally`` so that a failing assertion does not
        # leave this test's directory configured for the tests that run next.
        try:
            import pandas as pd
            queries = pd.DataFrame([["q1", "chemical"]], columns=["qid", "query"])
            br1 = pt.BatchRetrieve(pt.get_dataset("vaswani").get_index(), wmodel="TF_IDF")
            br2 = pt.BatchRetrieve(pt.get_dataset("vaswani").get_index(), wmodel="BM25")
            cache = ~ (~br1 >> br2)
            self.assertEqual(0, len(cache.chest._keys))
            cache(queries)
            cache(queries)
            self.assertEqual(0.5, cache.stats())

            # lets see if another cache of the same object would see the same cache entries.
            cache2 = ~(~br1 >> br2)
            cache2(queries)
            self.assertEqual(1, cache2.stats())

            # check that the cache report works
            all_report = pt.cache.list_cache()
            self.assertGreater(len(all_report), 0)
            report = list(all_report.values())[0]
            self.assertEqual(1, report["queries"])
            for field in ("transformer", "size", "lastmodified"):
                self.assertIn(field, report)
        finally:
            pt.cache.CACHE_DIR = None
示例#2
0
 def test_mrt(self):
     """Smoke-test ``pt.Experiment`` with the "mrt" metric under several option sets.

     No result is inspected; the test passes when none of the calls raises.
     """
     dph = pt.BatchRetrieve(pt.datasets.get_dataset("vaswani").get_index(),
                            wmodel="DPH")
     bm25 = pt.BatchRetrieve(pt.datasets.get_dataset("vaswani").get_index(),
                             wmodel="BM25")
     systems = [dph, bm25]
     first_topics = pt.datasets.get_dataset("vaswani").get_topics().head(10)
     judgments = pt.datasets.get_dataset("vaswani").get_qrels()

     # Extra keyword arguments for each Experiment call, in execution order.
     option_sets = (
         {"baseline": 0, "highlight": "color"},
         {"highlight": "color"},
         {},
         {"baseline": 0, "highlight": "color"},
     )
     for options in option_sets:
         pt.Experiment(systems,
                       first_topics,
                       judgments,
                       eval_metrics=["map", "mrt"],
                       **options)
示例#3
0
    def test_fbr_reranking(self):
        """Compare FeaturesBatchRetrieve used as a re-ranker with a ``**`` pipeline.

        A BM25 first pass cut to two documents is re-ranked once by
        ``pt.FeaturesBatchRetrieve`` with DPH and PL2 features, and once by the
        feature union of two ``pt.BatchRetrieve`` instances; the resulting
        feature arrays are expected to agree.
        """
        if not pt.check_version("5.3"):
            self.skipTest("Requires Terrier 5.3")
        import numpy as np
        # this test examines the use of ScoringMatchingWithFat
        JIR = pt.autoclass('org.terrier.querying.IndexRef')
        indexref = JIR.of(self.here + "/fixtures/index/data.properties")
        # we only want a candidate set of 2 documents
        firstpass = pt.BatchRetrieve(indexref, wmodel="BM25") % 2
        pipe = firstpass >> pt.FeaturesBatchRetrieve(
            indexref, features=["WMODEL:DPH", "WMODEL:PL2"])
        queries = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
        result = pipe.transform(queries)
        for column in ("qid", "docno", "score", "features"):
            self.assertIn(column, result.columns)
        self.assertEqual(2, len(result))
        self.assertEqual(result.iloc[0]["features"].size, 2)

        pipe_simple = firstpass >> (pt.BatchRetrieve(indexref, wmodel="DPH")**
                                    pt.BatchRetrieve(indexref, wmodel="PL2"))
        # Run the *second* pipeline here: transforming `pipe` again would
        # compare the first result with itself and could never fail.
        result2 = pipe_simple.transform(queries)
        # Only one qid is involved, so ordering by docno lines the rows up
        # regardless of how each pipeline orders its output.
        f1 = np.stack(result.sort_values("docno")["features"].values)
        f2 = np.stack(result2.sort_values("docno")["features"].values)
        # NOTE(review): the two code paths may differ in the last float digits,
        # hence a tolerance rather than exact equality - confirm the tolerance.
        self.assertTrue(np.allclose(f1, f2, rtol=0, atol=1e-4))
        def _check_index(index_small):
            """Check `index_small` against expectations held in the enclosing scope.

            Relies on `has_direct1`, `contents_big`, `index_big`, `q`, `np` and
            `self` from the surrounding scope; their definitions are not
            visible in this part of the file.
            """
            if has_direct1:
                contents1 = TestBackground.get_contents(0, index_small)
                self.assertEqual(contents1, contents_big)

            inv1 = index_small.getInvertedIndex()
            print(inv1.getClass().getName())
            lex1 = index_small.getLexicon()
            # For every term in contents_big, walk its posting list in index_small.
            for t in contents_big:
                pointer = lex1[t]
                print(pointer.toString())
                p = inv1.getPostings(pointer)
                print(p.getClass().getName())
                # The first next() must return 0 (presumably the docid of the
                # only document in index_small - TODO confirm).
                rtr = p.next()
                self.assertEqual(0, rtr)
                self.assertEqual(
                    p.getDocumentLength(),
                    index_big.getDocumentIndex().getDocumentLength(1047))
                self.assertEqual(contents_big[t], p.getFrequency())
                # A second next() must hit EOL, i.e. the list holds one posting.
                self.assertEqual(p.next(), p.EOL)

            from jnius import JavaException
            try:
                br1 = pt.BatchRetrieve(index_small, wmodel="Tf")
                brall = pt.BatchRetrieve(index_big, wmodel="Tf")
                # Input for brall that already names a document (docno "1048",
                # docid 1047) alongside the query.
                with_doc = pd.DataFrame(
                    [["q1", q, "1048", 1047]],
                    columns=["qid", "query", "docno", "docid"])
                rtr1 = br1.transform(q)
            except JavaException as ja:
                # Print the Java-side stack trace before re-raising.
                print(ja.stacktrace)
                raise ja
            rtrall = brall(with_doc)
            # Scores from the small index must equal those from the big index.
            self.assertTrue(
                np.array_equal(rtr1["score"].values, rtrall["score"].values))
示例#5
0
    def test_save_docs_CE(self):
        """Check ``stash_results`` / ``reset_results`` around a Bo1 expansion step.

        The first part compares result counts of plain DPH retrieval with a
        pipeline that stashes the DPH results, expands the query via BM25 + Bo1,
        restores the stashed results and re-runs DPH. The second part checks
        that the ``stashed_results_0`` column is present in intermediate output.
        """
        index = pt.get_dataset("vaswani").get_index()
        dph = pt.BatchRetrieve(index, wmodel="DPH")
        pipe = dph \
            >> pt.rewrite.stash_results() \
            >> pt.BatchRetrieve(index, wmodel="BM25") \
            >> pt.rewrite.Bo1QueryExpansion(index) \
            >> pt.rewrite.reset_results() \
            >> dph
        rtr1 = dph.search("chemical reactions")
        rtr2 = pipe.search("chemical reactions")
        # Bo1 should be applied as a re-ranker, hence the
        # number of docs in rtr1 and rtr2 should be equal
        self.assertEqual(len(rtr1), len(rtr2))

        # check columns are passed through where we expect
        pipeP3 = dph \
            >> pt.rewrite.stash_results() \
            >> pt.BatchRetrieve(index, wmodel="BM25")
        res3 = pipeP3.search("chemical reactions")
        self.assertIn("stashed_results_0", res3.columns)
        pipeP4 = dph \
            >> pt.rewrite.stash_results() \
            >> pt.BatchRetrieve(index, wmodel="BM25") \
            >> pt.rewrite.Bo1QueryExpansion(index)
        # Search with pipeP4 itself; searching pipeP3 again would only repeat
        # the previous check and leave the Bo1 stage untested.
        res4 = pipeP4.search("chemical reactions")
        self.assertIn("stashed_results_0", res4.columns)
示例#6
0
    def test_qe(self):
        """Check ``pt.rewrite.QueryExpansion`` alone and inside a retrieve-expand-retrieve pipeline.

        The MAP of the explicit pipeline is compared (to 4 places) with a
        ``pt.BatchRetrieve`` that has the ``qe`` control switched on.
        """
        if not pt.check_version("5.3"):
            self.skipTest("Requires Terrier 5.3")
        vaswani = pt.datasets.get_dataset("vaswani")
        index_location = vaswani.get_index()

        expander = pt.rewrite.QueryExpansion(index_location)
        retriever = pt.BatchRetrieve(index_location)

        single_query = pd.DataFrame([["1", "compact"]], columns=["qid", "query"])
        first_pass = retriever.transform(single_query)

        expanded = expander.transform(first_pass)
        self.assertEqual(len(expanded), 1)
        rewritten = expanded.iloc[0]["query"]
        for fragment in ("compact^1.82230972", "applypipeline:off "):
            self.assertTrue(fragment in rewritten)

        pipeline = retriever >> expander >> retriever

        # lets go faster, we only need 18 topics. qid 16 had a tricky case
        few_topics = vaswani.get_topics().head(18)

        pipeline_run = pipeline.transform(few_topics)
        map_pipe = pt.Utils.evaluate(
            pipeline_run, vaswani.get_qrels(), metrics=["map"])["map"]

        native_qe = pt.BatchRetrieve(index_location, controls={"qe":"on"})
        map_qe = pt.Utils.evaluate(
            native_qe.transform(few_topics), vaswani.get_qrels(), metrics=["map"])["map"]

        self.assertAlmostEqual(map_qe, map_pipe, places=4)
示例#7
0
    def _sdm(self, freq):
        """Compare ``pt.rewrite.SDM`` in a pipeline with a BatchRetrieve configured with ``sd`` on.

        Indexes the vaswani corpus with ``blocks=True`` into ``self.test_dir``,
        checks the rewritten queries, then compares per-row results and MAP of
        the two approaches.

        When `freq` is truthy, SDM is built with the ``Tf`` proximity model and
        both retrievers have their ``wmodel`` control set to ``"Tf"``.
        """
        dataset = pt.datasets.get_dataset("vaswani")
        indexer = pt.TRECCollectionIndexer(self.test_dir, blocks=True)
        indexref = indexer.index(dataset.get_corpus())

        if freq:
            sdm = pt.rewrite.SDM(prox_model="org.terrier.matching.models.Tf")
        else:
            sdm = pt.rewrite.SDM()

        queriesIn = pd.DataFrame([["1", "compact"], ["2", "compact memories"]], columns=["qid", "query"])
        queriesOut = sdm.transform(queriesIn)
        self.assertEqual(len(queriesOut), 2)
        # the single-term query must come back unchanged
        self.assertEqual(queriesOut.iloc[0]["query"], "compact")
        # the two-term query must contain these operators after rewriting
        query2 = queriesOut.iloc[1]["query"]
        self.assertTrue("#1" in query2)
        self.assertTrue("#uw8" in query2)
        self.assertTrue("#combine" in query2)

        br_normal = pt.BatchRetrieve(indexref)
        pipe = sdm >> br_normal
        # NB: the control is changed after br_normal was composed into pipe
        if freq:
            br_normal.controls["wmodel"] = "Tf"
        resTest_pipe = pipe.transform(queriesIn)


        # this BR should do the same thing as the pipe, but natively in Terrier
        br_sdm = pt.BatchRetrieve(indexref,
            controls = {"sd" :"on"},
            properties={"querying.processes" : "terrierql:TerrierQLParser,parsecontrols:TerrierQLToControls,"\
                    +"parseql:TerrierQLToMatchingQueryTerms,matchopql:MatchingOpQLParser,applypipeline:ApplyTermPipeline,"\
                    +"sd:DependenceModelPreProcess,localmatching:LocalManager$ApplyLocalMatching,qe:QueryExpansion,"\
                    +"labels:org.terrier.learning.LabelDecorator,filters:LocalManager$PostFilterProcess"})
        if freq:
            br_sdm.controls["wmodel"] = "Tf"
            br_sdm.controls["dependencemodel"] = "org.terrier.matching.models.Tf"

        resTest_native = br_sdm.transform(queriesIn)

        #print (resTest_pipe[resTest_pipe["qid"]=="2"])
        #print (resTest_native[resTest_native["qid"]=="2"])
        # NOTE(review): `index` is an index label from iterrows() but is used
        # positionally with iloc below - this assumes resTest_pipe carries a
        # default 0..n-1 index; confirm.
        for index, row in resTest_pipe.iterrows():
            #print(index)
            #print(row["query"])
            #print(row)
            #print(resTest_native.iloc[index]) 
            self.assertEqual(row['qid'], resTest_native.iloc[index]["qid"])
            self.assertEqual(row['docno'], resTest_native.iloc[index]["docno"])
            # TODO I cannot get this test to pass with freq=False more precisely than 1dp
            #9.165638 in resTest_pipe vs 9.200683 in resTest_native
            self.assertAlmostEqual(row['score'], resTest_native.iloc[index]["score"], 1)

        # finally compare MAP of both approaches on the first five topics
        t = dataset.get_topics().head(5)
        pipe_res = pipe.transform(t)
        #br_normal.saveResult(pipe_res, "/tmp/sdm.res", run_name="DPH")

        self.assertAlmostEqual(
            pt.Utils.evaluate(pipe_res, dataset.get_qrels(), metrics=["map"])["map"], 
            pt.Utils.evaluate(br_sdm.transform(t), dataset.get_qrels(), metrics=["map"])["map"], 
            places=4)
示例#8
0
 def test_wrong(self):
     """Expect ``pt.Experiment`` to raise TypeError when given ``filter_qrels=True``."""
     systems = [
         pt.BatchRetrieve(pt.datasets.get_dataset("vaswani").get_index(), wmodel=model)
         for model in ("DPH", "BM25")
     ]
     few_topics = pt.datasets.get_dataset("vaswani").get_topics().head(10)
     judgments = pt.datasets.get_dataset("vaswani").get_qrels()
     with self.assertRaises(TypeError):
         pt.Experiment(
             systems,
             few_topics,
             judgments,
             eval_metrics=["map"],
             filter_qrels=True)
示例#9
0
 def test_baseline(self):
     """Run a two-system ``pt.Experiment`` with ``baseline=0`` and check the extra "map" columns."""
     vaswani = pt.get_dataset("vaswani")
     systems = [
         pt.BatchRetrieve(vaswani.get_index(), wmodel=model)
         for model in ("BM25", "DPH")
     ]
     comparison = pt.Experiment(
         systems,
         vaswani.get_topics().head(10),
         vaswani.get_qrels(),
         eval_metrics=["map", "ndcg"],
         baseline=0)
     # columns that must be present when a baseline is given
     for expected in ("map +", "map -", "map p-value"):
         self.assertTrue(expected in comparison.columns)
示例#10
0
 def test_parallel_joblib_ops(self):
     """Check that ``.parallel(3)`` yields as many rows as the sequential run for several pipelines."""
     vaswani = pt.get_dataset("vaswani")
     few_topics = vaswani.get_topics().head(3)
     default_br = pt.BatchRetrieve(vaswani.get_index())
     tf_br = pt.BatchRetrieve(vaswani.get_index(), wmodel="Tf")

     # single retriever, rank cutoff, composition, addition, and a query rewriter
     pipelines = [
         default_br,
         default_br % 10,
         default_br >> tf_br,
         default_br + tf_br,
         pt.apply.query(lambda row: row["query"] + " chemical") >> default_br,
     ]
     for pipeline in pipelines:
         sequential = pipeline(few_topics)
         parallel = pipeline.parallel(3)(few_topics)
         self.assertEqual(len(sequential), len(parallel))
示例#11
0
 def test_mrt(self):
     """Run ``pt.Experiment`` with the "mrt" metric and verify the caller's metric list keeps "mrt"."""
     index = pt.datasets.get_dataset("vaswani").get_index()
     systems = [
         pt.BatchRetrieve(index, wmodel=model)
         for model in ("DPH", "BM25")
     ]
     few_topics = pt.datasets.get_dataset("vaswani").get_topics().head(10)
     judgments = pt.datasets.get_dataset("vaswani").get_qrels()

     measures = ["map", "mrt"]
     pt.Experiment(systems, few_topics, judgments, eval_metrics=measures)
     # the list handed to Experiment must still contain "mrt" after the call
     self.assertTrue("mrt" in measures)

     # the remaining calls only need to complete without raising
     for options in ({"highlight": "color"},
                     {"baseline": 0, "highlight": "color"}):
         pt.Experiment(systems, few_topics, judgments,
                       eval_metrics=["map", "mrt"], **options)
示例#12
0
    def test_baseline_and_tests(self):
        """Run ``pt.Experiment`` with a baseline under three significance tests.

        Covers the default test, ``test='wilcoxon'`` and a user-supplied
        callable, each time checking that the "map" comparison columns exist.
        Pre-computed result frames are passed in place of transformers.
        """
        dataset = pt.get_dataset("vaswani")
        numt = 10
        res1 = pt.BatchRetrieve(dataset.get_index(), wmodel="BM25")(dataset.get_topics().head(numt))
        res2 = pt.BatchRetrieve(dataset.get_index(), wmodel="DPH")(dataset.get_topics().head(numt))

        def _run(**extra):
            # one Experiment call over the two result frames, baseline fixed at 0
            return pt.Experiment(
                [res1, res2],
                dataset.get_topics().head(numt),
                dataset.get_qrels(),
                eval_metrics=["map", "ndcg"],
                baseline=0,
                **extra)

        def _check_columns(frame):
            for expected in ("map +", "map -", "map p-value"):
                self.assertTrue(expected in frame.columns)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # t-test
            _check_columns(_run())
            # wilcoxon signed-rank test
            _check_columns(_run(test='wilcoxon'))

        # user-specified TOST
        # TOST will emit warnings here, due to low numbers of topics
        import statsmodels.stats.weightstats

        def _tost(X, Y):
            return (0, statsmodels.stats.weightstats.ttost_ind(X, Y, -0.01, 0.01)[0])

        # This filter doesn't work
        with warnings.catch_warnings(record=True) as w:
            warnings.filterwarnings("always")
            df = _run(test=_tost)
            print(w)
        _check_columns(df)
示例#13
0
    def test_fbr_reranking2(self):
        """Check scores and a single PL2 feature produced by FeaturesBatchRetrieve as a re-ranker.

        For the top 3 BM25 documents, the score column must be left unchanged
        and the feature value must match the score of a plain PL2
        ``pt.BatchRetrieve`` re-ranking the same first pass (both to 4 places).
        """
        if not pt.check_version("5.4"):
            self.skipTest("Requires Terrier 5.4")
        # this test examines the use of ScoringMatchingWithFat, using a
        # particular case related to Terrier 5.3
        JIR = pt.Class('org.terrier.querying.IndexRef')
        indexref = JIR.of(self.here + "/fixtures/index/data.properties")
        # we only want a candidate set of 3 documents
        firstpass = pt.BatchRetrieve(indexref, wmodel="BM25") % 3
        fbr_pipe = firstpass >> pt.FeaturesBatchRetrieve(
            indexref, features=["WMODEL:PL2"])
        br_pipe = firstpass >> pt.BatchRetrieve(indexref, wmodel="PL2")

        queries = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
        bm25_res = firstpass.transform(queries)
        fbr_res = fbr_pipe.transform(queries)
        pl2_res = br_pipe.transform(queries)

        # expose the only feature as a scalar column
        fbr_res["feature0"] = fbr_res.apply(lambda row: row["features"][0],
                                            axis=1)

        def _by_docno(frame, field):
            # map docno -> value of `field` for every row of `frame`
            return {
                getattr(r, "docno"): getattr(r, field)
                for r in frame.itertuples()
            }

        # BM25 score
        bm25_scores = _by_docno(bm25_res, "score")
        fbr_scores = _by_docno(fbr_res, "score")
        # PL2 score
        fbr_features = _by_docno(fbr_res, "feature0")
        pl2_scores = _by_docno(pl2_res, "score")

        print(fbr_features)
        print(pl2_scores)

        # check features scores
        # NB: places can go no less than 4, as two documents have similar PL2 scores
        for rank, row in enumerate(bm25_res.itertuples()):
            docno = row.docno
            # check that score is unchanged
            self.assertAlmostEqual(
                fbr_scores[docno],
                bm25_scores[docno],
                msg="input score mismatch at rank %d for docno %s" %
                (rank, docno),
                places=4)
            # check that feature score is correct
            self.assertAlmostEqual(
                fbr_features[docno],
                pl2_scores[docno],
                msg="feature score mismatch at rank %d for docno %s" %
                (rank, docno),
                places=4)
示例#14
0
    def test_num_results(self):
        """Check that ``num_results`` controls how many rows ``pt.BatchRetrieve`` returns.

        The 1001-result case only runs when ``pt.check_version("5.5")`` is true.
        """
        JIR = pt.autoclass('org.terrier.querying.IndexRef')
        indexref = JIR.of(self.here + "/fixtures/index/data.properties")

        top_ten_br = pt.BatchRetrieve(indexref, num_results=10)
        queries = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
        self.assertEqual(len(top_ten_br.transform(queries)), 10)

        if pt.check_version("5.5"):
            deep_br = pt.BatchRetrieve(indexref, num_results=1001)
            self.assertEqual(len(deep_br.search("results")), 1001)
示例#15
0
 def test_compile_to_fbr(self):
     """Run three feature pipelines cut to 2 results, then a compiled form of the third.

     Nothing is asserted on the outputs; the test passes when every pipeline
     (and the ``compile()`` call) completes without raising.
     """
     indexref = pt.IndexRef.of(self.here + "/fixtures/index/data.properties")
     feature_models = ["WMODEL:DPH", "WMODEL:PL2"]
     firstpass = pt.BatchRetrieve(indexref, wmodel="BM25")
     pipe_f_fbr = firstpass >> pt.FeaturesBatchRetrieve(indexref, features=feature_models)
     pipe_fbr = pt.FeaturesBatchRetrieve(indexref, wmodel="BM25", features=feature_models)
     pipe_raw = firstpass >> (
         pt.BatchRetrieve(indexref, wmodel="DPH") ** pt.BatchRetrieve(indexref, wmodel="PL2"))
     queries = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])

     # we only want a candidate set of 2 documents, hence the % 2 on each pipeline
     for pipeline in (pipe_f_fbr, pipe_fbr, pipe_raw):
         (pipeline % 2)(queries)

     compiled = (pipe_raw % 2).compile()
     print(repr(compiled))
     compiled(queries)
示例#16
0
 def test_baseline_corrected(self):
     """Run a baseline ``pt.Experiment`` under several multiple-testing corrections.

     For each correction name, the result must contain the usual "map"
     comparison columns plus the corrected p-value and reject columns.
     """
     dataset = pt.get_dataset("vaswani")
     for corr in ['hs', 'bonferroni', 'holm-sidak']:
         # subTest reports which correction failed instead of stopping at the first.
         with self.subTest(correction=corr):
             df = pt.Experiment(
                 [pt.BatchRetrieve(dataset.get_index(), wmodel="BM25"), pt.BatchRetrieve(dataset.get_index(), wmodel="DPH")],
                 dataset.get_topics().head(10),
                 dataset.get_qrels(),
                 eval_metrics=["map", "ndcg"],
                 # pass the loop variable: a hard-coded 'hs' here would leave
                 # the other corrections untested
                 baseline=0, correction=corr)
             for column in ("map +", "map -", "map p-value",
                            "map p-value corrected", "map reject"):
                 self.assertIn(column, df.columns)
示例#17
0
    def test_various_metrics(self):
        """Check which measure names ``pt.Experiment`` reports for various requested metrics.

        Each requested metric is run three ways (plain, with ``baseline=0`` and
        with ``perquery=True``); the expected measure name must appear and,
        where listed, an unrequested sibling measure must not.
        Pre-computed result frames are passed in place of transformers.
        """
        topics = pt.datasets.get_dataset("vaswani").get_topics().head(10)
        res = [
            pt.BatchRetrieve(pt.datasets.get_dataset("vaswani").get_index(),
                             wmodel="DPH")(topics),
            pt.BatchRetrieve(pt.datasets.get_dataset("vaswani").get_index(),
                             wmodel="BM25")(topics)
        ]

        qrels = pt.datasets.get_dataset("vaswani").get_qrels()
        # what we ask for -> what we should get as a metric
        family2measure = {
            'ndcg_cut_5': 'ndcg_cut_5',
            'P': "P@5",
            'P_5': "P_5",
            # The previous value "[email protected]" was an e-mail-obfuscation
            # artefact and can never be a measure name.
            # NOTE(review): restored as "IPrec@0.1" - confirm against the
            # measure names reported for the iprec_at_recall family.
            "iprec_at_recall": "IPrec@0.1",
            "official": "AP",
            "set": "SetP",
            "recall": "R@5",
            "recall_1000": "recall_1000"
        }
        # what we ask for -> what we should NOT get
        family2black = {
            'ndcg_cut_5': 'ndcg_cut_10',
            'P_5': "P_100",
            "recall_1000": "recall_5"
        }
        for m in family2measure:
            df1 = pt.Experiment(res, topics, qrels, eval_metrics=[m])
            df2 = pt.Experiment(res,
                                topics,
                                qrels,
                                eval_metrics=[m],
                                baseline=0)
            df3 = pt.Experiment(res,
                                topics,
                                qrels,
                                eval_metrics=[m],
                                perquery=True)
            self.assertIn(family2measure[m], df1.columns)
            self.assertIn(family2measure[m], df2.columns)
            self.assertTrue(len(df3[df3["measure"] == family2measure[m]]) > 0)

            # check that we dont get back measures that we did NOT ask for
            if m in family2black:
                self.assertNotIn(family2black[m], df1.columns)
                self.assertNotIn(family2black[m], df2.columns)
                self.assertTrue(
                    len(df3[df3["measure"] == family2black[m]]) == 0)
示例#18
0
 def test_save(self):
     """Run the same ``pt.Experiment`` twice with ``save_dir`` and compare the reported "mrt".

     After the first run the two ``.res.gz`` files must exist in
     ``self.test_dir``; the second run must report a smaller "mrt" for the
     first system.
     """
     index = pt.datasets.get_dataset("vaswani").get_index()
     systems = [
         pt.BatchRetrieve(index, wmodel=model)
         for model in ("DPH", "BM25")
     ]
     few_topics = pt.datasets.get_dataset("vaswani").get_topics().head(10)
     judgments = pt.datasets.get_dataset("vaswani").get_qrels()

     def _run():
         return pt.Experiment(systems, few_topics, judgments,
                              eval_metrics=["map", "mrt"],
                              save_dir=self.test_dir)

     first = _run()
     # check save_dir files are there
     for filename in ("BR(DPH).res.gz", "BR(BM25).res.gz"):
         self.assertTrue(os.path.exists(os.path.join(self.test_dir, filename)))
     second = _run()
     # a successful experiment using save_dir should be faster
     self.assertTrue(second.iloc[0]["mrt"] < first.iloc[0]["mrt"])
示例#19
0
 def test_num_python_wmodel(self):
     """Run ``pt.BatchRetrieve`` with a Python callable as ``wmodel``; passes if nothing raises."""
     index_path = self.here + "/fixtures/index/data.properties"
     # weighting model returning the posting's frequency
     frequency_model = lambda keyFreq, posting, entryStats, collStats: posting.getFrequency()
     retriever = pt.BatchRetrieve(index_path, wmodel=frequency_model)
     queries = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
     retriever.transform(queries)
示例#20
0
    def test_two_term_query_correct_qid_docid_score(self):
        """Compare retrieval output for two queries with the ``two_queries_result`` fixture.

        The check is done twice: once with string qids and once with integer
        qids (whose output qid is compared after ``str()``).
        """
        JIR = pt.autoclass('org.terrier.querying.IndexRef')
        indexref = JIR.of(self.here + "/fixtures/index/data.properties")
        retriever = pt.BatchRetrieve(indexref)

        def _check(first_qid, second_qid, qid_as_text):
            topics = pd.DataFrame(
                [[first_qid, "Stability"], [second_qid, "Generator"]],
                columns=['qid', 'query'])
            ranked = retriever.transform(topics)
            expected = parse_res_file(
                os.path.dirname(os.path.realpath(__file__)) +
                "/fixtures/two_queries_result")
            for position, row in ranked.iterrows():
                observed_qid = str(row['qid']) if qid_as_text else row['qid']
                self.assertEqual(observed_qid, expected[position][0])
                self.assertEqual(row['docno'], expected[position][1])
                self.assertAlmostEqual(row['score'], expected[position][2])

        # string qids are compared as-is
        _check("1", "2", qid_as_text=False)
        # integer qids are stringified before comparison with the fixture
        _check(1, 2, qid_as_text=True)
示例#21
0
    def rank_publications(self, query, page, rpp):
        """Return one page of docnos retrieved for `query` with TF_IDF.

        `page` and `rpp` select the slice ``[page * rpp, (page + 1) * rpp)`` of
        the retrieved docnos. When `query` is None, or when no index could be
        opened, the item list is empty.

        Returns a dict with keys 'page', 'rpp', 'query', 'itemlist' and
        'num_found' (the length of 'itemlist').
        """
        docnos = []
        index_ready = False

        if query is not None:
            # open the index on first use; a failure is reported and leaves
            # self.idx as None
            if self.idx is None:
                try:
                    self.idx = pt.IndexFactory.of('./index/data.properties')
                except Exception as e:
                    print('No index available: ', e)
            index_ready = self.idx is not None

        if index_ready:
            topics = pd.DataFrame.from_dict({'qid': [0], 'query': [query]})
            retriever = pt.BatchRetrieve(self.idx, controls={"wmodel": "TF_IDF"})
            retriever.setControl("wmodel", "TF_IDF")
            retriever.setControls({"wmodel": "TF_IDF"})
            ranked = retriever.transform(topics)
            first, last = page * rpp, (page + 1) * rpp
            docnos = list(ranked['docno'][first:last])

        return dict(
            page=page,
            rpp=rpp,
            query=query,
            itemlist=docnos,
            num_found=len(docnos),
        )
示例#22
0
    def test_vaswani(self):
        """Check the sizes of the vaswani topics, qrels and index, then run an Experiment on it."""
        import pyterrier as pt

        def _check_topics(ds):
            loaded = ds.get_topics()
            self.assertIsNotNone(loaded)
            self.assertEqual(len(loaded), 93)

        dataset = pt.datasets.get_dataset("vaswani")
        self.assertIsNotNone(dataset)
        _check_topics(dataset)

        judgments = dataset.get_qrels()
        self.assertIsNotNone(judgments)
        self.assertEqual(len(judgments), 2083)

        indexref = dataset.get_index()
        self.assertIsNotNone(indexref)
        with pt.IndexFactory.of(indexref) as index:
            self.assertIsNotNone(index)
            doc_count = index.getCollectionStatistics().getNumberOfDocuments()
            self.assertEqual(doc_count, 11429)

        # do it once again, to ensure it works locally
        dataset = pt.datasets.get_dataset("vaswani")
        _check_topics(dataset)

        # test the newer get_topicsqrels
        retriever = pt.BatchRetrieve(dataset.get_index())
        pt.Experiment([retriever], *dataset.get_topicsqrels(), ["map"])
    def __init__(self,
                 candidates,
                 num_candidates_samples,
                 path_index,
                 sample_data,
                 set_rm3=False,
                 seed=42):
        """Store the sampler settings, build the index and create a BM25 pipeline.

        `candidates` is turned into a DataFrame with a string ``docno`` column
        (taken from the row index) and a ``candidate`` column. `set_rm3` only
        selects the value of ``self.name`` here. `seed` seeds the global
        ``random`` module. The BM25 pipeline is cut to
        `num_candidates_samples` results.
        """
        random.seed(seed)

        self.candidates = candidates
        self.num_candidates_samples = num_candidates_samples
        self.path_index = path_index
        self.sample_data = sample_data
        self.name = "BM25RM3NS_pyterrier" if set_rm3 else "BM25NS_pyterrier"

        # one row per candidate; the former row index becomes the docno
        frame = pd.DataFrame(self.candidates, columns=["candidate"])
        frame = frame.reset_index()
        frame = frame.rename(columns={'index': 'docno'})
        self.candidates_df = frame
        self.candidates_df['docno'] = self.candidates_df['docno'].astype(str)

        # presumably sets self.indexref, which is read just below - TODO confirm
        self._create_index()

        retriever = pt.BatchRetrieve(self.indexref, wmodel="BM25")
        self.bm25_pipeline = retriever % self.num_candidates_samples
示例#24
0
    def test_batching(self):
        """Check that ``batch_size=2`` does not change ``pt.Experiment`` results.

        Compared once for aggregate results (with the "mrt" column dropped
        before comparison) and once for ``perquery=True`` results.
        """
        vaswani = pt.datasets.get_dataset("vaswani")
        retriever = pt.BatchRetrieve(vaswani.get_index())

        def _experiment(metrics, **options):
            return pt.Experiment([retriever],
                                 vaswani.get_topics().head(10),
                                 vaswani.get_qrels(),
                                 metrics,
                                 **options)

        unbatched = _experiment(["map", "ndcg", "num_q", "mrt"])
        batched = _experiment(["map", "ndcg", "num_q", "mrt"], batch_size=2)
        for frame in (unbatched, batched):
            self.assertTrue("mrt" in frame.columns)
        # "mrt" is removed from both frames before they are compared
        for frame in (unbatched, batched):
            frame.drop(columns=["mrt"], inplace=True)
        pd.testing.assert_frame_equal(unbatched, batched)

        unbatched = _experiment(["map", "ndcg", "num_q"], perquery=True)
        batched = _experiment(["map", "ndcg", "num_q"],
                              batch_size=2,
                              perquery=True)
        pd.testing.assert_frame_equal(unbatched, batched)
示例#25
0
 def test_bad_measure(self):
     """Expect ``pt.Experiment`` to raise KeyError for an invalid entry in the metric list."""
     vaswani = pt.datasets.get_dataset("vaswani")
     retriever = pt.BatchRetrieve(vaswani.get_index())
     # NB: this is the builtin ``map`` function, not the string "map"
     # (presumably deliberate, as the invalid measure under test)
     bogus_metrics = [map]
     with self.assertRaises(KeyError):
         pt.Experiment(
             [retriever],
             vaswani.get_topics().head(10),
             vaswani.get_qrels(),
             bogus_metrics)
示例#26
0
 def test_parallel_joblib_experiment(self):
     """Check that a retriever and its ``.parallel(3)`` variant report the same "map"."""
     self.skip_windows()
     vaswani = pt.get_dataset("vaswani")
     retriever = pt.BatchRetrieve(vaswani.get_index())
     systems = [retriever, retriever.parallel(3)]
     outcome = pt.Experiment(
         systems,
         vaswani.get_topics(),
         vaswani.get_qrels(),
         ["map", "mrt"])
     sequential_map = outcome.iloc[0]["map"]
     parallel_map = outcome.iloc[1]["map"]
     self.assertEqual(sequential_map, parallel_map)
示例#27
0
    def test_perquery(self):
        """Run ``pt.Experiment`` with ``perquery=True``, with and without ``dataframe=False``, printing both."""
        vaswani = pt.datasets.get_dataset("vaswani")
        retriever = pt.BatchRetrieve(vaswani.get_index())

        # first with default options, then asking for non-dataframe output
        for options in ({}, {"dataframe": False}):
            outcome = pt.Experiment(
                [retriever],
                vaswani.get_topics().head(10),
                vaswani.get_qrels(),
                ["map", "ndcg"],
                perquery=True,
                **options)
            print(outcome)
示例#28
0
    def test_qe(self):
        """Check three query-expansion rewriters alone and in a retrieve-expand-retrieve pipeline.

        For each rewriter, the rewritten query and the preserved ``query_0``
        column are checked, and the MAP of the explicit pipeline is compared
        (to 4 places) with a ``pt.BatchRetrieve`` that has ``qe`` switched on.
        """
        if not pt.check_version("5.3"):
            self.skipTest("Requires Terrier 5.3")
        vaswani = pt.datasets.get_dataset("vaswani")
        indexref = vaswani.get_index()
        index = pt.IndexFactory.of(indexref)
        # given their defaults, these three expressions are identical, all use Bo1
        expanders = [
            pt.rewrite.QueryExpansion(index),
            pt.rewrite.DFRQueryExpansion(index),
            pt.rewrite.Bo1QueryExpansion(index),
        ]

        # lets go faster, we only need 18 topics. qid 16 had a tricky case
        few_topics = vaswani.get_topics().head(18)

        judgments = vaswani.get_qrels()

        for expander in expanders:
            retriever = pt.BatchRetrieve(index)

            single_query = pd.DataFrame([["1", "compact"]],
                                        columns=["qid", "query"])
            expanded = expander.transform(retriever.transform(single_query))
            self.assertEqual(len(expanded), 1)
            self.assertTrue("query_0" in expanded.columns)
            first_row = expanded.iloc[0]
            self.assertEqual(first_row["query_0"], "compact")
            rewritten = expanded.iloc[0]["query"]
            for fragment in ("compact^1.82230972", "applypipeline:off "):
                self.assertTrue(fragment in rewritten)

            pipeline = retriever >> expander >> retriever

            # check the pipe doesnt cause an error
            str(pipeline)

            pipeline_run = pipeline.transform(few_topics)
            map_pipe = pt.Utils.evaluate(
                pipeline_run, judgments, metrics=["map"])["map"]

            native_qe = pt.BatchRetrieve(indexref, controls={"qe": "on"})
            map_qe = pt.Utils.evaluate(
                native_qe.transform(few_topics), judgments,
                metrics=["map"])["map"]

            self.assertAlmostEqual(map_qe, map_pipe, places=4)
示例#29
0
 def test_save_trec_generator(self):
     """Write the output of ``transform_gen`` to ``test.res`` in "trec" format; passes if nothing raises."""
     retriever = pt.BatchRetrieve(pt.get_dataset("vaswani").get_index(),
                                  wmodel="TF_IDF")
     target = os.path.join(self.test_dir, "test.res")
     first_topics = pt.get_dataset("vaswani").get_topics().head()
     result_stream = retriever.transform_gen(first_topics)
     pt.io.write_results(result_stream, target, format="trec")
示例#30
0
 def test_parallel_joblib_experiment_br_callback(self):
     """Check that a callable-``wmodel`` retriever and its ``.parallel(3)`` variant report the same "map"."""
     self.skip_windows()
     vaswani = pt.get_dataset("vaswani")
     # weighting model returning the posting's frequency
     frequency_model = lambda keyFreq, posting, entryStats, collStats: posting.getFrequency()
     retriever = pt.BatchRetrieve(vaswani.get_index(), wmodel=frequency_model)
     systems = [retriever, retriever.parallel(3)]
     outcome = pt.Experiment(
         systems,
         vaswani.get_topics(),
         vaswani.get_qrels(),
         ["map", "mrt"])
     sequential_map = outcome.iloc[0]["map"]
     parallel_map = outcome.iloc[1]["map"]
     self.assertEqual(sequential_map, parallel_map)