Пример #1
0
    def test_qe(self):
        """Exercise pt.rewrite.QueryExpansion alone and inside a pipeline.

        First the rewritten query text for a single one-term query is
        inspected; then the MAP of ``retrieve >> expand >> retrieve`` over the
        first 18 vaswani topics is compared with the MAP of a BatchRetrieve
        that has the ``qe`` control switched on.
        """
        if not pt.check_version("5.3"):
            self.skipTest("Requires Terrier 5.3")
        vaswani = pt.datasets.get_dataset("vaswani")
        indexref = vaswani.get_index()

        expander = pt.rewrite.QueryExpansion(indexref)
        retriever = pt.BatchRetrieve(indexref)

        # one single-term query is enough to look at the rewritten query string
        single_query = pd.DataFrame([["1", "compact"]], columns=["qid", "query"])
        first_pass = retriever.transform(single_query)

        expanded = expander.transform(first_pass)
        self.assertEqual(len(expanded), 1)
        rewritten = expanded.iloc[0]["query"]
        for fragment in ("compact^1.82230972", "applypipeline:off "):
            self.assertTrue(fragment in rewritten)

        pipeline = retriever >> expander >> retriever

        # lets go faster, we only need 18 topics. qid 16 had a tricky case
        topics = vaswani.get_topics().head(18)

        pipeline_results = pipeline.transform(topics)
        pipeline_eval = pt.Utils.evaluate(
            pipeline_results, vaswani.get_qrels(), metrics=["map"])
        map_pipe = pipeline_eval["map"]

        # reference run: the same retrieval with the "qe" control enabled
        builtin_qe = pt.BatchRetrieve(indexref, controls={"qe": "on"})
        builtin_eval = pt.Utils.evaluate(
            builtin_qe.transform(topics), vaswani.get_qrels(), metrics=["map"])
        map_qe = builtin_eval["map"]

        self.assertAlmostEqual(map_qe, map_pipe, places=4)
Пример #2
0
    def test_fbr_reranking(self):
        """Check FeaturesBatchRetrieve when it re-ranks a first-pass candidate set.

        A BM25 first pass cut to two documents is piped into a
        FeaturesBatchRetrieve that computes DPH and PL2 features. The resulting
        feature vectors are then compared with those produced by an equivalent
        pipeline that feature-unions (``**``) two single-model BatchRetrieve
        instances over the same first pass.
        """
        if not pt.check_version("5.3"):
            self.skipTest("Requires Terrier 5.3")
        import numpy as np
        # this test examines the use of ScoringMatchingWithFat
        JIR = pt.autoclass('org.terrier.querying.IndexRef')
        indexref = JIR.of(self.here + "/fixtures/index/data.properties")
        # we only want a candidate set of 2 documents
        firstpass = pt.BatchRetrieve(indexref, wmodel="BM25") % 2
        pipe = firstpass >> pt.FeaturesBatchRetrieve(
            indexref, features=["WMODEL:DPH", "WMODEL:PL2"])
        queries = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
        result = pipe.transform(queries)
        for column in ("qid", "docno", "score", "features"):
            self.assertIn(column, result.columns)
        self.assertEqual(2, len(result))
        self.assertEqual(result.iloc[0]["features"].size, 2)

        pipe_simple = firstpass >> (pt.BatchRetrieve(indexref, wmodel="DPH") **
                                    pt.BatchRetrieve(indexref, wmodel="PL2"))
        # run pipe_simple here: comparing `pipe` against a second run of
        # itself would pass trivially and verify nothing
        result2 = pipe_simple.transform(queries)
        self.assertEqual(len(result), len(result2))

        # align both outputs on docno so the comparison does not depend on the
        # row order each pipeline happens to emit
        f1 = np.stack(result.sort_values("docno")["features"].values)
        f2 = np.stack(result2.sort_values("docno")["features"].values)
        # NOTE(review): the two pipelines score through different code paths,
        # so a small tolerance is used instead of exact equality — tighten if
        # the scores turn out to be bit-identical
        self.assertTrue(
            np.allclose(f1, f2, atol=1e-4),
            msg="feature mismatch: %s vs %s" % (f1, f2))
Пример #3
0
    def test_threading_selfupgrade(self):
        """Run a 5-thread BatchRetrieve over an IndexRef not marked concurrent.

        There are no assertions on the output: the test passes as long as
        retrieval over the first ten vaswani topics completes without raising.
        """
        if not pt.check_version("5.5"):
            self.skipTest("Requires Terrier 5.5")

        first_ten = pt.get_dataset("vaswani").get_topics().head(10)

        # this test ensures we can upgrade the indexref to be concurrent
        index_path = self.here + "/fixtures/index/data.properties"
        indexref = pt.autoclass('org.terrier.querying.IndexRef').of(index_path)
        threaded = pt.BatchRetrieve(indexref, threads=5)
        threaded.transform(first_ten)
Пример #4
0
    def test_2_docs_title_body_meta(self):
        """Index the sample documents and verify meta-index lookups.

        Checks the document count, filename <-> docid lookups and the recorded
        title for ``a.html``; when ``check_version("5.5")`` holds, the recorded
        bodies of ``a.html`` and ``b.txt`` are checked as well.
        """
        sample_dir = os.path.join(self.here, "fixtures", "sample_docs")
        files = pt.io.find_files(sample_dir)

        meta_spec = {"docno": 20, "filename": 512, "title": 20, "body": 40}
        tag_spec = {"title": "title", "body": "ELSE"}
        indexer = pt.FilesIndexer(self.test_dir, meta=meta_spec, meta_tags=tag_spec)
        index = pt.IndexFactory.of(indexer.index(files))

        # check index size
        num_docs = index.getCollectionStatistics().getNumberOfDocuments()
        self.assertEqual(2, num_docs)

        meta_index = index.getMetaIndex()
        for docid in (0, 1):
            print(meta_index.getAllItems(docid))

        # determine file locations as docids
        html_file = os.path.join(sample_dir, "a.html")
        txt_file = os.path.join(sample_dir, "b.txt")
        html_pos = files.index(html_file)
        txt_pos = files.index(txt_file)

        self.assertTrue(html_pos < len(files))
        # test filename -> docid lookup
        looked_up_docid = meta_index.getDocument("filename", html_file)
        self.assertEqual(html_pos, looked_up_docid)
        # test docid -> filename lookup
        looked_up_name = meta_index.getItem("filename", html_pos)
        self.assertEqual(html_file, looked_up_name)
        # test title has been recorded in metaindex
        self.assertEqual("test title", meta_index.getItem("title", html_pos))

        # the body checks below only run when check_version("5.5") holds
        from pyterrier import check_version
        if check_version("5.5"):
            # test bodies have been recorded in metaindex
            self.assertEqual("test body",
                             meta_index.getItem("body", html_pos))
            self.assertEqual("empty text document",
                             meta_index.getItem("body", txt_pos))
Пример #5
0
    def test_num_results(self):
        """Check that BatchRetrieve returns exactly ``num_results`` rows.

        A limit of 10 is always tested via ``transform``; a limit of 1001 is
        tested via ``search`` only when ``pt.check_version("5.5")`` holds.
        """
        index_path = self.here + "/fixtures/index/data.properties"
        indexref = pt.autoclass('org.terrier.querying.IndexRef').of(index_path)

        top_ten = pt.BatchRetrieve(indexref, num_results=10)
        query_frame = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
        ten_rows = top_ten.transform(query_frame)
        self.assertEqual(len(ten_rows), 10)

        if pt.check_version("5.5"):
            larger = pt.BatchRetrieve(indexref, num_results=1001)
            many_rows = larger.search("results")
            self.assertEqual(len(many_rows), 1001)
Пример #6
0
    def test_fbr_reranking2(self):
        """Check scores and features when FeaturesBatchRetrieve re-ranks 3 docs.

        For a BM25 first pass cut to three documents, verifies per docno that
        (a) FeaturesBatchRetrieve leaves the incoming score unchanged and
        (b) its single PL2 feature equals the score that a PL2 BatchRetrieve
        assigns when re-ranking the same candidates.
        """
        if not pt.check_version("5.4"):
            self.skipTest("Requires Terrier 5.4")
        # this test examines the use of ScoringMatchingWithFat, using a
        # particular case related to Terrier 5.3
        # NOTE(review): the original comment was garbled ("known to with
        # Terrier 5.3") — confirm the intended wording
        JIR = pt.autoclass('org.terrier.querying.IndexRef')
        indexref = JIR.of(self.here + "/fixtures/index/data.properties")
        # we only want a candidate set of 3 documents
        firstpass = pt.BatchRetrieve(indexref, wmodel="BM25") % 3
        pipe1 = firstpass >> pt.FeaturesBatchRetrieve(indexref,
                                                      features=["WMODEL:PL2"])
        pipe2 = firstpass >> pt.BatchRetrieve(indexref, wmodel="PL2")

        queries = pd.DataFrame([["1", "Stability"]], columns=['qid', 'query'])
        result0 = firstpass.transform(queries)
        result1 = pipe1.transform(queries)
        result2 = pipe2.transform(queries)

        # BM25 score
        result0_map = {row.docno: row.score for row in result0.itertuples()}
        result1S_map = {row.docno: row.score for row in result1.itertuples()}
        # PL2 score: first (and only) entry of each features vector
        result1F_map = {
            row.docno: row.features[0]
            for row in result1.itertuples()
        }
        result2_map = {row.docno: row.score for row in result2.itertuples()}

        print(result1F_map)
        print(result2_map)

        # check features scores
        # NB: places can go no less than 4, as two documents have similar PL2 scores
        for rank, row in enumerate(result0.itertuples()):
            docno = row.docno
            # check that score is unchanged
            self.assertAlmostEqual(
                result1S_map[docno],
                result0_map[docno],
                msg="input score mismatch at rank %d for docno %s" %
                (rank, docno),
                places=4)
            #  check that feature score is correct
            self.assertAlmostEqual(
                result1F_map[docno],
                result2_map[docno],
                msg="feature score mismatch at rank %d for docno %s" %
                (rank, docno),
                places=4)
Пример #7
0
    def test_candidate_set_two_doc(self):
        """Score a two-document candidate set, given by docid, for one query.

        Only the first input row carries the query text; the second row has
        ``None`` as its query. Both candidates are expected in the output.
        """
        if not pt.check_version("5.3"):
            self.skipTest("Requires Terrier 5.3")

        indexloc = self.here + "/fixtures/index/data.properties"

        # docid 50 == docno 51
        # docid 66 == docno 67
        candidates = [["q1", "light", 50], ["q1", None, 66]]
        input_set = pd.DataFrame(candidates, columns=["qid", "query", "docid"])

        scored = pt.BatchRetrieve(indexloc).transform(input_set)
        for column in ("qid", "docno", "score"):
            self.assertTrue(column in scored.columns)
        self.assertEqual(2, len(scored))
Пример #8
0
    def test_qe(self):
        """Exercise the three query-expansion rewriters against Terrier's ``qe``.

        For each of QueryExpansion, DFRQueryExpansion and Bo1QueryExpansion:
        the rewritten text of a one-term query is inspected (including the
        preserved original in ``query_0``), and the MAP of
        ``retrieve >> expand >> retrieve`` over the first 18 vaswani topics is
        compared with the MAP of a BatchRetrieve with the ``qe`` control on.
        """
        if not pt.check_version("5.3"):
            self.skipTest("Requires Terrier 5.3")
        dataset = pt.datasets.get_dataset("vaswani")
        indexref = dataset.get_index()
        index = pt.IndexFactory.of(indexref)
        # given their defaults, these three expressions are identical, all use Bo1
        qe1 = pt.rewrite.QueryExpansion(index)
        qe2 = pt.rewrite.DFRQueryExpansion(index)
        qe3 = pt.rewrite.Bo1QueryExpansion(index)

        # lets go faster, we only need 18 topics. qid 16 had a tricky case
        t = dataset.get_topics().head(18)

        qrels = dataset.get_qrels()

        # none of the following depends on which rewriter is under test, so it
        # is computed once rather than once per loop iteration
        br = pt.BatchRetrieve(index)
        queriesIn = pd.DataFrame([["1", "compact"]],
                                 columns=["qid", "query"])
        res = br.transform(queriesIn)
        br_qe = pt.BatchRetrieve(indexref, controls={"qe": "on"})
        map_qe = pt.Utils.evaluate(br_qe.transform(t),
                                   qrels,
                                   metrics=["map"])["map"]

        for qe in [qe1, qe2, qe3]:
            queriesOut = qe.transform(res)
            self.assertEqual(len(queriesOut), 1)
            self.assertIn("query_0", queriesOut.columns)
            self.assertEqual(queriesOut.iloc[0]["query_0"], "compact")
            query = queriesOut.iloc[0]["query"]
            self.assertIn("compact^1.82230972", query)
            self.assertIn("applypipeline:off ", query)

            pipe = br >> qe >> br

            # check the pipe doesnt cause an error
            str(pipe)

            all_qe_res = pipe.transform(t)
            map_pipe = pt.Utils.evaluate(all_qe_res, qrels,
                                         metrics=["map"])["map"]

            self.assertAlmostEqual(map_qe, map_pipe, places=4)
Пример #9
0
    def test_threading_manualref(self):
        """Run 5-thread retrieval over an IndexRef prefixed with ``concurrent:``.

        Two runs over the first ten vaswani topics: one with the default
        weighting model and one with a Python callable as the weighting model.
        Neither output is asserted on; the test passes if nothing raises.
        """
        if not pt.check_version("5.5"):
            self.skipTest("Requires Terrier 5.5")

        first_ten = pt.get_dataset("vaswani").get_topics().head(10)

        # this test ensures that we operate when the indexref is specified to be concurrent
        index_location = ("concurrent:" + self.here +
                          "/fixtures/index/data.properties")
        indexref = pt.autoclass('org.terrier.querying.IndexRef').of(index_location)
        pt.BatchRetrieve(indexref, threads=5).transform(first_ten)

        # check that use of a callback model works under threading
        term_frequency = lambda keyFreq, posting, entryStats, collStats: posting.getFrequency()
        callback_retr = pt.BatchRetrieve(indexref, threads=5, wmodel=term_frequency)
        callback_retr.transform(first_ten)
Пример #10
0
 def test_candidate_set_one_doc(self):
     """Score a one-document candidate set via both transform() and __call__().

     The same single-row input (qid ``q1``, docid 50) is pushed through
     ``retr.transform(...)`` and ``retr(...)``; each output must contain
     exactly one row for docno ``51`` with a positive score.
     """
     if not pt.check_version("5.3"):
         self.skipTest("Requires Terrier 5.3")
     indexloc = self.here + "/fixtures/index/data.properties"
     # docid 50 == docno 51
     input_set = pd.DataFrame([["q1", "light", 50]], columns=["qid", "query", "docid"])
     retr = pt.BatchRetrieve(indexloc)

     # this tests the implementation of __call__() redirecting to transform().
     # The loop variable must not be reassigned inside the loop, otherwise the
     # __call__() output would never be inspected.
     outputs = [
         ("transform", retr.transform(input_set)),
         ("__call__", retr(input_set)),
     ]
     for via, result in outputs:
         with self.subTest(via=via):
             self.assertIn("qid", result.columns)
             self.assertIn("docno", result.columns)
             self.assertIn("score", result.columns)
             self.assertEqual(1, len(result))
             row = result.iloc[0]
             self.assertEqual("q1", row["qid"])
             self.assertEqual("51", row["docno"])
             self.assertTrue(row["score"] > 0)
Пример #11
0
 def test_sdm(self):
     """Run the shared ``_sdm`` check with ``False`` as its argument.

     The test is skipped when ``pt.check_version("5.3")`` is false.
     """
     if not pt.check_version("5.3"):
         self.skipTest("Requires Terrier 5.3")
     # NOTE(review): _sdm is defined outside this view; what its boolean
     # argument selects cannot be told from here — confirm against the helper
     self._sdm(False)