def test_TREC_indexing_bad_files_type(self):
        """Check that ``index()`` raises ``ValueError`` for two unsupported inputs.

        A fresh indexer is built for each case: first a bare integer is
        passed, then the value returned by the vaswani dataset's
        ``get_corpus_iter()``.
        """
        print("Writing index to " + self.test_dir)

        # Case 1: an integer argument.
        int_indexer = pt.TRECCollectionIndexer(self.test_dir)
        with self.assertRaises(ValueError):
            int_indexer.index(5)

        # Case 2: the dataset's corpus iterator. The iterator is obtained
        # inside the context manager, exactly where the call is made.
        iter_indexer = pt.TRECCollectionIndexer(self.test_dir)
        with self.assertRaises(ValueError):
            iter_indexer.index(pt.get_dataset("vaswani").get_corpus_iter())
示例#2
0
    def _sdm(self, freq):
        """Compare the ``pt.rewrite.SDM`` pipeline against Terrier's native ``sd`` control.

        A block index of vaswani is built, two small queries are rewritten
        and retrieved through ``SDM >> BatchRetrieve``, and the results are
        compared row by row with a ``BatchRetrieve`` that has the ``sd``
        control switched on. Finally MAP on the first five vaswani topics is
        compared between both approaches.

        :param freq: when truthy, ``org.terrier.matching.models.Tf`` is used
            as the proximity/dependence model and ``Tf`` as the weighting
            model; otherwise the defaults of each component are used.
        """
        dataset = pt.datasets.get_dataset("vaswani")
        block_indexer = pt.TRECCollectionIndexer(self.test_dir, blocks=True)
        indexref = block_indexer.index(dataset.get_corpus())

        sdm = (
            pt.rewrite.SDM(prox_model="org.terrier.matching.models.Tf")
            if freq
            else pt.rewrite.SDM()
        )

        queries = pd.DataFrame(
            [["1", "compact"], ["2", "compact memories"]],
            columns=["qid", "query"])
        rewritten = sdm.transform(queries)
        self.assertEqual(len(rewritten), 2)
        # The single-term query is expected to come back unchanged.
        self.assertEqual(rewritten.iloc[0]["query"], "compact")
        # The two-term query is expected to contain each of these operators.
        two_term_query = rewritten.iloc[1]["query"]
        for operator in ("#1", "#uw8", "#combine"):
            self.assertIn(operator, two_term_query)

        br_normal = pt.BatchRetrieve(indexref)
        pipe = sdm >> br_normal
        if freq:
            br_normal.controls["wmodel"] = "Tf"
        pipe_results = pipe.transform(queries)

        # this BR should do the same thing as the pipe, but natively in Terrier
        querying_processes = ",".join([
            "terrierql:TerrierQLParser",
            "parsecontrols:TerrierQLToControls",
            "parseql:TerrierQLToMatchingQueryTerms",
            "matchopql:MatchingOpQLParser",
            "applypipeline:ApplyTermPipeline",
            "sd:DependenceModelPreProcess",
            "localmatching:LocalManager$ApplyLocalMatching",
            "qe:QueryExpansion",
            "labels:org.terrier.learning.LabelDecorator",
            "filters:LocalManager$PostFilterProcess",
        ])
        br_sdm = pt.BatchRetrieve(
            indexref,
            controls={"sd": "on"},
            properties={"querying.processes": querying_processes})
        if freq:
            br_sdm.controls["wmodel"] = "Tf"
            br_sdm.controls["dependencemodel"] = "org.terrier.matching.models.Tf"

        native_results = br_sdm.transform(queries)

        # Row labels of the pipeline results are used as positions into the
        # native results, so both frames are compared in lock-step.
        for position, pipe_row in pipe_results.iterrows():
            native_row = native_results.iloc[position]
            self.assertEqual(pipe_row['qid'], native_row["qid"])
            self.assertEqual(pipe_row['docno'], native_row["docno"])
            # TODO I cannot get this test to pass with freq=False more precisely than 1dp
            # 9.165638 in the pipeline results vs 9.200683 in the native results
            self.assertAlmostEqual(
                pipe_row['score'], native_row["score"], places=1)

        topics = dataset.get_topics().head(5)
        pipe_res = pipe.transform(topics)

        map_pipe = pt.Utils.evaluate(
            pipe_res, dataset.get_qrels(), metrics=["map"])["map"]
        map_native = pt.Utils.evaluate(
            br_sdm.transform(topics), dataset.get_qrels(), metrics=["map"])["map"]
        self.assertAlmostEqual(map_pipe, map_native, places=4)
示例#3
0
    def test_scoring_text(self):
        """Index vaswani with a ``body`` metadata field, then exercise text scoring.

        The indexer is configured through its ``meta`` / ``meta_tags``
        keyword arguments. After checking that the ``body`` key exists and
        that document 0 contains the expected text, ``_test_scoring_text`` is
        run once per weighting model.
        """
        pt.logging("DEBUG")
        dataset = pt.get_dataset("vaswani")

        # The tags from which to save the text. ELSE is special tag name,
        # which means anything not consumed by other tags.
        indexer = pt.TRECCollectionIndexer(
            self.test_dir,
            meta={'docno': 26, 'body': 2048},
            meta_tags={'body': 'ELSE'},
        )
        index = pt.IndexFactory.of(indexer.index(dataset.get_corpus()))

        meta = index.getMetaIndex()
        self.assertIn("body", meta.getKeys())
        self.assertIn("compact memories have", meta.getItem("body", 0))
        print(meta.getItem("body", 1047))

        # Same models, in the same order, as a single sequence.
        wmodels = (
            "org.terrier.python.TestModel$Constant",
            "Tf",
            "org.terrier.python.TestModel$TFOverN",
            "org.terrier.python.TestModel$F",
            "org.terrier.python.TestModel$Nt",
            "DPH",
        )
        for wmodel in wmodels:
            self._test_scoring_text(dataset, index, wmodel)
示例#4
0
    def test_scoring_text(self):
        """Index vaswani with a ``body`` metadata field, then exercise text scoring.

        This variant configures the indexer through raw Terrier properties
        (``setProperties``). After checking that the ``body`` key exists and
        that document 0 contains the expected text, ``_test_scoring_text`` is
        run once per weighting model.
        """
        pt.logging("DEBUG")
        dataset = pt.get_dataset("vaswani")

        abstract_properties = {
            "TaggedDocument.abstracts": "body",
            # The tags from which to save the text. ELSE is special tag name,
            # which means anything not consumed by other tags.
            "TaggedDocument.abstracts.tags": "ELSE",
            # The max lengths of the abstracts. Abstracts will be cropped to
            # this length. Defaults to empty.
            "TaggedDocument.abstracts.lengths": "2048",
            "indexer.meta.forward.keys": "docno,body",
            "indexer.meta.forward.keylens": "26,2048",
        }
        indexer = pt.TRECCollectionIndexer(self.test_dir)
        indexer.setProperties(**abstract_properties)
        index = pt.IndexFactory.of(indexer.index(dataset.get_corpus()))

        meta = index.getMetaIndex()
        self.assertIn("body", meta.getKeys())
        self.assertIn("compact memories have", meta.getItem("body", 0))
        print(meta.getItem("body", 1047))

        # Same models, in the same order, as a single sequence.
        wmodels = (
            "org.terrier.python.TestModel$Constant",
            "Tf",
            "org.terrier.python.TestModel$TFOverN",
            "org.terrier.python.TestModel$F",
            "org.terrier.python.TestModel$Nt",
            "DPH",
        )
        for wmodel in wmodels:
            self._test_scoring_text(dataset, index, wmodel)
示例#5
0
 def test_TREC_indexing_memory(self):
     """Index the vaswani fixture corpus with ``IndexingType.MEMORY``.

     Asserts that an index reference is returned and that the resulting
     index reports 11429 documents.
     """
     memory_indexer = pt.TRECCollectionIndexer(
         self.test_dir, type=pt.IndexingType.MEMORY)
     corpus_files = pt.io.find_files(
         self.here + "/fixtures/vaswani_npl/corpus/")
     indexRef = memory_indexer.index(corpus_files)
     self.assertIsNotNone(indexRef)

     stats = pt.IndexFactory.of(indexRef).getCollectionStatistics()
     self.assertEqual(11429, stats.getNumberOfDocuments())
示例#6
0
 def test_TREC_indexing(self):
     """Index the vaswani fixture corpus with the indexer's default settings.

     Asserts that an index reference is returned, that the index reports
     11429 documents, and that ``data.direct.bf`` exists in the index
     directory.
     """
     default_indexer = pt.TRECCollectionIndexer(self.test_dir)
     corpus_files = pt.io.find_files(
         self.here + "/fixtures/vaswani_npl/corpus/")
     indexRef = default_indexer.index(corpus_files)
     self.assertIsNotNone(indexRef)

     stats = pt.IndexFactory.of(indexRef).getCollectionStatistics()
     self.assertEqual(11429, stats.getNumberOfDocuments())

     direct_file = self.test_dir + '/data.direct.bf'
     self.assertTrue(os.path.isfile(direct_file))
 def test_TREC_indexing_singlepass(self):
     """Index the vaswani fixture corpus with ``IndexingType.SINGLEPASS``.

     Asserts that an index reference is returned, that the index reports
     11429 documents, and that ``data.direct.bf`` is NOT present in the
     index directory.
     """
     indexer = pt.TRECCollectionIndexer(self.test_dir,
                                        type=pt.IndexingType.SINGLEPASS)
     # Use pt.io.find_files, as the sibling indexing tests do, rather than
     # the legacy pt.Utils.get_files_in_dir helper.
     corpus_files = pt.io.find_files(
         self.here + "/fixtures/vaswani_npl/corpus/")
     indexRef = indexer.index(corpus_files)
     self.assertIsNotNone(indexRef)
     index = pt.IndexFactory.of(indexRef)
     self.assertEqual(
         11429,
         index.getCollectionStatistics().getNumberOfDocuments())
     self.assertFalse(os.path.isfile(self.test_dir + '/data.direct.bf'))
示例#8
0
    def __init__(self, index_path, stemmer, corpus):
        """Store the corpus and create a TREC indexer with the chosen stemmer.

        :param index_path: location handed to ``pt.TRECCollectionIndexer``.
        :param stemmer: ``"snowball"`` or ``"porter"``; any other value
            selects the ``NoOp`` term pipeline (no stemming).
        :param corpus: the corpus, kept on the instance as ``self.corpus``.
        """
        self.index_path = index_path
        self.corpus = corpus
        self.indexer = pt.TRECCollectionIndexer(index_path)

        # Set the stemmer, and don't add stopwords: the term pipeline
        # consists of the single chosen stage only.
        if stemmer == "snowball":
            term_pipeline = "EnglishSnowballStemmer"
        elif stemmer == "porter":
            term_pipeline = "PorterStemmer"
        else:
            term_pipeline = "NoOp"
        self.indexer.setProperties(termpipelines=term_pipeline)
 def test_TREC_indexing_revmeta(self):
     """Index the vaswani fixture corpus with ``meta_reverse=['docno']``.

     Besides the usual document-count and ``data.direct.bf`` checks, this
     asserts that ``docno`` is among the meta index's reverse keys and that
     looking up the docno stored for document 2 returns 2 again.
     """
     print("Writing index to " + self.test_dir)
     revmeta_indexer = pt.TRECCollectionIndexer(
         self.test_dir, meta_reverse=['docno'])
     corpus_files = pt.io.find_files(
         self.here + "/fixtures/vaswani_npl/corpus/")
     indexRef = revmeta_indexer.index(corpus_files)
     self.assertIsNotNone(indexRef)

     index = pt.IndexFactory.of(indexRef)
     num_docs = index.getCollectionStatistics().getNumberOfDocuments()
     self.assertEqual(11429, num_docs)

     direct_file = self.test_dir + '/data.direct.bf'
     self.assertTrue(os.path.isfile(direct_file))

     meta = index.getMetaIndex()
     self.assertIn('docno', meta.getReverseKeys())
     # Round trip: docid 2 -> docno -> docid.
     docno_of_doc2 = meta.getItem("docno", 2)
     self.assertEqual(meta.getDocument("docno", docno_of_doc2), 2)
 def test_TREC_indexing_text(self):
     """Index the vaswani fixture corpus while storing a ``body`` meta field.

     Asserts that ``body`` is a meta index key, that document 0's body
     contains the expected opening text, that the index reports 11429
     documents, and that ``data.direct.bf`` exists in the index directory.
     """
     print("Writing index to " + self.test_dir)

     # The tags from which to save the text. ELSE is special tag name,
     # which means anything not consumed by other tags.
     text_indexer = pt.TRECCollectionIndexer(
         self.test_dir,
         meta={'docno': 26, 'body': 2048},
         meta_tags={'body': 'ELSE'},
     )
     corpus_files = pt.io.find_files(
         self.here + "/fixtures/vaswani_npl/corpus/")
     indexRef = text_indexer.index(corpus_files)
     self.assertIsNotNone(indexRef)

     index = pt.IndexFactory.of(indexRef)
     self.assertIn("body", index.getMetaIndex().getKeys())
     first_body = index.getMetaIndex().getItem("body", 0)
     self.assertIn("compact memories have flexible capacities", first_body)

     num_docs = index.getCollectionStatistics().getNumberOfDocuments()
     self.assertEqual(11429, num_docs)

     direct_file = self.test_dir + '/data.direct.bf'
     self.assertTrue(os.path.isfile(direct_file))
# Paths of the query (topic) files for the train/dev/eval splits.
# `topics_path` is defined outside this section of the file.
topics_train_path = os.path.join(topics_path, 'queries.train.tsv')
topics_dev_path = os.path.join(topics_path, 'queries.dev.tsv')
topics_eval_path = os.path.join(topics_path, 'queries.eval.tsv')
# The "eval19" queries are looked up under ./data19 relative to the current
# working directory rather than under `topics_path`.
topics_eval19_path = os.path.join(os.getcwd(), 'data19', 'queries.eval.tsv')

# Read the topic files into dataframes, using the 'singleline' topic format.
topics_train = pt.io.read_topics(topics_train_path, format='singleline')
topics_dev = pt.io.read_topics(topics_dev_path, format='singleline')
topics_eval = pt.io.read_topics(topics_eval_path, format='singleline')
topics_eval19 = pt.io.read_topics(topics_eval19_path, format='singleline')

# Read the relevance judgements (qrels). The qrels_*_path names are defined
# outside this section; no qrels are loaded here for the plain "eval" split.
qrels_train = pt.io.read_qrels(qrels_train_path)
qrels_dev = pt.io.read_qrels(qrels_dev_path)
qrels_eval19 = pt.io.read_qrels(qrels_eval19_path)

# NOTE(review): despite its name, `indexRef` is bound to the
# TRECCollectionIndexer object itself; no `.index(...)` call is made here.
# Confirm that later code expects an indexer rather than an index reference.
indexRef = pt.TRECCollectionIndexer(index_path)


def fill_empty_queries(df: pd.DataFrame, placeholder: str = 'nova') -> pd.DataFrame:
    """Return a copy of ``df`` whose blank queries are replaced by a placeholder.

    Fills all empty queries with some text so that pyterrier_bert does not
    crash. A query counts as blank when it is missing (``None``/NaN), the
    empty string, or made up of whitespace only.

    Args:
        df: frame with a ``query`` column; it is not modified.
        placeholder: text written into blank queries (defaults to ``'nova'``).

    Returns:
        A new DataFrame with every blank ``query`` value replaced.

    Raises:
        KeyError: if ``df`` has no ``query`` column.
    """
    df_copy = df.copy()
    queries = df_copy['query']
    # isna() catches missing values; astype(str) makes the blank check safe
    # for any remaining non-string entries.
    is_blank = queries.isna() | queries.astype(str).str.strip().eq('')
    df_copy.loc[is_blank, 'query'] = placeholder
    return df_copy


# Replace empty queries in every topic set (see fill_empty_queries) and
# rebind the module-level names to the filled copies.
topics_train = fill_empty_queries(topics_train)
topics_dev = fill_empty_queries(topics_dev)
topics_eval = fill_empty_queries(topics_eval)
topics_eval19 = fill_empty_queries(topics_eval19)