Exemplo n.º 1
0
 def test_read_results_topic_merging(self):
     """read_results merges query text from a dataset name, a dataset
     object, or an explicit topics frame — all three must agree."""
     run = pd.DataFrame({
         'qid': ['1', '1', '2'],
         'docno': ['A', 'B', 'C'],
         'score': [0.8, 0.4, 0.6],
         'rank': [1, 2, 1]
     })
     pt.io.write_results(run, 'test.res')
     # three equivalent ways of supplying the topic text
     variants = [
         pt.io.read_results('test.res', dataset='vaswani'),
         pt.io.read_results('test.res', dataset=pt.get_dataset('vaswani')),
         pt.io.read_results('test.res', topics=pt.get_dataset('vaswani').get_topics()),
     ]
     q1 = 'measurement of dielectric constant of liquids by the use of microwave techniques'
     q2 = 'mathematical analysis and design details of waveguide fed microwave radiations'
     for results in variants:
         for row, expected in [(0, q1), (1, q1), (2, q2)]:
             self.assertEqual(results.iloc[row].query, expected)
Exemplo n.º 2
0
    def test_cache_compose_cache(self):
        """A cache wrapping a pipeline that itself contains a cache: repeated
        queries hit, and a separately-built cache of the same pipeline shares
        the stored entries and appears in the cache report."""
        pt.cache.CACHE_DIR = self.test_dir
        import pandas as pd
        queries = pd.DataFrame([["q1", "chemical"]], columns=["qid", "query"])
        index = pt.get_dataset("vaswani").get_index()
        br1 = pt.BatchRetrieve(index, wmodel="TF_IDF")
        br2 = pt.BatchRetrieve(index, wmodel="BM25")
        cache = ~(~br1 >> br2)
        self.assertEqual(0, len(cache.chest._keys))  # fresh cache is empty
        cache(queries)  # miss
        cache(queries)  # hit
        self.assertEqual(0.5, cache.stats())

        # another cache of an identical pipeline must see the same entries
        cache2 = ~(~br1 >> br2)
        cache2(queries)
        self.assertEqual(1, cache2.stats())

        # the cache report should cover our entry
        all_report = pt.cache.list_cache()
        self.assertTrue(len(all_report) > 0)
        report = next(iter(all_report.values()))
        self.assertEqual(1, report["queries"])
        for key in ("transformer", "size", "lastmodified"):
            self.assertTrue(key in report)

        pt.cache.CACHE_DIR = None
Exemplo n.º 3
0
 def test_save_trec_generator(self):
     """write_results accepts a generator of result frames (TREC format)."""
     dataset = pt.get_dataset("vaswani")
     br = pt.BatchRetrieve(dataset.get_index(), wmodel="TF_IDF")
     filepath = os.path.join(self.test_dir, "test.res")
     # transform_gen yields one frame per topic chunk; write them lazily
     res_gen = br.transform_gen(dataset.get_topics().head())
     pt.io.write_results(res_gen, filepath, format="trec")
Exemplo n.º 4
0
 def test_fetch_text_docid(self):
     """get_text resolves docnos for frames keyed by docid, for every
     accepted index-like argument: directory, IndexRef, str, Index."""
     dfinput = pd.DataFrame([["q1", "a query", 1]], columns=["qid", "query", "docid"])
     index_loc = pt.get_dataset("vaswani").get_index()
     indexlikes = [
         index_loc,
         pt.IndexRef.of(index_loc),
         pt.IndexRef.of(index_loc).toString(),
         pt.IndexFactory.of(index_loc),
     ]
     for indexlike in indexlikes:
         textT = pt.text.get_text(indexlike, "docno")
         self.assertTrue(isinstance(textT, pt.transformer.TransformerBase))
         dfOut = textT.transform(dfinput)
         self.assertTrue(isinstance(dfOut, pd.DataFrame))
         self.assertTrue("docno" in dfOut.columns)
    def test_variants(self):
        """get_topics on an ir_datasets dataset: no argument returns all
        fields; a named field is mapped to 'query' (tokenised unless
        tokenise_query=False); unknown fields raise AssertionError.

        Fix: the second and third subTests both carried the label
        'specific field', making their failures indistinguishable in test
        output — the third is now labelled distinctly.
        """
        dataset = pt.get_dataset('irds:clueweb09/catb/trec-web-2009')

        with self.subTest('all fields'):
            topics = dataset.get_topics()
            self.assertEqual(
                ['qid', 'query', 'description', 'type', 'subtopics'],
                list(topics.columns))

        with self.subTest('specific field'):
            topics = dataset.get_topics('description')
            self.assertEqual(['qid', 'query'], list(
                topics.columns))  # description mapped to query
            self.assertEqual(
                topics.iloc[0]['query'],
                'find information on president barack obama s family history including genealogy national origins places and dates of birth etc'
            )

        with self.subTest('specific field, no tokenisation'):
            topics = dataset.get_topics('description', tokenise_query=False)
            self.assertEqual(['qid', 'query'], list(
                topics.columns))  # description mapped to query
            self.assertEqual(
                topics.iloc[0]['query'],
                "Find information on President Barack Obama's family\n  history, including genealogy, national origins, places and dates of\n  birth, etc.\n  "
            )

        with self.subTest('field named query'):
            topics = dataset.get_topics('query')
            self.assertEqual(['qid', 'query'], list(topics.columns))
            self.assertEqual(topics.iloc[0]['query'], 'obama family tree')

        with self.assertRaises(AssertionError):
            dataset.get_topics('field_that_does_not_exist')
Exemplo n.º 6
0
    def test_save_docs_CE(self):
        """stash_results/reset_results around a second retrieval stage makes
        Bo1 act as a re-ranker, and the stashed column flows through.

        Fix: the final check previously re-ran pipeP3 instead of pipeP4, so
        the pipeline ending in Bo1QueryExpansion was never executed.
        """
        index = pt.get_dataset("vaswani").get_index()
        dph = pt.BatchRetrieve(index, wmodel="DPH")
        pipe = dph \
            >> pt.rewrite.stash_results() \
            >> pt.BatchRetrieve(index, wmodel="BM25") \
            >> pt.rewrite.Bo1QueryExpansion(index) \
            >> pt.rewrite.reset_results() \
            >> dph
        rtr1 = dph.search("chemical reactions")
        rtr2 = pipe.search("chemical reactions")
        # Bo1 should be applied as a re-ranker, hence the
        # number of docs in rtr1 and rtr2 should be equal
        self.assertEqual(len(rtr1), len(rtr2))

        # check columns are passed through where we expect
        pipeP3 = dph \
            >> pt.rewrite.stash_results() \
            >> pt.BatchRetrieve(index, wmodel="BM25")
        res3 = pipeP3.search("chemical reactions")
        self.assertIn("stashed_results_0", res3.columns)
        pipeP4 = dph \
            >> pt.rewrite.stash_results() \
            >> pt.BatchRetrieve(index, wmodel="BM25") \
            >> pt.rewrite.Bo1QueryExpansion(index)
        res4 = pipeP4.search("chemical reactions")  # was pipeP3 (copy-paste bug)
        self.assertIn("stashed_results_0", res4.columns)
Exemplo n.º 7
0
    def test_scoring_text(self):
        """Index vaswani with document text stored in the meta index (via
        constructor kwargs), then score the stored text with several
        weighting models through _test_scoring_text."""
        pt.logging("DEBUG")
        dataset = pt.get_dataset("vaswani")
        indexer = pt.TRECCollectionIndexer(
            self.test_dir,
            meta={
                'docno': 26,
                'body': 2048
            },
            # ELSE is a special tag name: capture anything not consumed by other tags.
            meta_tags={'body': 'ELSE'})
        index = pt.IndexFactory.of(indexer.index(dataset.get_corpus()))
        meta = index.getMetaIndex()
        self.assertTrue("body" in meta.getKeys())
        self.assertTrue("compact memories have" in meta.getItem("body", 0))
        print(meta.getItem("body", 1047))

        for wmodel in (
                "org.terrier.python.TestModel$Constant",
                "Tf",
                "org.terrier.python.TestModel$TFOverN",
                "org.terrier.python.TestModel$F",
                "org.terrier.python.TestModel$Nt",
                "DPH"):
            self._test_scoring_text(dataset, index, wmodel)
Exemplo n.º 8
0
 def test_parallel_joblib_experiment(self):
     """A 3-way parallelised BatchRetrieve must score the same MAP as serial."""
     self.skip_windows()
     dataset = pt.get_dataset("vaswani")
     br = pt.BatchRetrieve(dataset.get_index())
     systems = [br, br.parallel(3)]
     report = pt.Experiment(systems, dataset.get_topics(),
                            dataset.get_qrels(), ["map", "mrt"])
     self.assertEqual(report.iloc[0]["map"], report.iloc[1]["map"])
Exemplo n.º 9
0
    def test_scoring_text(self):
        """Index vaswani with document text stored via abstract properties,
        then score the stored text with several weighting models through
        _test_scoring_text."""
        pt.logging("DEBUG")
        dataset = pt.get_dataset("vaswani")
        indexer = pt.TRECCollectionIndexer(self.test_dir)
        props = {
            "TaggedDocument.abstracts": "body",
            # ELSE is a special tag name: capture anything not consumed by other tags.
            "TaggedDocument.abstracts.tags": "ELSE",
            # Abstracts are cropped to this length; defaults to empty.
            "TaggedDocument.abstracts.lengths": "2048",
            "indexer.meta.forward.keys": "docno,body",
            "indexer.meta.forward.keylens": "26,2048",
        }
        indexer.setProperties(**props)
        index = pt.IndexFactory.of(indexer.index(dataset.get_corpus()))
        meta = index.getMetaIndex()
        self.assertTrue("body" in meta.getKeys())
        self.assertTrue("compact memories have" in meta.getItem("body", 0))
        print(meta.getItem("body", 1047))

        for wmodel in (
                "org.terrier.python.TestModel$Constant",
                "Tf",
                "org.terrier.python.TestModel$TFOverN",
                "org.terrier.python.TestModel$F",
                "org.terrier.python.TestModel$Nt",
                "DPH"):
            self._test_scoring_text(dataset, index, wmodel)
Exemplo n.º 10
0
 def test_monot5_vaswani(self):
     """MonoT5 re-ranking of the BM25 top-20 on vaswani yields known
     docnos/scores/ranks at the top and bottom of the list."""
     if not pt.started():
         pt.init()
     bm25 = pt.BatchRetrieve(pt.get_dataset('vaswani').get_index(),
                             wmodel='BM25')
     monoT5 = pyterrier_t5.MonoT5ReRanker()
     text_fetch = pt.text.get_text(pt.get_dataset('irds:vaswani'), 'text')
     pipeline = bm25 % 20 >> text_fetch >> monoT5
     result = pipeline.search('fluid dynamics')
     # (iloc position, docno, score, rank)
     for pos, docno, score, rank in [
             (0, '11216', -2.186261, 0),
             (1, '5299', -8.078399, 1),
             (-1, '3442', -12.725513, 19)]:
         row = result.iloc[pos]
         self.assertEqual(row['docno'], docno)
         self.assertAlmostEqual(row['score'], score, places=4)
         self.assertEqual(row['rank'], rank)
Exemplo n.º 11
0
    def test_cache_compose(self):
        """Caching a composed pipeline: a repeated query is a hit, and a
        re-constructed cache over the same pipeline sees the stored entries."""
        pt.cache.CACHE_DIR = self.test_dir
        import pandas as pd
        queries = pd.DataFrame([["q1", "chemical"]], columns=["qid", "query"])
        index = pt.get_dataset("vaswani").get_index()
        br1 = pt.BatchRetrieve(index, wmodel="TF_IDF")
        br2 = pt.BatchRetrieve(index, wmodel="BM25")
        cache = ~(br1 >> br2)
        self.assertEqual(0, len(cache.chest._keys))  # fresh cache is empty
        cache(queries)  # miss
        cache(queries)  # hit
        self.assertEqual(0.5, cache.stats())

        # another cache of the same pipeline must see the same cache entries
        cache2 = ~(br1 >> br2)
        cache2(queries)
        self.assertEqual(1, cache2.stats())

        pt.cache.CACHE_DIR = None
Exemplo n.º 12
0
 def test_sliding(self):
     """Sliding-window passaging produces more passages than source docs."""
     pipeline = pt.text.sliding("text", 10, 10, prepend_attr=None) \
         >> pt.IterDictIndexer(self.test_dir)
     dataset = pt.get_dataset("irds:vaswani")
     indexref = pipeline.index(dataset.get_corpus_iter())
     self.assertIsNotNone(indexref)
     index = pt.IndexFactory.of(indexref)
     n_passages = index.getCollectionStatistics().getNumberOfDocuments()
     self.assertTrue(n_passages > len(dataset.get_corpus_iter()))
Exemplo n.º 13
0
 def test_duot5_vaswani(self):
     """DuoT5 re-ranking of the BM25 top-10 on vaswani yields known
     docnos/scores/ranks at the top and bottom of the list."""
     if not pt.started():
         pt.init()
     bm25 = pt.BatchRetrieve(pt.get_dataset('vaswani').get_index(),
                             wmodel='BM25')
     duoT5 = pyterrier_t5.DuoT5ReRanker()
     text_fetch = pt.text.get_text(pt.get_dataset('irds:vaswani'), 'text')
     pipeline = bm25 % 10 >> text_fetch >> duoT5
     result = pipeline.search('fluid dynamics')
     # (iloc position, docno, score, rank)
     for pos, docno, score, rank in [
             (0, '9731', 44.621585, 0),
             (1, '7045', 27.716750, 1),
             (-1, '4767', -9.916206, 9)]:
         row = result.iloc[pos]
         self.assertEqual(row['docno'], docno)
         self.assertAlmostEqual(row['score'], score, places=4)
         self.assertEqual(row['rank'], rank)
Exemplo n.º 14
0
 def test_parallel_joblib_experiment_br_callback(self):
     """Parallelised retrieval with a Python wmodel callback matches serial MAP."""
     self.skip_windows()
     dataset = pt.get_dataset("vaswani")
     # custom weighting model: raw term frequency
     Tf = lambda keyFreq, posting, entryStats, collStats: posting.getFrequency()
     br = pt.BatchRetrieve(dataset.get_index(), wmodel=Tf)
     report = pt.Experiment([br, br.parallel(3)], dataset.get_topics(),
                            dataset.get_qrels(), ["map", "mrt"])
     self.assertEqual(report.iloc[0]["map"], report.iloc[1]["map"])
Exemplo n.º 15
0
    def test_TREC_indexing_bad_files_type(self):
        """TRECCollectionIndexer.index rejects inputs that are not filenames."""
        print("Writing index to " + self.test_dir)
        # an int is not an acceptable corpus specification
        with self.assertRaises(ValueError):
            pt.TRECCollectionIndexer(self.test_dir).index(5)

        # nor is an iterator of dicts (that is for IterDictIndexer)
        with self.assertRaises(ValueError):
            pt.TRECCollectionIndexer(self.test_dir).index(
                pt.get_dataset("vaswani").get_corpus_iter())
Exemplo n.º 16
0
 def test_baseline(self):
     """Experiment with baseline=0 adds per-measure significance columns."""
     dataset = pt.get_dataset("vaswani")
     systems = [
         pt.BatchRetrieve(dataset.get_index(), wmodel="BM25"),
         pt.BatchRetrieve(dataset.get_index(), wmodel="DPH"),
     ]
     df = pt.Experiment(
         systems,
         dataset.get_topics().head(10),
         dataset.get_qrels(),
         eval_metrics=["map", "ndcg"],
         baseline=0)
     for col in ["map +", "map -", "map p-value"]:
         self.assertTrue(col in df.columns)
Exemplo n.º 17
0
    def test_threading_selfupgrade(self):
        """BatchRetrieve with threads>1 must upgrade a plain IndexRef to a
        concurrent one (requires Terrier 5.5)."""
        if not pt.check_version("5.5"):
            self.skipTest("Requires Terrier 5.5")

        topics = pt.get_dataset("vaswani").get_topics().head(10)

        # build a non-concurrent indexref and let BatchRetrieve upgrade it
        JIR = pt.autoclass('org.terrier.querying.IndexRef')
        plain_ref = JIR.of(self.here + "/fixtures/index/data.properties")
        retriever = pt.BatchRetrieve(plain_ref, threads=5)
        result = retriever.transform(topics)
Exemplo n.º 18
0
 def test_fetch_text_irds(self):
     """get_text over an ir_datasets dataset attaches a 'text' column."""
     dfinput = pd.DataFrame([["q1", "a query", "4"]],
                            columns=["qid", "query", "docno"])
     textT = pt.text.get_text(pt.get_dataset('irds:vaswani'), "text")
     self.assertTrue(isinstance(textT, pt.transformer.TransformerBase))
     dfOut = textT.transform(dfinput)
     self.assertTrue(isinstance(dfOut, pd.DataFrame))
     self.assertTrue("text" in dfOut.columns)
     expected = "the british computer society  report of a conference held in cambridge\njune\n"
     self.assertTrue(expected in dfOut.iloc[0].text)
Exemplo n.º 19
0
 def test_parallel_joblib_ops(self):
     """parallel(3) preserves result counts across several pipeline shapes."""
     dataset = pt.get_dataset("vaswani")
     topics = dataset.get_topics().head(3)
     dph = pt.BatchRetrieve(dataset.get_index())
     tf = pt.BatchRetrieve(dataset.get_index(), wmodel="Tf")
     pipes = [
         dph,
         dph % 10,
         dph >> tf,
         dph + tf,
         pt.apply.query(lambda row: row["query"] + " chemical") >> dph,
     ]
     for pipe in pipes:
         serial_res = pipe(topics)
         parallel_res = pipe.parallel(3)(topics)
         self.assertEqual(len(serial_res), len(parallel_res))
Exemplo n.º 20
0
 def test_gridsearch(self):
     """GridSearch over PL2's c parameter selects the best setting (100)."""
     dataset = pt.get_dataset("vaswani")
     pipe = pt.BatchRetrieve(dataset.get_index(),
                             wmodel="PL2",
                             controls={'c': 1})
     param_grid = {pipe: {'c': [0.1, 1, 5, 10, 20, 100]}}
     tuned = pt.pipelines.GridSearch(
         pipe,
         param_grid,
         dataset.get_topics().head(5),
         dataset.get_qrels())
     self.assertEqual(100, tuned.get_parameter("c"))
Exemplo n.º 21
0
    def test_webtrack_cw09(self):
        """For each ClueWeb09 web-track year, adhoc qrels join to the topics."""
        import pyterrier as pt
        years = ["trec-wt-2009", "trec-wt-2010", "trec-wt-2011", "trec-wt-2012"]
        for k in years:
            ds = pt.get_dataset(k)
            topics = ds.get_topics()
            qrels = ds.get_qrels("adhoc")
            # at least one qrel qid must match a topic qid
            join = topics.merge(qrels, on=["qid"])
            self.assertTrue(len(join) > 0)
Exemplo n.º 22
0
 def test_gridscan_1param(self):
     """GridScan over one parameter returns one entry per tried setting."""
     dataset = pt.get_dataset("vaswani")
     pipe = pt.BatchRetrieve(dataset.get_index(),
                             wmodel="PL2",
                             controls={'c': 1})
     self.assertEqual(1, pipe.get_parameter('c'))
     settings = [0.1, 1, 5, 10, 20, 100]
     scan = pt.GridScan(
         pipe,
         {pipe: {'c': settings}},
         dataset.get_topics().head(5),
         dataset.get_qrels(),
         dataframe=False)
     self.assertEqual(len(settings), len(scan))
Exemplo n.º 23
0
 def test_save_docs_QE(self):
     """stash_results(clear=False) + reset_results makes Bo1 act as a
     re-ranker, so the pipeline returns as many docs as retrieval alone."""
     index = pt.get_dataset("vaswani").get_index()
     dph = pt.BatchRetrieve(index, wmodel="DPH")
     pipe = (dph
             >> pt.rewrite.stash_results(clear=False)
             >> pt.rewrite.Bo1QueryExpansion(index)
             >> pt.rewrite.reset_results()
             >> dph)
     baseline_res = dph.search("chemical reactions")
     reranked_res = pipe.search("chemical reactions")
     # Bo1 should be applied as a re-ranker, hence equal result counts
     self.assertEqual(len(baseline_res), len(reranked_res))
Exemplo n.º 24
0
 def test_sliding_title_one(self):
     """A two-token document with window=2 yields exactly one passage
     (title prepended via prepend_attr).

     Fix: removed an unused `dataset = pt.get_dataset("irds:vaswani")`
     local, a leftover from a copy of test_sliding — the corpus here is
     the inline one-document list, not the dataset.
     """
     corpus = [{"docno": "d1", "text": "A B", "title": "this is a title"}]
     slider = pt.text.sliding("text", 2, 1, prepend_attr="title")
     indexer = pt.IterDictIndexer(self.test_dir)
     pipeline = slider >> indexer
     indexref = pipeline.index(corpus)
     self.assertIsNotNone(indexref)
     index = pt.IndexFactory.of(indexref)
     # we should get 1 passages in the resulting index
     self.assertEqual(
         1,
         index.getCollectionStatistics().getNumberOfDocuments())
Exemplo n.º 25
0
    def test_baseline_and_tests(self):
        """Experiment significance testing: the default t-test, the wilcoxon
        signed-rank test, and a user-supplied test function must each add
        the +/-/p-value columns for the first metric."""
        dataset = pt.get_dataset("vaswani")
        numt = 10
        topics = dataset.get_topics().head(numt)
        res1 = pt.BatchRetrieve(dataset.get_index(), wmodel="BM25")(topics)
        res2 = pt.BatchRetrieve(dataset.get_index(), wmodel="DPH")(topics)

        def check_sig_columns(frame):
            # significance columns are named after the first metric
            self.assertTrue("map +" in frame.columns)
            self.assertTrue("map -" in frame.columns)
            self.assertTrue("map p-value" in frame.columns)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # default: paired t-test
            df = pt.Experiment(
                [res1, res2],
                topics,
                dataset.get_qrels(),
                eval_metrics=["map", "ndcg"],
                baseline=0)
            check_sig_columns(df)

            # wilcoxon signed-rank test
            df = pt.Experiment(
                [res1, res2],
                topics,
                dataset.get_qrels(),
                eval_metrics=["map", "ndcg"],
                test='wilcoxon',
                baseline=0)
            check_sig_columns(df)

        # user-specified TOST
        # TOST will omit warnings here, due to low numbers of topics
        import statsmodels.stats.weightstats
        fn = lambda X,Y: (0, statsmodels.stats.weightstats.ttost_ind(X, Y, -0.01, 0.01)[0])

        #This filter doesnt work
        with warnings.catch_warnings(record=True) as w:
            warnings.filterwarnings("always")
            df = pt.Experiment(
                [res1, res2],
                topics,
                dataset.get_qrels(),
                eval_metrics=["map", "ndcg"],
                test=fn,
                baseline=0)
            print(w)
        check_sig_columns(df)
Exemplo n.º 26
0
 def test_baseline_corrected(self):
     """Multiple-testing correction adds corrected p-value / reject columns
     for each supported correction method.

     Fix: the loop variable `corr` was ignored — every iteration passed
     correction='hs', so 'bonferroni' and 'holm-sidak' were never actually
     exercised.
     """
     dataset = pt.get_dataset("vaswani")
     for corr in ['hs', 'bonferroni', 'holm-sidak']:
         df = pt.Experiment(
             [pt.BatchRetrieve(dataset.get_index(), wmodel="BM25"), pt.BatchRetrieve(dataset.get_index(), wmodel="DPH")], 
             dataset.get_topics().head(10), 
             dataset.get_qrels(),
             eval_metrics=["map", "ndcg"], 
             baseline=0, correction=corr)
         self.assertTrue("map +" in df.columns)
         self.assertTrue("map -" in df.columns)
         self.assertTrue("map p-value" in df.columns)
         self.assertTrue("map p-value corrected" in df.columns)
         self.assertTrue("map reject" in df.columns)
Exemplo n.º 27
0
 def test_fetch_text_irds(self):
     """get_text handles duplicate docnos in the input frame, attaching the
     correct text to every row."""
     dfinput = pd.DataFrame([
         ["q1", "a query", "4"],
         ["q1", "a query", "1"],
         ["q1", "a query", "4"],
         ], columns=["qid", "query", "docno"])
     textT = pt.text.get_text(pt.get_dataset('irds:vaswani'), "text")
     self.assertTrue(isinstance(textT, pt.transformer.TransformerBase))
     dfOut = textT.transform(dfinput)
     self.assertTrue(isinstance(dfOut, pd.DataFrame))
     self.assertTrue("text" in dfOut.columns)
     doc4 = "the british computer society  report of a conference held in cambridge\njune\n"
     doc1 = "compact memories have flexible capacities  a digital data storage\nsystem with capacity up to bits and random and or sequential access\nis described\n"
     for row, expected in [(0, doc4), (1, doc1), (2, doc4)]:
         self.assertTrue(expected in dfOut.iloc[row].text)
Exemplo n.º 28
0
    def test_scoring_manual_background(self):
        """TextScorer scores caller-supplied text against a background index,
        accepting the background either as an index-like reference or as a
        loaded Index object.

        Fix: renamed the local `input` — it shadowed the `input` builtin.
        """
        df_in = pd.DataFrame([["q1", "fox", "d1", "all the fox were fox"]],
                             columns=["qid", "query", "docno", "body"])
        from pyterrier.batchretrieve import TextScorer
        # background index given as an index-like reference
        scorer = TextScorer(
            wmodel="Tf",
            background_index=pt.get_dataset("vaswani").get_index())
        rtr = scorer(df_in)
        self.assertEqual(1, len(rtr))
        self.assertTrue("score" in rtr.columns)
        self.assertEqual(2, rtr.iloc[0]["score"])  # "fox" occurs twice

        # background index given as a loaded Index object
        index_background = pt.IndexFactory.of(
            pt.get_dataset("vaswani").get_index())
        scorer = TextScorer(wmodel="org.terrier.python.TestModel$TFOverN",
                            background_index=index_background)
        rtr = scorer(df_in)
        self.assertEqual(1, len(rtr))
        self.assertTrue("score" in rtr.columns)
        self.assertEqual(
            2 /
            index_background.getCollectionStatistics().getNumberOfDocuments(),
            rtr.iloc[0]["score"])
Exemplo n.º 29
0
    def test_webtrack_gov(self):
        """For the GOV web tracks, each task's qrels must join to its topics."""
        import pyterrier as pt
        for k in ["trec-wt-2002", "trec-wt-2003", "trec-wt-2004"]:
            ds = pt.get_dataset(k)
            for t in ["np", "td", "hp"]:
                if k != "trec-wt-2004":
                    #HP finding only for the 2004 task?
                    # NOTE(review): as written this skips EVERY task type for
                    # 2002/2003, so only trec-wt-2004 is actually checked; the
                    # comment above suggests the intent may have been to skip
                    # only "hp" (i.e. `if t == "hp" and k != "trec-wt-2004"`)
                    # — confirm before changing.
                    continue
                topics = ds.get_topics(t)
                qrels = ds.get_qrels(t)

                #check that the qrels qid match the topics.
                join = topics.merge(qrels, on=["qid"])
                self.assertTrue(len(join) > 0)
Exemplo n.º 30
0
    def test_add_dup(self):
        """A generic transform that duplicates every document (adding a "bis"
        docno suffix) doubles the number of indexed documents."""

        def duplicate_docs(df):
            clone = df.copy()
            clone["docno"] = clone["docno"] + "bis"
            return pd.concat([df, clone])

        pipeline = pt.apply.generic(duplicate_docs) >> pt.IterDictIndexer(self.test_dir)
        dataset = pt.get_dataset("irds:vaswani")
        indexref = pipeline.index(dataset.get_corpus_iter())
        self.assertIsNotNone(indexref)
        index = pt.IndexFactory.of(indexref)
        self.assertEqual(
            index.getCollectionStatistics().getNumberOfDocuments(),
            2 * len(dataset.get_corpus_iter()))