Exemplo n.º 1
0
    def test_simple_fusion_searcher(self):
        """Check RRF fusion over three CORD-19 indexes against a reference run.

        Every topic returned by ``get_topics('covid_round2')`` is searched
        with a ``SimpleFusionSearcher`` built over the abstract, full-text
        and paragraph indexes. The per-topic results are concatenated, saved
        to ``fused.txt`` and compared with ``anserini.covid-r2.fusion1.txt``
        on the topic, docid and rank columns only.

        Raises:
            FileNotFoundError: if either run file is missing (the previous
                shell-based comparison silently produced an empty file).
            ValueError: if a run line has fewer than four fields.
        """
        def keep_topic_docid_rank(run_path, out_path):
            # In-process replacement for `awk '{print $1" "$3" "$4}'`.
            # Running it in Python removes the dependency on an external
            # awk binary and surfaces I/O errors instead of ignoring the
            # shell's exit status.
            with open(run_path, encoding='utf-8') as src, \
                    open(out_path, 'w', encoding='utf-8', newline='\n') as dst:
                for line in src:
                    fields = line.split()
                    if len(fields) < 4:
                        raise ValueError(
                            '%s: expected at least 4 whitespace-separated '
                            'fields, got %r' % (run_path, line))
                    dst.write(' '.join((fields[0], fields[2], fields[3])) + '\n')

        index_dirs = [
            'lucene-index-cord19-abstract-2020-05-01/',
            'lucene-index-cord19-full-text-2020-05-01/',
            'lucene-index-cord19-paragraph-2020-05-01/'
        ]

        searcher = SimpleFusionSearcher(index_dirs, method=FusionMethod.RRF)

        runs, topics = [], get_topics('covid_round2')
        for topic in tqdm(sorted(topics.keys())):
            # The query text is the topic's question followed by its keyword query.
            query = topics[topic]['question'] + ' ' + topics[topic]['query']
            hits = searcher.search(query,
                                   k=10000,
                                   query_generator=None,
                                   strip_segment_id=True,
                                   remove_dups=True)
            docid_score_pair = [(hit.docid, hit.score) for hit in hits]
            run = TrecRun.from_search_results(docid_score_pair, topic=topic)
            runs.append(run)

        all_topics_run = TrecRun.concat(runs)
        all_topics_run.save_to_txt(output_path='fused.txt',
                                   tag='reciprocal_rank_fusion_k=60')

        # Only keep topic, docid and rank. Scores have different floating point precisions.
        keep_topic_docid_rank('fused.txt', 'this.txt')
        keep_topic_docid_rank('anserini.covid-r2.fusion1.txt', 'that.txt')

        # shallow=False forces a byte-by-byte comparison. The default shallow
        # mode treats files with identical os.stat() signatures (type, size,
        # mtime) as equal without reading them, which can mask a ranking
        # difference between two same-sized files written moments apart.
        self.assertTrue(filecmp.cmp('this.txt', 'that.txt', shallow=False))
Exemplo n.º 2
0
    def test_simple_fusion_searcher(self):
        """Check RRF fusion over three CORD-19 indexes against a reference run.

        Every topic returned by ``get_topics('covid-round2')`` is searched
        with a ``SimpleFusionSearcher`` built over the abstract, full-text
        and paragraph indexes under ``indexes/``. The per-topic results are
        concatenated, saved to ``runs/fused.txt`` and compared with
        ``runs/anserini.covid-r2.fusion1.txt`` on the topic, docid and rank
        columns, restricted to ranks 1-100.

        Raises:
            FileNotFoundError: if either run file is missing (the previous
                shell-based comparison silently produced an empty file).
            ValueError: if a run line has fewer than four fields or a
                non-integer rank.
        """
        max_rank = 100

        def keep_top_topic_docid_rank(run_path, out_path):
            # In-process replacement for
            # `awk '$4 <= 100 {print $1" "$3" "$4}'`. Running it in Python
            # removes the dependency on an external awk binary and surfaces
            # I/O errors instead of ignoring the shell's exit status.
            with open(run_path, encoding='utf-8') as src, \
                    open(out_path, 'w', encoding='utf-8', newline='\n') as dst:
                for line in src:
                    fields = line.split()
                    if len(fields) < 4:
                        raise ValueError(
                            '%s: expected at least 4 whitespace-separated '
                            'fields, got %r' % (run_path, line))
                    if int(fields[3]) <= max_rank:
                        dst.write(' '.join((fields[0], fields[2], fields[3])) + '\n')

        index_dirs = ['indexes/lucene-index-cord19-abstract-2020-05-01/',
                      'indexes/lucene-index-cord19-full-text-2020-05-01/',
                      'indexes/lucene-index-cord19-paragraph-2020-05-01/']

        searcher = SimpleFusionSearcher(index_dirs, method=FusionMethod.RRF)

        runs, topics = [], get_topics('covid-round2')
        for topic in tqdm(sorted(topics.keys())):
            # The query text is the topic's question followed by its keyword query.
            query = topics[topic]['question'] + ' ' + topics[topic]['query']
            hits = searcher.search(query, k=10000, query_generator=None, strip_segment_id=True, remove_dups=True)
            docid_score_pair = [(hit.docid, hit.score) for hit in hits]
            run = TrecRun.from_search_results(docid_score_pair, topic=topic)
            runs.append(run)

        all_topics_run = TrecRun.concat(runs)
        all_topics_run.save_to_txt(output_path='runs/fused.txt', tag='reciprocal_rank_fusion_k=60')

        # Only keep topic, docid, and rank. Scores may be slightly different due to floating point precision issues and underlying lib versions.
        # This has also proven to be a somewhat brittle test, see https://github.com/castorini/pyserini/issues/947
        # A stopgap for above issue, we're restricting comparison to only top-100 ranks.
        keep_top_topic_docid_rank('runs/fused.txt', 'runs/this.txt')
        keep_top_topic_docid_rank('runs/anserini.covid-r2.fusion1.txt', 'runs/that.txt')

        # shallow=False forces a byte-by-byte comparison. The default shallow
        # mode treats files with identical os.stat() signatures (type, size,
        # mtime) as equal without reading them, which can mask a ranking
        # difference between two same-sized files written moments apart.
        self.assertTrue(filecmp.cmp('runs/this.txt', 'runs/that.txt', shallow=False))