Example #1
 def to_relevance_examples(self,
                           index_path: str,
                           is_duo: bool = False) -> List[RelevanceExample]:
     loader = MsMarcoPassageLoader(index_path)
     example_map = {}
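     # example_map[qid] -> [query text, candidate doc ids, candidate passage texts, relevance labels]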
     for (qid, text, rel_cands), cands in self.query_passage_tuples():
         if qid not in example_map:
             example_map[qid] = [convert_to_unicode(text), [], [], []]
         example_map[qid][1].append(cands[0])
         try:
             passages = [loader.load_passage(cand) for cand in cands]
             example_map[qid][2].append([
                 convert_to_unicode(passage.all_text)
                 for passage in passages
             ][0])
         except ValueError:
             logging.warning(f'Skipping {cands}')
             continue
         example_map[qid][3].append(cands[0] in rel_cands)
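     # Expected P@1, R@1000, MRR and MRR@10 under a random ordering of each query's candidates,
     # plus the MRR of the existing candidate ordering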
     mean_stats = defaultdict(list)
     for ex in self.examples:
         int_rels = np.array(list(map(int, example_map[ex.qid][3])))
         p = int_rels.sum() / (len(ex.candidates) -
                               1) if is_duo else int_rels.sum()
         mean_stats['Random P@1'].append(np.mean(int_rels))
         n = len(ex.candidates) - p
         N = len(ex.candidates)
         if len(ex.candidates) <= 1000:
             mean_stats['Random R@1000'].append(1 if 1 in int_rels else 0)
         numer = np.array(
             [sp.comb(n, i) / (N - i)
              for i in range(0, n + 1) if i != N]) * p
         if n == N:
             numer = np.append(numer, 0)
         denom = np.array([sp.comb(N, i) for i in range(0, n + 1)])
         rr = 1 / np.arange(1, n + 2)
         rmrr = np.sum(numer * rr / denom)
         mean_stats['Random MRR'].append(rmrr)
         rmrr10 = np.sum(numer[:10] * rr[:10] / denom[:10])
         mean_stats['Random MRR@10'].append(rmrr10)
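         # Reciprocal rank of the first relevant candidate in the existing candidate ordering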
         ex_index = len(ex.candidates)
         for rel_cand in ex.relevant_candidates:
             if rel_cand in ex.candidates:
                 ex_index = min(ex.candidates.index(rel_cand), ex_index)
         mean_stats['Existing MRR'].append(
             1 / (ex_index + 1) if ex_index < len(ex.candidates) else 0)
         mean_stats['Existing MRR@10'].append(1 /
                                              (ex_index +
                                               1) if ex_index < 10 else 0)
     for k, v in mean_stats.items():
         logging.info(f'{k}: {np.mean(v)}')
     return [
         RelevanceExample(
             Query(text=query_text, id=qid),
             list(
                 map(lambda s: Text(s[1], dict(docid=s[0])),
                     zip(cands, cands_text))), rel_cands)
         for qid, (query_text, cands, cands_text,
                   rel_cands) in example_map.items()
     ]
Example #2
 def to_senticized_dataset(self,
                           index_path: str,
                           split: str = 'nq') -> List[RelevanceExample]:
     loader = Cord19DocumentLoader(index_path)
     tokenizer = SpacySenticizer()
     example_map = OrderedDict()
     rel_map = OrderedDict()
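     # Map (query, doc id) -> document sentences (example_map) and per-sentence relevance flags (rel_map)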
     for query, document in self.query_answer_pairs(split=split):
         if document.id == MISSING_ID:
             logging.warning(f'Skipping {document.title} (missing ID)')
             continue
         key = (query, document.id)
         try:
             doc = loader.load_document(document.id)
             example_map.setdefault(key, tokenizer(doc.all_text))
         except ValueError as e:
             logging.warning(f'Skipping {document.id} ({e})')
             continue
         sents = example_map[key]
         rel_map.setdefault(key, [False] * len(sents))
         for idx, s in enumerate(sents):
             if document.exact_answer in s:
                 rel_map[key][idx] = True
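     # Expected P@1, R@3 and MRR under a random ordering of each document's sentences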
     mean_stats = defaultdict(list)
     for (_, doc_id), rels in rel_map.items():
         int_rels = np.array(list(map(int, rels)))
         p = int_rels.sum()
         mean_stats['Average spans'].append(p)
         mean_stats['Expected P@1 for Random Ordering'].append(
             np.mean(int_rels))
         n = len(int_rels) - p
         N = len(int_rels)
         mean_stats['Expected R@3 for Random Ordering'].append(1 -
                                                               (n *
                                                                (n - 1) *
                                                                (n - 2)) /
                                                               (N *
                                                                (N - 1) *
                                                                (N - 2)))
         numer = np.array(
             [sp.comb(n, i) / (N - i) for i in range(0, n + 1)]) * p
         denom = np.array([sp.comb(N, i) for i in range(0, n + 1)])
         rr = 1 / np.arange(1, n + 2)
         rmrr = np.sum(numer * rr / denom)
         mean_stats['Expected MRR for Random Ordering'].append(rmrr)
         if not any(rels):
             logging.warning(f'{doc_id} has no relevant answers')
     for k, v in mean_stats.items():
         logging.info(f'{k}: {np.mean(v)}')
     return [
         RelevanceExample(
             Query(query),
             list(map(lambda s: Text(s, dict(docid=docid)), sents)), rels)
         for ((query, docid),
              sents), (_, rels) in zip(example_map.items(), rel_map.items())
     ]
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name_or_path",
                        default='unicamp-dl/mt5-base-multi-msmarco',
                        type=str,
                        required=False,
                        help="Reranker model.")
    parser.add_argument("--initial_run",
                        default=None,
                        type=str,
                        required=True,
                        help="Initial run to be reranked.")
    parser.add_argument("--corpus",
                        default=None,
                        type=str,
                        required=True,
                        help="Document collection.")
    parser.add_argument("--output_run",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to save the reranked run.")
    parser.add_argument("--queries",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the queries file.")

    args = parser.parse_args()
    model = MonoT5(args.model_name_or_path)
    run = load_run(args.initial_run)
    corpus = load_corpus(args.corpus)
    queries = load_queries(args.queries)

    # Run reranker
    trec = open(args.output_run + '-trec.txt', 'w')
    marco = open(args.output_run + '-marco.txt', 'w')
    for idx, query_id in enumerate(tqdm(run.keys())):
        query = Query(queries[query_id])
        texts = [
            Text(corpus[doc_id], {'docid': doc_id}, 0)
            for doc_id in run[query_id]
        ]
        reranked = model.rerank(query, texts)
        for rank, document in enumerate(reranked):
            trec.write(
                f'{query_id}\tQ0\t{document.metadata["docid"]}\t{rank+1}\t{document.score}\t{args.model_name_or_path}\n'
            )
            marco.write(
                f'{query_id}\t{document.metadata["docid"]}\t{rank+1}\n')
    trec.close()
    marco.close()
    print("Done!")
Example #4
    def rerank(self, query, hits):
        if self.reranker is None:
            logging.info("Reranker not available, skipping reranking")
            return hits

        reranked = self.reranker.rerank(Query(query), hits_to_texts(hits))
        reranked_scores = [r.score for r in reranked]

        # Reorder hits with reranker scores
        reranked = list(zip(hits, reranked_scores))
        reranked.sort(key=lambda x: x[1], reverse=True)
        reranked_hits = [r[0] for r in reranked]
        return reranked_hits
Example #5
def main(output_path=OUTPUT_PATH,
         index_path=INDEX_PATH,
         queries_path=QUERIES_PATH,
         run=RUN,
         k=K):
    print('################################################')
    print("##### Performing Passage Ranking using L2R #####")
    print('################################################')
    print("Output will be placed in:", output_path,
          ", format used will be TREC")
    print('Loading pre-trained model MonoT5...')
    from pygaggle.rerank.transformer import MonoT5
    reranker = MonoT5()

    print('Fetching anserini-like indices from:', index_path)
    # fetch some passages to rerank from MS MARCO with Pyserini (BM25)
    searcher = SimpleSearcher(index_path)
    print('Loading queries from:', queries_path)
    with open(queries_path, 'r') as f:
        content = f.readlines()
        content = [x.strip().split('\t') for x in content]
        queries = [Query(x[1], x[0]) for x in content]
    print(f'Ranking queries using BM25 (k={k})')
    queries_text = []
    for query in tqdm(queries):
        hits = searcher.search(query.text, k=k)
        texts = hits_to_texts(hits)
        queries_text.append(texts)

    print('Reranking all queries using MonoT5!')
    rankings = []

    for (i, query) in enumerate(tqdm(queries)):
        reranked = reranker.rerank(query, queries_text[i])
        reranked.sort(key=lambda x: x.score, reverse=True)
        rankings.append(reranked)

    print('Outputting to file...')
    if '.tsv' in output_path:
        output_to_tsv(queries, rankings, run, output_path)
    elif '.csv' in output_path:
        output_to_csv(queries, rankings, run, output_path)
    else:
        print(
            'ERROR: invalid output file format provided, please use either .csv or .tsv. Exiting'
        )
        sys.exit(1)
    print('SUCCESS: completed reranking, you may check the output at:',
          output_path)
    sys.exit(0)
Example #6
 def Bert_Score(self, q, doc_dic_for_bert):
     chunk_scores = {}
     query = Query(q)
     texts = [Text(p[1], {'docid': p[0]}, 0) for p in doc_dic_for_bert]
     reranked = reranker.rerank(query, texts)
     reranked.sort(key=lambda x: x.score, reverse=True)
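     # Keep the ten highest-scoring chunks, keyed by their stopword- and punctuation-stripped text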
     for i in range(0, 10):
         chunk_text = reranked[i].text
         word_tokens = word_tokenize(chunk_text)
         filtered_sentence = [w for w in word_tokens if not w in stop_words]
         filtered_sentence = (" ").join(filtered_sentence).translate(
             str.maketrans('', '', string.punctuation))
         chunk_scores[filtered_sentence] = round(reranked[i].score, 3)
         #print(f'{i+1:2} {reranked[i].score:.5f} {reranked[i].text}')
     return chunk_scores
Example #7
    def test_basic(self):
        hits = self.searcher.search('information retrieval')

        self.assertTrue(isinstance(hits, List))

        self.assertTrue(isinstance(hits[0], JSimpleSearcherResult))
        self.assertEqual('CACM-3134', hits[0].docid)
        self.assertEqual(3133, hits[0].lucene_docid)
        self.assertEqual(1500, len(hits[0].contents))
        self.assertEqual(1532, len(hits[0].raw))
        self.assertAlmostEqual(4.76550, hits[0].score, places=5)

        texts = hits_to_texts(hits)
        self.assertEqual(len(hits), len(texts))
        self.assertTrue(isinstance(texts, List))
        self.assertTrue(isinstance(texts[0], Text))

        for i in range(0, len(hits)):
            self.assertEqual(hits[i].raw, texts[i].text)
            self.assertAlmostEqual(hits[i].score, texts[i].score, places=5)

        query = Query('dummy query')
        identity_reranker = IdentityReranker()
        self.assertTrue(isinstance(identity_reranker, Reranker))

        output = identity_reranker.rerank(query, texts)

        # Check that reranked output is indeed the same as the input
        for i in range(0, len(hits)):
            self.assertEqual(texts[i].text, output[i].text)
            self.assertEqual(texts[i].metadata, output[i].metadata)
            self.assertAlmostEqual(texts[i].score, output[i].score, places=5)

        # Check that the identity rerank was not destructive
        texts = []
        for i in range(0, len(hits)):
            self.assertEqual(hits[i].raw, output[i].text)
            self.assertAlmostEqual(hits[i].score, output[i].score, places=5)
Example #8
    def to_relevance_examples(self, index_path: str) -> List[RelevanceExample]:
        loader = Cord19AbstractLoader(index_path)
        example_map = {}
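        # example_map[qid] -> [query text, candidate doc ids, abstracts, relevance labels, titles]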
        for (qid, text,
             rel_cands), cands in tqdm(self.query_document_tuples()):
            if qid not in example_map:
                example_map[qid] = [convert_to_unicode(text), [], [], [], []]
            example_map[qid][1].append(cands[0])
            try:
                passages = [loader.load_document(cand) for cand in cands]
                # Sometimes this abstract is empty.
                example_map[qid][2].append([
                    convert_to_unicode(passage.abstract)
                    for passage in passages
                ][0])
                example_map[qid][4].append([
                    convert_to_unicode(passage.title) for passage in passages
                ][0])
            except ValueError as e:
                logging.error(e)
                logging.warning('Skipping passages')
                continue
            example_map[qid][3].append(cands[0] in rel_cands)
        mean_stats = defaultdict(list)
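        # Expected metrics (P@1, R@1000, MRR, MRR@10) for a randomly ordered candidate list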

        for ex in self.examples:
            int_rels = np.array(list(map(int, example_map[ex.qid][3])))
            p = int(int_rels.sum())
            mean_stats['Expected P@1 for Random Ordering'].append(
                np.mean(int_rels))
            n = len(ex.candidates) - p
            N = len(ex.candidates)
            if len(ex.candidates) <= 1000:
                mean_stats['Expected R@1000 for Random Ordering'].append(
                    1 if 1 in int_rels else 0)
            numer = np.array(
                [sp.comb(n, i) / (N - i)
                 for i in range(0, n + 1) if i != N]) * p
            if n == N:
                numer = np.append(numer, 0)
            denom = np.array([sp.comb(N, i) for i in range(0, n + 1)])
            rr = 1 / np.arange(1, n + 2)
            rmrr = np.sum(numer * rr / denom)
            mean_stats['Expected MRR for Random Ordering'].append(rmrr)
            rmrr10 = np.sum(numer[:10] * rr[:10] / denom[:10])
            mean_stats['Expected MRR@10 for Random Ordering'].append(rmrr10)
            ex_index = len(ex.candidates)
            for rel_cand in ex.relevant_candidates:
                if rel_cand in ex.candidates:
                    ex_index = min(ex.candidates.index(rel_cand), ex_index)
            mean_stats['Existing MRR'].append(
                1 / (ex_index + 1) if ex_index < len(ex.candidates) else 0)
            mean_stats['Existing MRR@10'].append(1 /
                                                 (ex_index +
                                                  1) if ex_index < 10 else 0)
        for k, v in mean_stats.items():
            logging.info(f'{k}: {np.mean(v)}')
        rel = [
            RelevanceExample(
                Query(text=query_text, id=qid),
                list(
                    map(lambda s: Text(s[1], dict(docid=s[0]), title=s[2]),
                        zip(cands, cands_text, title))), rel_cands)
            for qid, (query_text, cands, cands_text, rel_cands,
                      title) in example_map.items()
        ]
        return rel
Example #9
            continue
        n_docs += 1
        doc_text = corpus[doc_id]
        doc = nlp(doc_text[:10000])
        sentences = [str(sent).strip() for sent in doc.sents]
        for i in range(0, len(sentences), args.stride):
            segment = ' '.join(sentences[i:i + args.max_length])
            passages.append([doc_id, segment])
            n_segments += 1
            if i + args.max_length >= len(sentences):
                break

    print(f'{query_id}: Reranking...')

    # Reranker using pygaggle
    query = Query(query_text)
    texts = [Text(p[1], {'docid': p[0]}, 0) for p in passages]
    start = time()
    ranked_results = reranker.rerank(query, texts)
    end = time()
    elapsed_time = end - start
    print("Time Elapsed: {:.1f}".format(elapsed_time))

    # Get scores from reranker
    final_t5_scores = {}
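    # Keep the highest segment score for each document (max-pooling over its segments)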
    for result in ranked_results:
        if result.metadata["docid"] not in final_t5_scores:
            final_t5_scores[result.metadata["docid"]] = result.score
        else:
            if final_t5_scores[result.metadata["docid"]] < result.score:
                final_t5_scores[result.metadata["docid"]] = result.score
Example #10
 def rerank(self, query: str, texts: List[str]) -> List[float]:
     ranked_results = self.ranker.rerank(Query(query),
                                         [Text(t) for t in texts])
     scores = [r.score for r in ranked_results]
     return scores
Example #11
def main():
    apb = ArgumentParserBuilder()
    apb.add_opts(
        opt('--task', type=str, default='wikipedia'),
        opt('--method', type=str, required=True, choices=METHOD_CHOICES),
        opt('--retrieval-file',
            type=Path,
            required=True,
            help=
            'JSON file containing top passages selected by the retrieval model'
            ),
        opt('--model-name',
            type=str,
            default='facebook/dpr-reader-single-nq-base',
            help='Pretrained model for reader'),
        opt('--tokenizer-name',
            type=str,
            default='facebook/dpr-reader-single-nq-base',
            help='Pretrained model for tokenizer'),
        opt('--num-spans',
            type=int,
            default=1,
            help='Number of answer spans to return'),
        opt('--max-answer-length',
            type=int,
            default=10,
            help='Maximum length that an answer span can be'),
        opt('--num-spans-per-passage',
            type=int,
            default=10,
            help='Maximum number of answer spans to return per passage'),
        opt('--output-file',
            type=Path,
            default=None,
            help=
            'File to output predictions for each example; if not specified, this output will be discarded'
            ),
        opt('--device',
            type=str,
            default='cuda:0',
            help='Device for model computations'),
        opt('--batch-size',
            type=int,
            default=16,
            help='batch size of reader inference'),
        opt('--topk-retrieval',
            type=int,
            default=[],
            nargs='+',
            help='Values of k to print the topk accuracy of the retrieval file'
            ),
        opt('--topk-em',
            type=int,
            default=[50],
            nargs='+',
            help='Values of k to print the topk exact match score'),
    )
    args = apb.parser.parse_args()
    options = PassageReadingEvaluationOptions(**vars(args))

    logging.info("Loading the Retrieval File")
    with open(options.retrieval_file) as f:
        data = json.load(f)

    if args.topk_retrieval:
        logging.info("Evaluating Topk Retrieval Accuracies")
        subprocess.call([
            'python', 'tools/scripts/dpr/evaluate_retrieval.py', '--retrieval',
            options.retrieval_file, '--topk', *map(str, args.topk_retrieval)
        ])

    logging.info("Loading Reader Model and Tokenizer")
    construct_map = dict(dpr=construct_dpr, )
    reader = construct_map[options.method](options)

    evaluator = ReaderEvaluator(reader)

    max_topk_passages = max(options.topk_em)
    examples = []
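    # Each context's "text" field holds the title on its first line and the passage body after it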
    for _, item in data.items():
        examples.append(
            RetrievalExample(
                query=Query(text=item["question"]),
                texts=list(
                    map(
                        lambda context: Text(text=context["text"].split(
                            '\n', 1)[1].replace('""', '"'),
                                             title=context["text"].split(
                                                 '\n', 1)[0].replace('"', '')),
                        item["contexts"]))[:max_topk_passages],
                ground_truth_answers=item["answers"],
            ))
    dpr_predictions = [] if args.output_file is not None else None

    ems = evaluator.evaluate(examples, options.topk_em, dpr_predictions)

    logging.info('Reader completed')

    for k in options.topk_em:
        em = np.mean(np.array(ems[k])) * 100.
        logging.info(f'Top{k}\tExact Match Accuracy: {em}')

    if args.output_file is not None:
        with open(args.output_file, 'w', encoding='utf-8', newline='\n') as f:
            json.dump(dpr_predictions, f, indent=4)
Example #12
    print(len(query_dict))

    pass_dict = {}
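    # qid -> {docid: passage body}, parsed from the .trec_json run file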
    with open(f"{args.input_file}.trec_json") as f:
        for line in f:
            a, b = line.strip().split("indri #")
            qid, _, docid, _, _ = a.strip().split()
            if qid not in pass_dict:
                pass_dict[qid] = {}
            temp = json.loads(b)
            pass_dict[qid][docid] = temp["doc"][
                "body"]  # + " " + temp["doc"]["title"]
    print(len(pass_dict))

    lines = []
    for qid in tqdm(query_dict):
        query = Query(query_dict[qid])
        passages = pass_dict[qid].items()
        texts = [Text(p[1], {'docid': p[0]}, 0) for p in passages]

        reranked = reranker.rerank(query, texts)
        reranked.sort(key=lambda x: x.score, reverse=True)

        for i, res in enumerate(reranked, start=1):
            docid = res.metadata["docid"]
            score = res.score
            lines.append(f"{qid} Q0 {docid} {i} {score} bert\n")

    with open(f"{args.input_file}_{args.query_field}_t5", "w") as f:
        f.write("".join(lines))