Example #1
 def to_relevance_examples(self,
                           index_path: str,
                           is_duo: bool = False) -> List[RelevanceExample]:
     loader = MsMarcoPassageLoader(index_path)
     example_map = {}
     for (qid, text, rel_cands), cands in self.query_passage_tuples():
         if qid not in example_map:
             example_map[qid] = [convert_to_unicode(text), [], [], []]
         example_map[qid][1].append(cands[0])
         try:
             passages = [loader.load_passage(cand) for cand in cands]
             example_map[qid][2].append(
                 convert_to_unicode(passages[0].all_text))
         except ValueError:
             # `passages` is unbound if load_passage raised, so log the
             # candidate ids instead.
             logging.warning(f'Skipping {cands}')
             continue
         example_map[qid][3].append(cands[0] in rel_cands)
     mean_stats = defaultdict(list)
     for ex in self.examples:
         int_rels = np.array(list(map(int, example_map[ex.qid][3])))
         p = (int_rels.sum() / (len(ex.candidates) - 1)
              if is_duo else int_rels.sum())
         mean_stats['Random P@1'].append(np.mean(int_rels))
         n = len(ex.candidates) - p
         N = len(ex.candidates)
         if len(ex.candidates) <= 1000:
             mean_stats['Random R@1000'].append(1 if 1 in int_rels else 0)
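         # Closed-form expected MRR under a uniformly random ordering:
         # P(first relevant at rank i + 1) = C(n, i) / C(N, i) * p / (N - i);
         # the i != N guard avoids dividing by zero when p == 0.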
         numer = np.array(
             [sp.comb(n, i) / (N - i)
              for i in range(0, n + 1) if i != N]) * p
         if n == N:
             numer = np.append(numer, 0)
         denom = np.array([sp.comb(N, i) for i in range(0, n + 1)])
         rr = 1 / np.arange(1, n + 2)
         rmrr = np.sum(numer * rr / denom)
         mean_stats['Random MRR'].append(rmrr)
         rmrr10 = np.sum(numer[:10] * rr[:10] / denom[:10])
         mean_stats['Random MRR@10'].append(rmrr10)
         ex_index = len(ex.candidates)
         for rel_cand in ex.relevant_candidates:
             if rel_cand in ex.candidates:
                 ex_index = min(ex.candidates.index(rel_cand), ex_index)
         mean_stats['Existing MRR'].append(
             1 / (ex_index + 1) if ex_index < len(ex.candidates) else 0)
         mean_stats['Existing MRR@10'].append(
             1 / (ex_index + 1) if ex_index < 10 else 0)
     for k, v in mean_stats.items():
         logging.info(f'{k}: {np.mean(v)}')
     return [
         RelevanceExample(
             Query(text=query_text, id=qid),
             [Text(text, dict(docid=docid))
              for docid, text in zip(cands, cands_text)],
             rel_cands)
         for qid, (query_text, cands, cands_text,
                   rel_cands) in example_map.items()
     ]
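The 'Random MRR' statistics above use a closed form for the expected MRR of a uniformly random ordering. A minimal standalone sketch (my own illustration, not part of the example) that checks this closed form against a Monte Carlo simulation, assuming at least one relevant candidate:

import numpy as np
from scipy import special as sp

def expected_mrr(p: int, N: int) -> float:
    """Expected MRR when p of N candidates are relevant and the ordering is
    uniformly random. P(first relevant at rank i + 1) = C(n, i) / C(N, i)
    * p / (N - i), with n = N - p non-relevant candidates. Assumes p >= 1."""
    n = N - p
    hit_at = np.array([sp.comb(n, i) / sp.comb(N, i) * p / (N - i)
                       for i in range(n + 1)])
    return float(np.sum(hit_at / np.arange(1, n + 2)))

def simulated_mrr(p: int, N: int, trials: int = 100_000) -> float:
    rng = np.random.default_rng(0)
    rels = np.array([1] * p + [0] * (N - p))
    return float(np.mean([1 / (np.argmax(rng.permutation(rels)) + 1)
                          for _ in range(trials)]))

print(expected_mrr(2, 10), simulated_mrr(2, 10))  # these should agree closely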
Example #2
 def to_senticized_dataset(self,
                           index_path: str,
                           split: str = 'nq') -> List[RelevanceExample]:
     loader = Cord19DocumentLoader(index_path)
     tokenizer = SpacySenticizer()
     example_map = OrderedDict()
     rel_map = OrderedDict()
     for query, document in self.query_answer_pairs(split=split):
         if document.id == MISSING_ID:
             logging.warning(f'Skipping {document.title} (missing ID)')
             continue
         key = (query, document.id)
         try:
             doc = loader.load_document(document.id)
             example_map.setdefault(key, tokenizer(doc.all_text))
         except ValueError as e:
             logging.warning(f'Skipping {document.id} ({e})')
             continue
         sents = example_map[key]
         rel_map.setdefault(key, [False] * len(sents))
         for idx, s in enumerate(sents):
             if document.exact_answer in s:
                 rel_map[key][idx] = True
     mean_stats = defaultdict(list)
     for (_, doc_id), rels in rel_map.items():
         int_rels = np.array(list(map(int, rels)))
         p = int_rels.sum()
         mean_stats['Average spans'].append(p)
         mean_stats['Expected P@1 for Random Ordering'].append(
             np.mean(int_rels))
         n = len(int_rels) - p
         N = len(int_rels)
         mean_stats['Expected R@3 for Random Ordering'].append(
             1 - (n * (n - 1) * (n - 2)) / (N * (N - 1) * (N - 2)))
         numer = np.array(
             [sp.comb(n, i) / (N - i) for i in range(0, n + 1)]) * p
         denom = np.array([sp.comb(N, i) for i in range(0, n + 1)])
         rr = 1 / np.arange(1, n + 2)
         rmrr = np.sum(numer * rr / denom)
         mean_stats['Expected MRR for Random Ordering'].append(rmrr)
         if not any(rels):
             logging.warning(f'{doc_id} has no relevant answers')
     for k, v in mean_stats.items():
         logging.info(f'{k}: {np.mean(v)}')
     return [
         RelevanceExample(
             Query(query),
             [Text(sent, dict(docid=docid)) for sent in sents],
             rels)
         for ((query, docid), sents), (_, rels)
         in zip(example_map.items(), rel_map.items())
     ]
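The 'Expected R@3' closed form above is the probability that a uniformly random ordering places at least one of the p relevant sentences in the top 3. A small sketch (illustrative, not from the source) generalizing it to any cutoff k:

def expected_recall_at_k(p: int, N: int, k: int = 3) -> float:
    """Probability that at least one of p relevant items among N appears in
    the top k of a uniformly random ordering: 1 - P(top k all non-relevant).
    Assumes N >= k."""
    n = N - p
    miss = 1.0
    for j in range(k):
        miss *= max(n - j, 0) / (N - j)  # j-th top slot is also non-relevant
    return 1 - miss

print(expected_recall_at_k(p=2, N=10))  # matches 1 - (8 * 7 * 6) / (10 * 9 * 8)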
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name_or_path",
                        default='unicamp-dl/mt5-base-multi-msmarco',
                        type=str,
                        required=False,
                        help="Reranker model.")
    parser.add_argument("--initial_run",
                        default=None,
                        type=str,
                        required=True,
                        help="Initial run to be reranked.")
    parser.add_argument("--corpus",
                        default=None,
                        type=str,
                        required=True,
                        help="Document collection.")
    parser.add_argument("--output_run",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to save the reranked run.")
    parser.add_argument("--queries",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the queries file.")

    args = parser.parse_args()
    model = MonoT5(args.model_name_or_path)
    run = load_run(args.initial_run)
    corpus = load_corpus(args.corpus)
    queries = load_queries(args.queries)

    # Run the reranker, sort by score, and write both run formats
    with open(args.output_run + '-trec.txt', 'w') as trec, \
            open(args.output_run + '-marco.txt', 'w') as marco:
        for query_id in tqdm(run.keys()):
            query = Query(queries[query_id])
            texts = [
                Text(corpus[doc_id], {'docid': doc_id}, 0)
                for doc_id in run[query_id]
            ]
            reranked = model.rerank(query, texts)
            # rerank() assigns scores but does not reorder the texts, so
            # sort by score before assigning ranks.
            reranked.sort(key=lambda x: x.score, reverse=True)
            for rank, document in enumerate(reranked):
                trec.write(
                    f'{query_id}\tQ0\t{document.metadata["docid"]}\t'
                    f'{rank + 1}\t{document.score}\t{args.model_name_or_path}\n')
                marco.write(
                    f'{query_id}\t{document.metadata["docid"]}\t{rank + 1}\n')
    print("Done!")
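For reference, the two files written above follow the standard run formats (the sample lines below use illustrative values): the TREC format is 'qid Q0 docid rank score tag' and the MS MARCO format is 'qid docid rank', both tab-separated here.

trec:  1048585	Q0	7187158	1	-0.2573	unicamp-dl/mt5-base-multi-msmarco
marco: 1048585	7187158	1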
Example #4
    def segment(self, documents: List[Text], seg_size: int,
                stride: int) -> SegmentGroup:
        """
        Breaks each document into segments. For example, given a document with sentences [1, 2, 3, 4, 5], a seg_size
        of 3, and a stride of 2, the document is broken into segments [[1, 2, 3], [3, 4, 5]]; segmentation stops once
        a window reaches the last sentence. If the document's text is empty, a single segment containing the
        document's title is generated. Otherwise, the document's title is prepended to each segment's text.

        :param documents: A list of Text objects, each of which corresponds to an indexed document.
        :param seg_size: The number of sentences each segment should contain.
        :param stride: The number of sentences to advance for the next segment.
        :return: A SegmentGroup containing all the documents' segments and the end index of each document in
        segmented_docs.
        """
        segmented_docs, doc_end_indexes, end_idx = [], [0], 0
        for document in documents:
            doc = self.nlp(document.text[:self.max_characters])
            # Span.string was removed in spaCy 3; .text works across versions.
            sentences = [sent.text.strip() for sent in doc.sents]
            # If the text is empty (i.e. there are no sentences), the segment_text is solely the title of the document.
            if len(sentences) == 0:
                segment_text = document.title
                segmented_docs.append(
                    Text(segment_text, dict(docid=document.metadata["docid"])))
                end_idx += 1
                doc_end_indexes.append(end_idx)
            else:
                for i in range(0, len(sentences), stride):
                    segment_text = ' '.join(sentences[i:i + seg_size])
                    if document.title:
                        segment_text = document.title + '. ' + segment_text
                    segmented_docs.append(
                        Text(segment_text,
                             dict(docid=document.metadata["docid"])))
                    if i + seg_size >= len(sentences):
                        end_idx += i // stride + 1
                        doc_end_indexes.append(end_idx)
                        break
        return SegmentGroup(segmented_docs, doc_end_indexes)
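A dependency-free sketch of the sliding-window logic above (my own illustration, mirroring the docstring's example):

def windows(items, seg_size, stride):
    """Sliding windows starting at 0, stride, 2 * stride, ..., stopping once
    a window reaches the end of the list."""
    out = []
    for i in range(0, len(items), stride):
        out.append(items[i:i + seg_size])
        if i + seg_size >= len(items):
            break
    return out

assert windows([1, 2, 3, 4, 5], seg_size=3, stride=2) == [[1, 2, 3], [3, 4, 5]]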
Example #5
    def segment(self, documents: List[Text], seg_size: int, stride: int) -> SegmentGroup:
        segmented_doc, doc_end_indexes, end_idx = [], [0], 0

        for document in documents:
            doc = self.nlp(document.text[:self.max_characters])
            sentences = [sent.text.strip() for sent in doc.sents]
            for i in range(0, len(sentences), stride):
                segment_text = ' '.join(sentences[i:i + seg_size])
                segmented_doc.append(Text(segment_text, dict(docid=document.raw["docid"])))
                if i + seg_size >= len(sentences):
                    end_idx += i // stride + 1
                    doc_end_indexes.append(end_idx)
                    break
        return SegmentGroup(segmented_doc, doc_end_indexes)
Example #6
File: bertqe.py  Project: cep-ter/ReQue
 def Bert_Score(self, q, doc_dic_for_bert):
     chunk_scores = {}
     query = Query(q)
     texts = [Text(p[1], {'docid': p[0]}, 0) for p in doc_dic_for_bert]
     reranked = reranker.rerank(query, texts)
     reranked.sort(key=lambda x: x.score, reverse=True)
     for i in range(min(10, len(reranked))):  # top 10, or fewer if available
         chunk_text = reranked[i].text
         word_tokens = word_tokenize(chunk_text)
         filtered_sentence = [w for w in word_tokens if w not in stop_words]
         filtered_sentence = ' '.join(filtered_sentence).translate(
             str.maketrans('', '', string.punctuation))
         chunk_scores[filtered_sentence] = round(reranked[i].score, 3)
         #print(f'{i+1:2} {reranked[i].score:.5f} {reranked[i].text}')
     return chunk_scores
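The snippet references several names defined elsewhere in bertqe.py. A plausible setup (an assumption on my part; the original imports are not shown):

import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pygaggle.rerank.base import Query, Text
from pygaggle.rerank.transformer import MonoBERT

stop_words = set(stopwords.words('english'))
reranker = MonoBERT()  # assumed; any pygaggle reranker exposing rerank() works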
Example #7
    def to_relevance_examples(self, index_path: str) -> List[RelevanceExample]:
        loader = Cord19AbstractLoader(index_path)
        example_map = {}
        for (qid, text,
             rel_cands), cands in tqdm(self.query_document_tuples()):
            if qid not in example_map:
                example_map[qid] = [convert_to_unicode(text), [], [], [], []]
            example_map[qid][1].append(cands[0])
            try:
                passages = [loader.load_document(cand) for cand in cands]
                # Sometimes the abstract is empty.
                example_map[qid][2].append(
                    convert_to_unicode(passages[0].abstract))
                example_map[qid][4].append(
                    convert_to_unicode(passages[0].title))
            except ValueError as e:
                logging.error(e)
                logging.warning(f'Skipping {cands}')
                continue
            example_map[qid][3].append(cands[0] in rel_cands)
        mean_stats = defaultdict(list)

        for ex in self.examples:
            int_rels = np.array(list(map(int, example_map[ex.qid][3])))
            p = int(int_rels.sum())
            mean_stats['Expected P@1 for Random Ordering'].append(
                np.mean(int_rels))
            n = len(ex.candidates) - p
            N = len(ex.candidates)
            if len(ex.candidates) <= 1000:
                mean_stats['Expected R@1000 for Random Ordering'].append(
                    1 if 1 in int_rels else 0)
            numer = np.array(
                [sp.comb(n, i) / (N - i)
                 for i in range(0, n + 1) if i != N]) * p
            if n == N:
                numer = np.append(numer, 0)
            denom = np.array([sp.comb(N, i) for i in range(0, n + 1)])
            rr = 1 / np.arange(1, n + 2)
            rmrr = np.sum(numer * rr / denom)
            mean_stats['Expected MRR for Random Ordering'].append(rmrr)
            rmrr10 = np.sum(numer[:10] * rr[:10] / denom[:10])
            mean_stats['Expected MRR@10 for Random Ordering'].append(rmrr10)
            ex_index = len(ex.candidates)
            for rel_cand in ex.relevant_candidates:
                if rel_cand in ex.candidates:
                    ex_index = min(ex.candidates.index(rel_cand), ex_index)
            mean_stats['Existing MRR'].append(
                1 / (ex_index + 1) if ex_index < len(ex.candidates) else 0)
            mean_stats['Existing MRR@10'].append(
                1 / (ex_index + 1) if ex_index < 10 else 0)
        for k, v in mean_stats.items():
            logging.info(f'{k}: {np.mean(v)}')
        rel = [
            RelevanceExample(
                Query(text=query_text, id=qid),
                [Text(text, dict(docid=docid), title=title)
                 for docid, text, title in zip(cands, cands_text, titles)],
                rel_cands)
            for qid, (query_text, cands, cands_text, rel_cands,
                      titles) in example_map.items()
        ]
        return rel
Example #8
        n_docs += 1
        doc_text = corpus[doc_id]
        doc = nlp(doc_text[:10000])
        sentences = [str(sent).strip() for sent in doc.sents]
        for i in range(0, len(sentences), args.stride):
            segment = ' '.join(sentences[i:i + args.max_length])
            passages.append([doc_id, segment])
            n_segments += 1
            if i + args.max_length >= len(sentences):
                break

    print(f'{query_id}: Reranking...')

    # Reranker using pygaggle
    query = Query(query_text)
    texts = [Text(p[1], {'docid': p[0]}, 0) for p in passages]
    start = time()
    ranked_results = reranker.rerank(query, texts)
    end = time()
    elapsed_time = end - start
    print("Time Elapsed: {:.1f}".format(elapsed_time))

    # Get scores from reranker
    final_t5_scores = {}
    for result in ranked_results:
        if result.metadata["docid"] not in final_t5_scores:
            final_t5_scores[result.metadata["docid"]] = result.score
        else:
            if final_t5_scores[result.metadata["docid"]] < result.score:
                final_t5_scores[result.metadata["docid"]] = result.score
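The loop above keeps the best-scoring segment per document, i.e. max-pooling segment scores (often called 'MaxP' aggregation). A compact equivalent (illustrative):

final_t5_scores = {}
for result in ranked_results:
    docid = result.metadata["docid"]
    # keep the maximum segment score seen so far for this document
    final_t5_scores[docid] = max(final_t5_scores.get(docid, float('-inf')),
                                 result.score)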
Example #9
 def rerank(self, query: str, texts: List[str]) -> List[float]:
     ranked_results = self.ranker.rerank(Query(query),
                                         [Text(t) for t in texts])
     scores = [r.score for r in ranked_results]
     return scores
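A hypothetical call (the receiver name and strings are illustrative, not from the source). pygaggle rerankers assign scores without reordering the texts, so the returned list stays aligned with the input:

scores = searcher.rerank('what causes rain?',
                         ['Rain forms when water vapor condenses ...',
                          'The stock market closed higher today.'])
# scores[i] is the relevance score of texts[i] for the query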
Example #10
def main():
    apb = ArgumentParserBuilder()
    apb.add_opts(
        opt('--task', type=str, default='wikipedia'),
        opt('--method', type=str, required=True, choices=METHOD_CHOICES),
        opt('--retrieval-file',
            type=Path,
            required=True,
            help=
            'JSON file containing top passages selected by the retrieval model'
            ),
        opt('--model-name',
            type=str,
            default='facebook/dpr-reader-single-nq-base',
            help='Pretrained model for reader'),
        opt('--tokenizer-name',
            type=str,
            default='facebook/dpr-reader-single-nq-base',
            help='Pretrained model for tokenizer'),
        opt('--num-spans',
            type=int,
            default=1,
            help='Number of answer spans to return'),
        opt('--max-answer-length',
            type=int,
            default=10,
            help='Maximum length that an answer span can be'),
        opt('--num-spans-per-passage',
            type=int,
            default=10,
            help='Maximum number of answer spans to return per passage'),
        opt('--output-file',
            type=Path,
            default=None,
            help=
            'File to output predictions for each example; if not specified, this output will be discarded'
            ),
        opt('--device',
            type=str,
            default='cuda:0',
            help='Device for model computations'),
        opt('--batch-size',
            type=int,
            default=16,
            help='Batch size for reader inference'),
        opt('--topk-retrieval',
            type=int,
            default=[],
            nargs='+',
            help='Values of k to print the topk accuracy of the retrieval file'
            ),
        opt('--topk-em',
            type=int,
            default=[50],
            nargs='+',
            help='Values of k to print the topk exact match score'),
    )
    args = apb.parser.parse_args()
    options = PassageReadingEvaluationOptions(**vars(args))

    logging.info("Loading the Retrieval File")
    with open(options.retrieval_file) as f:
        data = json.load(f)

    if args.topk_retrieval:
        logging.info("Evaluating Topk Retrieval Accuracies")
        subprocess.call([
            'python', 'tools/scripts/dpr/evaluate_retrieval.py', '--retrieval',
            options.retrieval_file, '--topk', *map(str, args.topk_retrieval)
        ])

    logging.info("Loading Reader Model and Tokenizer")
    construct_map = dict(dpr=construct_dpr)
    reader = construct_map[options.method](options)

    evaluator = ReaderEvaluator(reader)

    max_topk_passages = max(options.topk_em)
    examples = []
    for _, item in data.items():
        examples.append(
            RetrievalExample(
                query=Query(text=item["question"]),
                texts=[
                    Text(text=context["text"].split('\n', 1)[1].replace('""', '"'),
                         title=context["text"].split('\n', 1)[0].replace('"', ''))
                    for context in item["contexts"]
                ][:max_topk_passages],
                ground_truth_answers=item["answers"],
            ))
    dpr_predictions = [] if args.output_file is not None else None

    ems = evaluator.evaluate(examples, options.topk_em, dpr_predictions)

    logging.info('Reader completed')

    for k in options.topk_em:
        em = np.mean(np.array(ems[k])) * 100.
        logging.info(f'Top{k}\tExact Match Accuracy: {em}')

    if args.output_file is not None:
        with open(args.output_file, 'w', encoding='utf-8', newline='\n') as f:
            json.dump(dpr_predictions, f, indent=4)
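The parsing loop above implies a retrieval file shaped roughly like this (inferred from the code, not a documented schema); each context's "text" holds the passage title on its first line and the passage body after it:

example_retrieval_file = {
    "0": {
        "question": "who founded rome",
        "answers": ["Romulus"],
        "contexts": [
            {"text": '"Founding of Rome"\nAccording to legend, ...'}
        ],
    }
}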