def to_relevance_examples(self,
                          index_path: str,
                          is_duo: bool = False) -> List[RelevanceExample]:
    loader = MsMarcoPassageLoader(index_path)
    example_map = {}
    for (qid, text, rel_cands), cands in self.query_passage_tuples():
        if qid not in example_map:
            example_map[qid] = [convert_to_unicode(text), [], [], []]
        example_map[qid][1].append(cands[0])
        try:
            passages = [loader.load_passage(cand) for cand in cands]
            example_map[qid][2].append(
                convert_to_unicode(passages[0].all_text))
        except ValueError:
            logging.warning(f'Skipping {cands}')
            continue
        example_map[qid][3].append(cands[0] in rel_cands)
    # Log expected metrics for a random ordering of the candidates, plus the
    # MRR of the candidate list in its existing (retrieval) order.
    mean_stats = defaultdict(list)
    for ex in self.examples:
        int_rels = np.array(list(map(int, example_map[ex.qid][3])))
        p = int_rels.sum() / (len(ex.candidates) - 1) if is_duo else int_rels.sum()
        mean_stats['Random P@1'].append(np.mean(int_rels))
        n = len(ex.candidates) - p
        N = len(ex.candidates)
        if len(ex.candidates) <= 1000:
            mean_stats['Random R@1000'].append(1 if 1 in int_rels else 0)
        numer = np.array(
            [sp.comb(n, i) / (N - i) for i in range(0, n + 1) if i != N]) * p
        if n == N:
            numer = np.append(numer, 0)
        denom = np.array([sp.comb(N, i) for i in range(0, n + 1)])
        rr = 1 / np.arange(1, n + 2)
        rmrr = np.sum(numer * rr / denom)
        mean_stats['Random MRR'].append(rmrr)
        rmrr10 = np.sum(numer[:10] * rr[:10] / denom[:10])
        mean_stats['Random MRR@10'].append(rmrr10)
        ex_index = len(ex.candidates)
        for rel_cand in ex.relevant_candidates:
            if rel_cand in ex.candidates:
                ex_index = min(ex.candidates.index(rel_cand), ex_index)
        mean_stats['Existing MRR'].append(
            1 / (ex_index + 1) if ex_index < len(ex.candidates) else 0)
        mean_stats['Existing MRR@10'].append(
            1 / (ex_index + 1) if ex_index < 10 else 0)
    for k, v in mean_stats.items():
        logging.info(f'{k}: {np.mean(v)}')
    return [
        RelevanceExample(
            Query(text=query_text, id=qid),
            list(
                map(lambda s: Text(s[1], dict(docid=s[0])),
                    zip(cands, cands_text))), rel_cands)
        for qid, (query_text, cands, cands_text,
                  rel_cands) in example_map.items()
    ]
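# Illustrative sanity check (not part of the loader above) for the closed-form
# 'Random MRR' computed in to_relevance_examples: with p relevant candidates
# out of N, the probability that the first relevant item appears at rank i + 1
# of a uniformly random ordering is p * C(n, i) / (C(N, i) * (N - i)), where
# n = N - p. The helper names below are illustrative only.
import numpy as np
import scipy.special as sp


def closed_form_random_mrr(N: int, p: int) -> float:
    n = N - p
    numer = np.array([sp.comb(n, i) / (N - i)
                      for i in range(n + 1) if i != N]) * p
    if n == N:
        numer = np.append(numer, 0)
    denom = np.array([sp.comb(N, i) for i in range(n + 1)])
    rr = 1 / np.arange(1, n + 2)
    return float(np.sum(numer * rr / denom))


def simulated_random_mrr(N: int, p: int, trials: int = 100_000) -> float:
    rng = np.random.default_rng(0)
    labels = np.array([1] * p + [0] * (N - p))
    total = 0.0
    for _ in range(trials):
        ranks = np.flatnonzero(rng.permutation(labels))
        total += 1 / (ranks[0] + 1) if ranks.size else 0.0
    return total / trials


# Both values should agree closely, e.g. roughly 0.43 for N=10, p=2.
print(closed_form_random_mrr(10, 2), simulated_random_mrr(10, 2))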
def to_senticized_dataset(self,
                          index_path: str,
                          split: str = 'nq') -> List[RelevanceExample]:
    loader = Cord19DocumentLoader(index_path)
    tokenizer = SpacySenticizer()
    example_map = OrderedDict()
    rel_map = OrderedDict()
    for query, document in self.query_answer_pairs(split=split):
        if document.id == MISSING_ID:
            logging.warning(f'Skipping {document.title} (missing ID)')
            continue
        key = (query, document.id)
        try:
            doc = loader.load_document(document.id)
            example_map.setdefault(key, tokenizer(doc.all_text))
        except ValueError as e:
            logging.warning(f'Skipping {document.id} ({e})')
            continue
        sents = example_map[key]
        rel_map.setdefault(key, [False] * len(sents))
        for idx, s in enumerate(sents):
            if document.exact_answer in s:
                rel_map[key][idx] = True
    mean_stats = defaultdict(list)
    for (_, doc_id), rels in rel_map.items():
        int_rels = np.array(list(map(int, rels)))
        p = int_rels.sum()
        mean_stats['Average spans'].append(p)
        mean_stats['Expected P@1 for Random Ordering'].append(
            np.mean(int_rels))
        n = len(int_rels) - p
        N = len(int_rels)
        mean_stats['Expected R@3 for Random Ordering'].append(
            1 - (n * (n - 1) * (n - 2)) / (N * (N - 1) * (N - 2)))
        numer = np.array(
            [sp.comb(n, i) / (N - i) for i in range(0, n + 1)]) * p
        denom = np.array([sp.comb(N, i) for i in range(0, n + 1)])
        rr = 1 / np.arange(1, n + 2)
        rmrr = np.sum(numer * rr / denom)
        mean_stats['Expected MRR for Random Ordering'].append(rmrr)
        if not any(rels):
            logging.warning(f'{doc_id} has no relevant answers')
    for k, v in mean_stats.items():
        logging.info(f'{k}: {np.mean(v)}')
    return [
        RelevanceExample(
            Query(query),
            list(map(lambda s: Text(s, dict(docid=docid)), sents)), rels)
        for ((query, docid), sents), (_, rels) in zip(example_map.items(),
                                                      rel_map.items())
    ]
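# Small numeric illustration (standalone helper, not from the code above) of
# the 'Expected R@3 for Random Ordering' term: with n non-relevant sentences
# out of N, 1 - n(n-1)(n-2) / (N(N-1)(N-2)) is the probability that at least
# one relevant sentence appears in the top 3 of a uniformly random ordering.
from itertools import permutations


def recall_at_3_exhaustive(N: int, p: int) -> float:
    labels = [1] * p + [0] * (N - p)
    perms = list(permutations(labels))
    return sum(1 in perm[:3] for perm in perms) / len(perms)


N, p = 6, 2
n = N - p
print(recall_at_3_exhaustive(N, p))                          # 0.8
print(1 - n * (n - 1) * (n - 2) / (N * (N - 1) * (N - 2)))   # 0.8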
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name_or_path",
                        default='unicamp-dl/mt5-base-multi-msmarco',
                        type=str,
                        required=False,
                        help="Reranker model.")
    parser.add_argument("--initial_run",
                        default=None,
                        type=str,
                        required=True,
                        help="Initial run to be reranked.")
    parser.add_argument("--corpus",
                        default=None,
                        type=str,
                        required=True,
                        help="Document collection.")
    parser.add_argument("--output_run",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to save the reranked run.")
    parser.add_argument("--queries",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the queries file.")
    args = parser.parse_args()

    model = MonoT5(args.model_name_or_path)
    run = load_run(args.initial_run)
    corpus = load_corpus(args.corpus)
    queries = load_queries(args.queries)

    # Run the reranker and write the results in both TREC format
    # (qid Q0 docid rank score tag) and MS MARCO format (qid docid rank).
    trec = open(args.output_run + '-trec.txt', 'w')
    marco = open(args.output_run + '-marco.txt', 'w')
    for idx, query_id in enumerate(tqdm(run.keys())):
        query = Query(queries[query_id])
        texts = [
            Text(corpus[doc_id], {'docid': doc_id}, 0)
            for doc_id in run[query_id]
        ]
        reranked = model.rerank(query, texts)
        for rank, document in enumerate(reranked):
            trec.write(
                f'{query_id}\tQ0\t{document.metadata["docid"]}\t{rank+1}\t{document.score}\t{args.model_name_or_path}\n'
            )
            marco.write(
                f'{query_id}\t{document.metadata["docid"]}\t{rank+1}\n')
    trec.close()
    marco.close()
    print("Done!")
def segment(self, documents: List[Text], seg_size: int,
            stride: int) -> SegmentGroup:
    """
    Breaks each document into segments. For example, given a document with
    sentences [1, 2, 3, 4, 5], a seg_size of 3, and a stride of 2, the
    document is broken into the segments [1, 2, 3] and [3, 4, 5]; the loop
    stops once a segment reaches the end of the document. If the document's
    text is empty, a single segment containing the document's title is
    generated. Otherwise, the document's title is prepended to each segment.

    :param documents: A list of Text objects, each of which corresponds to an
        indexed document.
    :param seg_size: The number of sentences each segment should contain.
    :param stride: The number of sentences to advance for the next segment.
    :return: A SegmentGroup containing all the documents' segments and the end
        index of each document in segmented_docs.
    """
    segmented_docs, doc_end_indexes, end_idx = [], [0], 0
    for document in documents:
        doc = self.nlp(document.text[:self.max_characters])
        sentences = [sent.string.strip() for sent in doc.sents]
        if len(sentences) == 0:
            # The text is empty (no sentences), so the segment_text is solely
            # the title of the document.
            segment_text = document.title
            segmented_docs.append(
                Text(segment_text, dict(docid=document.metadata["docid"])))
            end_idx += 1
            doc_end_indexes.append(int(end_idx))
        else:
            for i in range(0, len(sentences), stride):
                segment_text = ' '.join(sentences[i:i + seg_size])
                if document.title:
                    segment_text = document.title + '. ' + segment_text
                segmented_docs.append(
                    Text(segment_text,
                         dict(docid=document.metadata["docid"])))
                if i + seg_size >= len(sentences):
                    end_idx += i / stride + 1
                    doc_end_indexes.append(int(end_idx))
                    break
    return SegmentGroup(segmented_docs, doc_end_indexes)
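# Minimal sketch (separate from the class above) of the sliding-window logic
# described in the docstring, with plain values standing in for spaCy
# sentences: seg_size=3 and stride=2 over five sentences yields two windows.
def sliding_windows(sentences, seg_size, stride):
    windows = []
    for i in range(0, len(sentences), stride):
        windows.append(sentences[i:i + seg_size])
        if i + seg_size >= len(sentences):
            break
    return windows


print(sliding_windows([1, 2, 3, 4, 5], seg_size=3, stride=2))
# [[1, 2, 3], [3, 4, 5]]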
def segment(self, documents: List[Text], seg_size: int,
            stride: int) -> SegmentGroup:
    segmented_doc, doc_end_indexes, end_idx = [], [0], 0
    for document in documents:
        doc = self.nlp(document.text[:self.max_characters])
        sentences = [sent.string.strip() for sent in doc.sents]
        for i in range(0, len(sentences), stride):
            segment_text = ' '.join(sentences[i:i + seg_size])
            segmented_doc.append(
                Text(segment_text, dict(docid=document.raw["docid"])))
            if i + seg_size >= len(sentences):
                end_idx += i / stride + 1
                doc_end_indexes.append(int(end_idx))
                break
    return SegmentGroup(segmented_doc, doc_end_indexes)
def Bert_Score(self, q, doc_dic_for_bert):
    chunk_scores = {}
    query = Query(q)
    texts = [Text(p[1], {'docid': p[0]}, 0) for p in doc_dic_for_bert]
    reranked = reranker.rerank(query, texts)
    reranked.sort(key=lambda x: x.score, reverse=True)
    # Keep the top 10 chunks (or all of them, if fewer than 10 were reranked).
    for i in range(min(10, len(reranked))):
        chunk_text = reranked[i].text
        word_tokens = word_tokenize(chunk_text)
        filtered_sentence = [w for w in word_tokens if w not in stop_words]
        filtered_sentence = " ".join(filtered_sentence).translate(
            str.maketrans('', '', string.punctuation))
        chunk_scores[filtered_sentence] = round(reranked[i].score, 3)
        # print(f'{i+1:2} {reranked[i].score:.5f} {reranked[i].text}')
    return chunk_scores
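# Small illustration (sample sentence made up; assumes the same NLTK stop word
# list and tokenizer used above) of how each reranked chunk is normalized:
# tokenize, drop stop words, then strip punctuation.
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
tokens = word_tokenize("coronavirus vaccines were effective in the trial.")
filtered = " ".join(w for w in tokens if w not in stop_words)
print(filtered.translate(str.maketrans('', '', string.punctuation)))
# prints something like: coronavirus vaccines effective trial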
def to_relevance_examples(self, index_path: str) -> List[RelevanceExample]:
    loader = Cord19AbstractLoader(index_path)
    example_map = {}
    for (qid, text, rel_cands), cands in tqdm(self.query_document_tuples()):
        if qid not in example_map:
            example_map[qid] = [convert_to_unicode(text), [], [], [], []]
        example_map[qid][1].append(cands[0])
        try:
            passages = [loader.load_document(cand) for cand in cands]
            # Sometimes this abstract is empty.
            example_map[qid][2].append(
                convert_to_unicode(passages[0].abstract))
            example_map[qid][4].append(
                convert_to_unicode(passages[0].title))
        except ValueError as e:
            logging.error(e)
            logging.warning('Skipping passages')
            continue
        example_map[qid][3].append(cands[0] in rel_cands)
    mean_stats = defaultdict(list)
    for ex in self.examples:
        int_rels = np.array(list(map(int, example_map[ex.qid][3])))
        p = int(int_rels.sum())
        mean_stats['Expected P@1 for Random Ordering'].append(
            np.mean(int_rels))
        n = len(ex.candidates) - p
        N = len(ex.candidates)
        if len(ex.candidates) <= 1000:
            mean_stats['Expected R@1000 for Random Ordering'].append(
                1 if 1 in int_rels else 0)
        numer = np.array(
            [sp.comb(n, i) / (N - i) for i in range(0, n + 1) if i != N]) * p
        if n == N:
            numer = np.append(numer, 0)
        denom = np.array([sp.comb(N, i) for i in range(0, n + 1)])
        rr = 1 / np.arange(1, n + 2)
        rmrr = np.sum(numer * rr / denom)
        mean_stats['Expected MRR for Random Ordering'].append(rmrr)
        rmrr10 = np.sum(numer[:10] * rr[:10] / denom[:10])
        mean_stats['Expected MRR@10 for Random Ordering'].append(rmrr10)
        ex_index = len(ex.candidates)
        for rel_cand in ex.relevant_candidates:
            if rel_cand in ex.candidates:
                ex_index = min(ex.candidates.index(rel_cand), ex_index)
        mean_stats['Existing MRR'].append(
            1 / (ex_index + 1) if ex_index < len(ex.candidates) else 0)
        mean_stats['Existing MRR@10'].append(
            1 / (ex_index + 1) if ex_index < 10 else 0)
    for k, v in mean_stats.items():
        logging.info(f'{k}: {np.mean(v)}')
    rel = [
        RelevanceExample(
            Query(text=query_text, id=qid),
            list(
                map(lambda s: Text(s[1], dict(docid=s[0]), title=s[2]),
                    zip(cands, cands_text, title))), rel_cands)
        for qid, (query_text, cands, cands_text, rel_cands,
                  title) in example_map.items()
    ]
    return rel
    # Runs once per candidate document of the current query: split the
    # document into overlapping windows of args.max_length sentences,
    # advanced by args.stride sentences.
    n_docs += 1
    doc_text = corpus[doc_id]
    doc = nlp(doc_text[:10000])
    sentences = [str(sent).strip() for sent in doc.sents]
    for i in range(0, len(sentences), args.stride):
        segment = ' '.join(sentences[i:i + args.max_length])
        passages.append([doc_id, segment])
        n_segments += 1
        if i + args.max_length >= len(sentences):
            break

print(f'{query_id}: Reranking...')

# Rerank the segments using pygaggle
query = Query(query_text)
texts = [Text(p[1], {'docid': p[0]}, 0) for p in passages]
start = time()
ranked_results = reranker.rerank(query, texts)
end = time()
elapsed_time = end - start
print("Time Elapsed: {:.1f}".format(elapsed_time))

# Get scores from the reranker, keeping the best segment score per document
final_t5_scores = {}
for result in ranked_results:
    docid = result.metadata["docid"]
    if docid not in final_t5_scores or final_t5_scores[docid] < result.score:
        final_t5_scores[docid] = result.score
def rerank(self, query: str, texts: List[str]) -> List[float]:
    ranked_results = self.ranker.rerank(Query(query),
                                        [Text(t) for t in texts])
    scores = [r.score for r in ranked_results]
    return scores
def main():
    apb = ArgumentParserBuilder()
    apb.add_opts(
        opt('--task', type=str, default='wikipedia'),
        opt('--method', type=str, required=True, choices=METHOD_CHOICES),
        opt('--retrieval-file',
            type=Path,
            required=True,
            help='JSON file containing top passages selected by the retrieval model'),
        opt('--model-name',
            type=str,
            default='facebook/dpr-reader-single-nq-base',
            help='Pretrained model for reader'),
        opt('--tokenizer-name',
            type=str,
            default='facebook/dpr-reader-single-nq-base',
            help='Pretrained model for tokenizer'),
        opt('--num-spans',
            type=int,
            default=1,
            help='Number of answer spans to return'),
        opt('--max-answer-length',
            type=int,
            default=10,
            help='Maximum length that an answer span can be'),
        opt('--num-spans-per-passage',
            type=int,
            default=10,
            help='Maximum number of answer spans to return per passage'),
        opt('--output-file',
            type=Path,
            default=None,
            help='File to output predictions for each example; if not specified, this output will be discarded'),
        opt('--device',
            type=str,
            default='cuda:0',
            help='Device for model computations'),
        opt('--batch-size',
            type=int,
            default=16,
            help='batch size of reader inference'),
        opt('--topk-retrieval',
            type=int,
            default=[],
            nargs='+',
            help='Values of k to print the topk accuracy of the retrieval file'),
        opt('--topk-em',
            type=int,
            default=[50],
            nargs='+',
            help='Values of k to print the topk exact match score'),
    )
    args = apb.parser.parse_args()
    options = PassageReadingEvaluationOptions(**vars(args))

    logging.info("Loading the Retrieval File")
    with open(options.retrieval_file) as f:
        data = json.load(f)

    if args.topk_retrieval:
        logging.info("Evaluating Topk Retrieval Accuracies")
        subprocess.call([
            'python', 'tools/scripts/dpr/evaluate_retrieval.py',
            '--retrieval', options.retrieval_file, '--topk',
            *map(str, args.topk_retrieval)
        ])

    logging.info("Loading Reader Model and Tokenizer")
    construct_map = dict(dpr=construct_dpr)
    reader = construct_map[options.method](options)
    evaluator = ReaderEvaluator(reader)

    max_topk_passages = max(options.topk_em)
    examples = []
    for _, item in data.items():
        examples.append(
            RetrievalExample(
                query=Query(text=item["question"]),
                texts=list(
                    map(
                        lambda context: Text(
                            text=context["text"].split('\n', 1)[1].replace('""', '"'),
                            title=context["text"].split('\n', 1)[0].replace('"', '')),
                        item["contexts"]))[:max_topk_passages],
                ground_truth_answers=item["answers"],
            ))

    dpr_predictions = [] if args.output_file is not None else None
    ems = evaluator.evaluate(examples, options.topk_em, dpr_predictions)

    logging.info('Reader completed')
    for k in options.topk_em:
        em = np.mean(np.array(ems[k])) * 100.
        logging.info(f'Top{k}\tExact Match Accuracy: {em}')

    if args.output_file is not None:
        with open(args.output_file, 'w', encoding='utf-8', newline='\n') as f:
            json.dump(dpr_predictions, f, indent=4)
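# Minimal illustration (made-up sample string) of how each retrieved context is
# unpacked above: the retrieval file stores the passage as '"title"\nbody', so
# the first line becomes the Text title and the remainder the passage body.
raw = '"Example Title"\nBody of the passage with an ""escaped"" quote.'
title, body = raw.split('\n', 1)
print(title.replace('"', ''))   # Example Title
print(body.replace('""', '"'))  # Body of the passage with an "escaped" quote.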