def to_relevance_examples(self, index_path: str, is_duo: bool = False) -> List[RelevanceExample]:
    loader = MsMarcoPassageLoader(index_path)
    example_map = {}
    for (qid, text, rel_cands), cands in self.query_passage_tuples():
        if qid not in example_map:
            example_map[qid] = [convert_to_unicode(text), [], [], []]
        example_map[qid][1].append(cands[0])
        try:
            passages = [loader.load_passage(cand) for cand in cands]
            example_map[qid][2].append(
                [convert_to_unicode(passage.all_text) for passage in passages][0])
        except ValueError:
            # Log the candidate ids: `passages` may be unbound if load_passage failed.
            logging.warning(f'Skipping {cands}')
            continue
        example_map[qid][3].append(cands[0] in rel_cands)
    mean_stats = defaultdict(list)
    for ex in self.examples:
        int_rels = np.array(list(map(int, example_map[ex.qid][3])))
        # Duo candidates enumerate ordered pairs, so each relevant passage is counted
        # (len(candidates) - 1) times; integer-divide to recover the relevant count.
        p = int(int_rels.sum()) // (len(ex.candidates) - 1) if is_duo else int(int_rels.sum())
        mean_stats['Random P@1'].append(np.mean(int_rels))
        n = len(ex.candidates) - p
        N = len(ex.candidates)
        if len(ex.candidates) <= 1000:
            mean_stats['Random R@1000'].append(1 if 1 in int_rels else 0)
        # Expected MRR of a uniformly random ordering (see the note below this function).
        numer = np.array([sp.comb(n, i) / (N - i) for i in range(0, n + 1) if i != N]) * p
        if n == N:
            numer = np.append(numer, 0)
        denom = np.array([sp.comb(N, i) for i in range(0, n + 1)])
        rr = 1 / np.arange(1, n + 2)
        rmrr = np.sum(numer * rr / denom)
        mean_stats['Random MRR'].append(rmrr)
        rmrr10 = np.sum(numer[:10] * rr[:10] / denom[:10])
        mean_stats['Random MRR@10'].append(rmrr10)
        # MRR of the existing candidate ordering.
        ex_index = len(ex.candidates)
        for rel_cand in ex.relevant_candidates:
            if rel_cand in ex.candidates:
                ex_index = min(ex.candidates.index(rel_cand), ex_index)
        mean_stats['Existing MRR'].append(
            1 / (ex_index + 1) if ex_index < len(ex.candidates) else 0)
        mean_stats['Existing MRR@10'].append(1 / (ex_index + 1) if ex_index < 10 else 0)
    for k, v in mean_stats.items():
        logging.info(f'{k}: {np.mean(v)}')
    return [
        RelevanceExample(
            Query(text=query_text, id=qid),
            list(map(lambda s: Text(s[1], dict(docid=s[0])), zip(cands, cands_text))),
            rel_cands)
        for qid, (query_text, cands, cands_text, rel_cands) in example_map.items()
    ]
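A note on the "Random MRR" statistics computed above (this reading is inferred from the code, not stated elsewhere in the source): with $p$ relevant and $n = N - p$ non-relevant candidates, a uniformly random ordering places its first relevant candidate at rank $i + 1$ with probability $\frac{\binom{n}{i}}{\binom{N}{i}} \cdot \frac{p}{N - i}$, so the expected reciprocal rank is

$$\mathbb{E}[\mathrm{RR}] = \sum_{i=0}^{n} \frac{\binom{n}{i}}{\binom{N}{i}} \cdot \frac{p}{N - i} \cdot \frac{1}{i + 1},$$

which is exactly what `numer`, `denom`, and `rr` assemble term by term; truncating to the first ten terms gives the `Random MRR@10` analogue. The same construction reappears in the CORD-19 variants further below.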
def to_senticized_dataset(self, index_path: str, split: str = 'nq') -> List[RelevanceExample]:
    loader = Cord19DocumentLoader(index_path)
    tokenizer = SpacySenticizer()
    example_map = OrderedDict()
    rel_map = OrderedDict()
    for query, document in self.query_answer_pairs(split=split):
        if document.id == MISSING_ID:
            logging.warning(f'Skipping {document.title} (missing ID)')
            continue
        key = (query, document.id)
        try:
            doc = loader.load_document(document.id)
            example_map.setdefault(key, tokenizer(doc.all_text))
        except ValueError as e:
            logging.warning(f'Skipping {document.id} ({e})')
            continue
        # Mark every sentence that contains the exact answer as relevant.
        sents = example_map[key]
        rel_map.setdefault(key, [False] * len(sents))
        for idx, s in enumerate(sents):
            if document.exact_answer in s:
                rel_map[key][idx] = True
    mean_stats = defaultdict(list)
    for (_, doc_id), rels in rel_map.items():
        int_rels = np.array(list(map(int, rels)))
        p = int_rels.sum()
        mean_stats['Average spans'].append(p)
        mean_stats['Expected P@1 for Random Ordering'].append(np.mean(int_rels))
        n = len(int_rels) - p
        N = len(int_rels)
        # P(at least one relevant sentence in the top 3) under a random ordering.
        mean_stats['Expected R@3 for Random Ordering'].append(
            1 - (n * (n - 1) * (n - 2)) / (N * (N - 1) * (N - 2)))
        # Expected MRR under a random ordering; the i == N term assumes p >= 1,
        # otherwise N - i is zero for the final term.
        numer = np.array([sp.comb(n, i) / (N - i) for i in range(0, n + 1)]) * p
        denom = np.array([sp.comb(N, i) for i in range(0, n + 1)])
        rr = 1 / np.arange(1, n + 2)
        rmrr = np.sum(numer * rr / denom)
        mean_stats['Expected MRR for Random Ordering'].append(rmrr)
        if not any(rels):
            logging.warning(f'{doc_id} has no relevant answers')
    for k, v in mean_stats.items():
        logging.info(f'{k}: {np.mean(v)}')
    return [
        RelevanceExample(
            Query(query),
            list(map(lambda s: Text(s, dict(docid=docid)), sents)),
            rels)
        for ((query, docid), sents), (_, rels) in zip(example_map.items(), rel_map.items())
    ]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name_or_path",
                        default='unicamp-dl/mt5-base-multi-msmarco',
                        type=str,
                        required=False,
                        help="Reranker model.")
    parser.add_argument("--initial_run",
                        default=None,
                        type=str,
                        required=True,
                        help="Initial run to be reranked.")
    parser.add_argument("--corpus",
                        default=None,
                        type=str,
                        required=True,
                        help="Document collection.")
    parser.add_argument("--output_run",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to save the reranked run.")
    parser.add_argument("--queries",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the queries file.")
    args = parser.parse_args()

    model = MonoT5(args.model_name_or_path)
    run = load_run(args.initial_run)
    corpus = load_corpus(args.corpus)
    queries = load_queries(args.queries)

    # Run reranker
    trec = open(args.output_run + '-trec.txt', 'w')
    marco = open(args.output_run + '-marco.txt', 'w')
    for idx, query_id in enumerate(tqdm(run.keys())):
        query = Query(queries[query_id])
        texts = [Text(corpus[doc_id], {'docid': doc_id}, 0) for doc_id in run[query_id]]
        reranked = model.rerank(query, texts)
        for rank, document in enumerate(reranked):
            trec.write(
                f'{query_id}\tQ0\t{document.metadata["docid"]}\t{rank+1}\t{document.score}\t{args.model_name_or_path}\n')
            marco.write(f'{query_id}\t{document.metadata["docid"]}\t{rank+1}\n')
    trec.close()
    marco.close()
    print("Done!")
def rerank(self, query, hits):
    if self.reranker is None:
        logging.info("Reranker not available, skipping reranking")
        return hits
    reranked = self.reranker.rerank(Query(query), hits_to_texts(hits))
    reranked_scores = [r.score for r in reranked]

    # Reorder hits with reranker scores
    reranked = list(zip(hits, reranked_scores))
    reranked.sort(key=lambda x: x[1], reverse=True)
    reranked_hits = [r[0] for r in reranked]
    return reranked_hits
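For reference, a minimal sketch of how a helper like the one above is typically driven end to end (the index path and query string are placeholders, the default MonoT5 checkpoint is assumed, and the import paths follow pygaggle's and an older pyserini's usual layout; this is not taken from the source):

from pyserini.search import SimpleSearcher
from pygaggle.rerank.base import Query, hits_to_texts
from pygaggle.rerank.transformer import MonoT5

searcher = SimpleSearcher('indexes/msmarco-passage')  # placeholder index path
reranker = MonoT5()  # assumed default MonoT5 checkpoint

query_text = 'what is information retrieval'
hits = searcher.search(query_text, k=50)

# Same pattern as rerank() above: score the pyserini hits with the reranker,
# then reorder the original hit objects by the new scores.
scores = [t.score for t in reranker.rerank(Query(query_text), hits_to_texts(hits))]
reranked_hits = [h for h, _ in sorted(zip(hits, scores), key=lambda x: x[1], reverse=True)]
for hit in reranked_hits[:3]:
    print(hit.docid)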
def main(output_path=OUTPUT_PATH, index_path=INDEX_PATH, queries_path=QUERIES_PATH, run=RUN, k=K):
    print('################################################')
    print("##### Performing Passage Ranking using L2R #####")
    print('################################################')
    print("Output will be placed in:", output_path, "; the format used will be TREC")

    print('Loading pre-trained model MonoT5...')
    from pygaggle.rerank.transformer import MonoT5
    reranker = MonoT5()

    print('Fetching anserini-like indices from:', index_path)
    # Fetch some passages to rerank from MS MARCO with Pyserini (BM25)
    searcher = SimpleSearcher(index_path)

    print('Loading queries from:', queries_path)
    with open(queries_path, 'r') as f:
        content = f.readlines()
    content = [x.strip().split('\t') for x in content]
    queries = [Query(x[1], x[0]) for x in content]

    print(f'Ranking queries using BM25 (k={k})')
    queries_text = []
    for query in tqdm(queries):
        hits = searcher.search(query.text, k=k)  # use the k argument, not the module-level K
        texts = hits_to_texts(hits)
        queries_text.append(texts)

    print('Reranking all queries using MonoT5!')
    rankings = []
    for i, query in enumerate(tqdm(queries)):
        reranked = reranker.rerank(query, queries_text[i])
        reranked.sort(key=lambda x: x.score, reverse=True)
        rankings.append(reranked)

    print('Outputting to file...')
    if '.tsv' in output_path:
        output_to_tsv(queries, rankings, run, output_path)
    elif '.csv' in output_path:
        output_to_csv(queries, rankings, run, output_path)
    else:
        print('ERROR: invalid output file format provided, please use either .csv or .tsv. Exiting')
        sys.exit(1)

    print('SUCCESS: completed reranking, you may check the output at:', output_path)
    sys.exit(0)
def Bert_Score(self, q, doc_dic_for_bert):
    chunk_scores = {}
    query = Query(q)
    # doc_dic_for_bert holds (docid, chunk_text) pairs.
    texts = [Text(p[1], {'docid': p[0]}, 0) for p in doc_dic_for_bert]
    reranked = reranker.rerank(query, texts)
    reranked.sort(key=lambda x: x.score, reverse=True)
    # Keep the top 10 chunks (or fewer, if fewer than 10 were scored).
    for i in range(min(10, len(reranked))):
        chunk_text = reranked[i].text
        word_tokens = word_tokenize(chunk_text)
        filtered_sentence = [w for w in word_tokens if w not in stop_words]
        filtered_sentence = " ".join(filtered_sentence).translate(
            str.maketrans('', '', string.punctuation))
        chunk_scores[filtered_sentence] = round(reranked[i].score, 3)
        # print(f'{i+1:2} {reranked[i].score:.5f} {reranked[i].text}')
    return chunk_scores
def test_basic(self):
    hits = self.searcher.search('information retrieval')

    self.assertTrue(isinstance(hits, List))
    self.assertTrue(isinstance(hits[0], JSimpleSearcherResult))
    self.assertEqual('CACM-3134', hits[0].docid)
    self.assertEqual(3133, hits[0].lucene_docid)
    self.assertEqual(1500, len(hits[0].contents))
    self.assertEqual(1532, len(hits[0].raw))
    self.assertAlmostEqual(4.76550, hits[0].score, places=5)

    texts = hits_to_texts(hits)
    self.assertEqual(len(hits), len(texts))
    self.assertTrue(isinstance(texts, List))
    self.assertTrue(isinstance(texts[0], Text))
    for i in range(0, len(hits)):
        self.assertEqual(hits[i].raw, texts[i].text)
        self.assertAlmostEqual(hits[i].score, texts[i].score, places=5)

    query = Query('dummy query')
    identity_reranker = IdentityReranker()
    self.assertTrue(isinstance(identity_reranker, Reranker))
    output = identity_reranker.rerank(query, texts)

    # Check that reranked output is indeed the same as the input
    for i in range(0, len(hits)):
        self.assertEqual(texts[i].text, output[i].text)
        self.assertEqual(texts[i].metadata, output[i].metadata)
        self.assertAlmostEqual(texts[i].score, output[i].score, places=5)

    # Check that the identity rerank was not destructive
    texts = []
    for i in range(0, len(hits)):
        self.assertEqual(hits[i].raw, output[i].text)
        self.assertAlmostEqual(hits[i].score, output[i].score, places=5)
def to_relevance_examples(self, index_path: str) -> List[RelevanceExample]:
    loader = Cord19AbstractLoader(index_path)
    example_map = {}
    for (qid, text, rel_cands), cands in tqdm(self.query_document_tuples()):
        if qid not in example_map:
            example_map[qid] = [convert_to_unicode(text), [], [], [], []]
        example_map[qid][1].append(cands[0])
        try:
            passages = [loader.load_document(cand) for cand in cands]
            # Sometimes this abstract is empty.
            example_map[qid][2].append(
                [convert_to_unicode(passage.abstract) for passage in passages][0])
            example_map[qid][4].append(
                [convert_to_unicode(passage.title) for passage in passages][0])
        except ValueError as e:
            logging.error(e)
            logging.warning('Skipping passages')
            continue
        example_map[qid][3].append(cands[0] in rel_cands)
    mean_stats = defaultdict(list)
    for ex in self.examples:
        int_rels = np.array(list(map(int, example_map[ex.qid][3])))
        p = int(int_rels.sum())
        mean_stats['Expected P@1 for Random Ordering'].append(np.mean(int_rels))
        n = len(ex.candidates) - p
        N = len(ex.candidates)
        if len(ex.candidates) <= 1000:
            mean_stats['Expected R@1000 for Random Ordering'].append(1 if 1 in int_rels else 0)
        # Expected MRR of a uniformly random ordering (same derivation as the MS MARCO variant above).
        numer = np.array([sp.comb(n, i) / (N - i) for i in range(0, n + 1) if i != N]) * p
        if n == N:
            numer = np.append(numer, 0)
        denom = np.array([sp.comb(N, i) for i in range(0, n + 1)])
        rr = 1 / np.arange(1, n + 2)
        rmrr = np.sum(numer * rr / denom)
        mean_stats['Expected MRR for Random Ordering'].append(rmrr)
        rmrr10 = np.sum(numer[:10] * rr[:10] / denom[:10])
        mean_stats['Expected MRR@10 for Random Ordering'].append(rmrr10)
        ex_index = len(ex.candidates)
        for rel_cand in ex.relevant_candidates:
            if rel_cand in ex.candidates:
                ex_index = min(ex.candidates.index(rel_cand), ex_index)
        mean_stats['Existing MRR'].append(
            1 / (ex_index + 1) if ex_index < len(ex.candidates) else 0)
        mean_stats['Existing MRR@10'].append(1 / (ex_index + 1) if ex_index < 10 else 0)
    for k, v in mean_stats.items():
        logging.info(f'{k}: {np.mean(v)}')
    rel = [
        RelevanceExample(
            Query(text=query_text, id=qid),
            list(map(lambda s: Text(s[1], dict(docid=s[0]), title=s[2]),
                     zip(cands, cands_text, title))),
            rel_cands)
        for qid, (query_text, cands, cands_text, rel_cands, title) in example_map.items()
    ]
    return rel
# Fragment from inside a per-query loop; doc_id iterates over the query's candidate documents.
            continue
        n_docs += 1
        doc_text = corpus[doc_id]
        doc = nlp(doc_text[:10000])
        sentences = [str(sent).strip() for sent in doc.sents]
        # Slide a window of up to args.max_length sentences over the document with the given stride.
        for i in range(0, len(sentences), args.stride):
            segment = ' '.join(sentences[i:i + args.max_length])
            passages.append([doc_id, segment])
            n_segments += 1
            if i + args.max_length >= len(sentences):
                break

    print(f'{query_id}: Reranking...')
    # Rerank with pygaggle
    query = Query(query_text)
    texts = [Text(p[1], {'docid': p[0]}, 0) for p in passages]
    start = time()
    ranked_results = reranker.rerank(query, texts)
    end = time()
    elapsed_time = end - start
    print("Time Elapsed: {:.1f}".format(elapsed_time))

    # Get scores from the reranker: keep the best segment score per document (MaxP aggregation).
    final_t5_scores = {}
    for result in ranked_results:
        if result.metadata["docid"] not in final_t5_scores:
            final_t5_scores[result.metadata["docid"]] = result.score
        else:
            if final_t5_scores[result.metadata["docid"]] < result.score:
                final_t5_scores[result.metadata["docid"]] = result.score
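The fragment stops after the MaxP aggregation, leaving `final_t5_scores` as a docid-to-best-segment-score map. A small, hypothetical helper like the one below (the function name, run tag, and file handle are placeholders, not from the source) illustrates how such a map is typically flushed as one query's block of a TREC-format run:

from typing import Dict, TextIO

def write_trec_block(query_id: str, doc_scores: Dict[str, float], run_file: TextIO,
                     tag: str = 'monot5-maxp') -> None:
    """Write one query's documents, best score first, in TREC run format."""
    ranked = sorted(doc_scores.items(), key=lambda kv: kv[1], reverse=True)
    for rank, (doc_id, score) in enumerate(ranked, start=1):
        run_file.write(f'{query_id} Q0 {doc_id} {rank} {score} {tag}\n')

In the script above this would be called once per `query_id`, passing the `final_t5_scores` just computed.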
def rerank(self, query: str, texts: List[str]) -> List[float]:
    ranked_results = self.ranker.rerank(Query(query), [Text(t) for t in texts])
    scores = [r.score for r in ranked_results]
    return scores
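A possible way to wire this scores-only wrapper up and consume its output (the host class, passages, and query are hypothetical; only the pygaggle calls mirror the method above, which assumes the reranker returns scores aligned with the input order):

from typing import List
from pygaggle.rerank.base import Query, Text
from pygaggle.rerank.transformer import MonoT5


class ScoreOnlyReranker:
    """Hypothetical host class: anything that stores a pygaggle reranker as self.ranker."""

    def __init__(self):
        self.ranker = MonoT5()

    def rerank(self, query: str, texts: List[str]) -> List[float]:
        ranked_results = self.ranker.rerank(Query(query), [Text(t) for t in texts])
        return [r.score for r in ranked_results]


scorer = ScoreOnlyReranker()
passages = ['Tf-idf weights terms by their frequency.',
            'Information retrieval ranks documents by relevance to a query.']
scores = scorer.rerank('what is information retrieval', passages)
# Scores come back aligned with the input passages, so zip and sort to rank them.
for passage, score in sorted(zip(passages, scores), key=lambda x: x[1], reverse=True):
    print(f'{score:.3f}\t{passage}')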
def main():
    apb = ArgumentParserBuilder()
    apb.add_opts(
        opt('--task', type=str, default='wikipedia'),
        opt('--method', type=str, required=True, choices=METHOD_CHOICES),
        opt('--retrieval-file',
            type=Path,
            required=True,
            help='JSON file containing top passages selected by the retrieval model'),
        opt('--model-name',
            type=str,
            default='facebook/dpr-reader-single-nq-base',
            help='Pretrained model for reader'),
        opt('--tokenizer-name',
            type=str,
            default='facebook/dpr-reader-single-nq-base',
            help='Pretrained model for tokenizer'),
        opt('--num-spans',
            type=int,
            default=1,
            help='Number of answer spans to return'),
        opt('--max-answer-length',
            type=int,
            default=10,
            help='Maximum length that an answer span can be'),
        opt('--num-spans-per-passage',
            type=int,
            default=10,
            help='Maximum number of answer spans to return per passage'),
        opt('--output-file',
            type=Path,
            default=None,
            help='File to output predictions for each example; if not specified, this output will be discarded'),
        opt('--device',
            type=str,
            default='cuda:0',
            help='Device for model computations'),
        opt('--batch-size',
            type=int,
            default=16,
            help='Batch size of reader inference'),
        opt('--topk-retrieval',
            type=int,
            default=[],
            nargs='+',
            help='Values of k to print the top-k accuracy of the retrieval file'),
        opt('--topk-em',
            type=int,
            default=[50],
            nargs='+',
            help='Values of k to print the top-k exact match score'),
    )
    args = apb.parser.parse_args()
    options = PassageReadingEvaluationOptions(**vars(args))

    logging.info("Loading the Retrieval File")
    with open(options.retrieval_file) as f:
        data = json.load(f)

    if args.topk_retrieval:
        logging.info("Evaluating Topk Retrieval Accuracies")
        subprocess.call([
            'python', 'tools/scripts/dpr/evaluate_retrieval.py',
            '--retrieval', options.retrieval_file,
            '--topk', *map(str, args.topk_retrieval)
        ])

    logging.info("Loading Reader Model and Tokenizer")
    construct_map = dict(dpr=construct_dpr)
    reader = construct_map[options.method](options)
    evaluator = ReaderEvaluator(reader)

    max_topk_passages = max(options.topk_em)
    examples = []
    for _, item in data.items():
        examples.append(
            RetrievalExample(
                query=Query(text=item["question"]),
                texts=list(
                    map(
                        lambda context: Text(
                            text=context["text"].split('\n', 1)[1].replace('""', '"'),
                            title=context["text"].split('\n', 1)[0].replace('"', '')),
                        item["contexts"]))[:max_topk_passages],
                ground_truth_answers=item["answers"],
            ))

    dpr_predictions = [] if args.output_file is not None else None
    ems = evaluator.evaluate(examples, options.topk_em, dpr_predictions)

    logging.info('Reader completed')
    for k in options.topk_em:
        em = np.mean(np.array(ems[k])) * 100.
        logging.info(f'Top{k}\tExact Match Accuracy: {em}')

    if args.output_file is not None:
        with open(args.output_file, 'w', encoding='utf-8', newline='\n') as f:
            json.dump(dpr_predictions, f, indent=4)
print(len(query_dict))

pass_dict = {}
with open(f"{args.input_file}.trec_json") as f:
    for line in f:
        a, b = line.strip().split("indri #")
        qid, _, docid, _, _ = a.strip().split()
        if qid not in pass_dict:
            pass_dict[qid] = {}
        temp = json.loads(b)
        pass_dict[qid][docid] = temp["doc"]["body"]  # + " " + temp["doc"]["title"]
print(len(pass_dict))

lines = []
for qid in tqdm(query_dict):
    query = Query(query_dict[qid])
    passages = pass_dict[qid].items()
    texts = [Text(p[1], {'docid': p[0]}, 0) for p in passages]
    reranked = reranker.rerank(query, texts)
    reranked.sort(key=lambda x: x.score, reverse=True)
    for i, res in enumerate(reranked, start=1):
        docid = res.raw["docid"]
        score = res.score
        lines.append(f"{qid} Q0 {docid} {i} {score} bert\n")

with open(f"{args.input_file}_{args.query_field}_t5", "w") as f:
    f.write("".join(lines))