def to_relevance_examples(self, index_path: str, is_duo: bool = False) -> List[RelevanceExample]:
    """Load candidate passages from the index and convert them to RelevanceExample
    objects, logging the expected metrics of a random ordering and the metrics of
    the existing candidate ordering."""
    loader = MsMarcoPassageLoader(index_path)
    example_map = {}
    # Group candidates by query id: [query text, cand ids, cand texts, relevance labels].
    for (qid, text, rel_cands), cands in self.query_passage_tuples():
        if qid not in example_map:
            example_map[qid] = [convert_to_unicode(text), [], [], []]
        example_map[qid][1].append(cands[0])
        try:
            passages = [loader.load_passage(cand) for cand in cands]
            example_map[qid][2].append(convert_to_unicode(passages[0].all_text))
        except ValueError:
            # Passage lookup failed; `passages` may be unbound here, so log the candidate ids.
            logging.warning(f'Skipping {cands}')
            continue
        example_map[qid][3].append(cands[0] in rel_cands)
    mean_stats = defaultdict(list)
    for ex in self.examples:
        int_rels = np.array(list(map(int, example_map[ex.qid][3])))
        p = int_rels.sum() / (len(ex.candidates) - 1) if is_duo else int_rels.sum()
        mean_stats['Random P@1'].append(np.mean(int_rels))
        # p: relevant candidates, n: non-relevant candidates, N: all candidates.
        n = len(ex.candidates) - p
        N = len(ex.candidates)
        if len(ex.candidates) <= 1000:
            mean_stats['Random R@1000'].append(1 if 1 in int_rels else 0)
        # Expected MRR under a uniformly random ordering of the candidates:
        # P(first relevant at rank i + 1) = C(n, i) / C(N, i) * p / (N - i).
        numer = np.array(
            [sp.comb(n, i) / (N - i) for i in range(0, n + 1) if i != N]) * p
        if n == N:
            numer = np.append(numer, 0)
        denom = np.array([sp.comb(N, i) for i in range(0, n + 1)])
        rr = 1 / np.arange(1, n + 2)
        rmrr = np.sum(numer * rr / denom)
        mean_stats['Random MRR'].append(rmrr)
        rmrr10 = np.sum(numer[:10] * rr[:10] / denom[:10])
        mean_stats['Random MRR@10'].append(rmrr10)
        # MRR of the existing (retrieved) ordering: rank of the first relevant candidate.
        ex_index = len(ex.candidates)
        for rel_cand in ex.relevant_candidates:
            if rel_cand in ex.candidates:
                ex_index = min(ex.candidates.index(rel_cand), ex_index)
        mean_stats['Existing MRR'].append(
            1 / (ex_index + 1) if ex_index < len(ex.candidates) else 0)
        mean_stats['Existing MRR@10'].append(1 / (ex_index + 1) if ex_index < 10 else 0)
    for k, v in mean_stats.items():
        logging.info(f'{k}: {np.mean(v)}')
    return [
        RelevanceExample(
            Query(text=query_text, id=qid),
            list(map(lambda s: Text(s[1], dict(docid=s[0])),
                     zip(cands, cands_text))),
            rel_cands)
        for qid, (query_text, cands, cands_text, rel_cands) in example_map.items()
    ]
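
# The closed-form expression above gives the expected MRR when the p relevant and
# n non-relevant candidates are shuffled uniformly at random:
#   P(first relevant at rank i + 1) = C(n, i) / C(N, i) * p / (N - i).
# The helper below is an illustrative sketch only (not part of the original module
# and not called anywhere): it recomputes the closed form and checks it against a
# Monte Carlo simulation. The function name and the use of `random` are assumptions
# made for illustration; `np` and `sp` are the numpy/scipy.special modules already
# used above.
def _random_ordering_mrr_sketch(p: int, n: int, trials: int = 100000):
    import random
    N = p + n
    # Closed form, mirroring the computation in to_relevance_examples above.
    numer = np.array([sp.comb(n, i) / (N - i) for i in range(0, n + 1) if i != N]) * p
    if n == N:
        numer = np.append(numer, 0)
    denom = np.array([sp.comb(N, i) for i in range(0, n + 1)])
    rr = 1 / np.arange(1, n + 2)
    closed_form = np.sum(numer * rr / denom)
    # Monte Carlo estimate: shuffle the labels and average 1 / rank of the first relevant.
    labels = [1] * p + [0] * n
    total = 0.0
    for _ in range(trials):
        random.shuffle(labels)
        total += 1 / (labels.index(1) + 1) if p > 0 else 0.0
    return closed_form, total / trials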
def to_relevance_examples(self, index_path: str) -> List[RelevanceExample]:
    """Load candidate documents (abstracts and titles) from the index and convert
    them to RelevanceExample objects, logging the expected metrics of a random
    ordering and the metrics of the existing candidate ordering."""
    loader = Cord19AbstractLoader(index_path)
    example_map = {}
    # Group candidates by query id: [query text, cand ids, abstracts, relevance labels, titles].
    for (qid, text, rel_cands), cands in tqdm(self.query_document_tuples()):
        if qid not in example_map:
            example_map[qid] = [convert_to_unicode(text), [], [], [], []]
        example_map[qid][1].append(cands[0])
        try:
            passages = [loader.load_document(cand) for cand in cands]
            # Sometimes the abstract is empty.
            example_map[qid][2].append(convert_to_unicode(passages[0].abstract))
            example_map[qid][4].append(convert_to_unicode(passages[0].title))
        except ValueError as e:
            logging.error(e)
            logging.warning('Skipping passages')
            continue
        example_map[qid][3].append(cands[0] in rel_cands)
    mean_stats = defaultdict(list)
    for ex in self.examples:
        int_rels = np.array(list(map(int, example_map[ex.qid][3])))
        p = int(int_rels.sum())
        mean_stats['Expected P@1 for Random Ordering'].append(np.mean(int_rels))
        # p: relevant candidates, n: non-relevant candidates, N: all candidates.
        n = len(ex.candidates) - p
        N = len(ex.candidates)
        if len(ex.candidates) <= 1000:
            mean_stats['Expected R@1000 for Random Ordering'].append(
                1 if 1 in int_rels else 0)
        # Expected MRR under a uniformly random ordering of the candidates.
        numer = np.array(
            [sp.comb(n, i) / (N - i) for i in range(0, n + 1) if i != N]) * p
        if n == N:
            numer = np.append(numer, 0)
        denom = np.array([sp.comb(N, i) for i in range(0, n + 1)])
        rr = 1 / np.arange(1, n + 2)
        rmrr = np.sum(numer * rr / denom)
        mean_stats['Expected MRR for Random Ordering'].append(rmrr)
        rmrr10 = np.sum(numer[:10] * rr[:10] / denom[:10])
        mean_stats['Expected MRR@10 for Random Ordering'].append(rmrr10)
        # MRR of the existing (retrieved) ordering: rank of the first relevant candidate.
        ex_index = len(ex.candidates)
        for rel_cand in ex.relevant_candidates:
            if rel_cand in ex.candidates:
                ex_index = min(ex.candidates.index(rel_cand), ex_index)
        mean_stats['Existing MRR'].append(
            1 / (ex_index + 1) if ex_index < len(ex.candidates) else 0)
        mean_stats['Existing MRR@10'].append(1 / (ex_index + 1) if ex_index < 10 else 0)
    for k, v in mean_stats.items():
        logging.info(f'{k}: {np.mean(v)}')
    rel = [
        RelevanceExample(
            Query(text=query_text, id=qid),
            list(map(lambda s: Text(s[1], dict(docid=s[0]), title=s[2]),
                     zip(cands, cands_text, title))),
            rel_cands)
        for qid, (query_text, cands, cands_text, rel_cands, title) in example_map.items()
    ]
    return rel
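
# For reference, the 'Existing MRR' statistics logged above score the candidate
# ordering as retrieved: the reciprocal rank of the highest-ranked relevant
# candidate, optionally cut off at a fixed depth. The helper below is an
# illustrative, self-contained sketch of that computation (the function name is
# an assumption; it is not called anywhere in this module).
def _existing_mrr_sketch(candidates, relevant_candidates, cutoff=None):
    # Find the best (smallest) rank index of any relevant candidate, as in the loop above.
    best_index = len(candidates)
    for rel_cand in relevant_candidates:
        if rel_cand in candidates:
            best_index = min(candidates.index(rel_cand), best_index)
    if best_index >= len(candidates):
        return 0.0
    if cutoff is not None and best_index >= cutoff:
        return 0.0
    return 1 / (best_index + 1)

# Example: for candidates ['d3', 'd7', 'd1'] with 'd7' relevant, the sketch returns
# 1/2 for both MRR and MRR@10; with cutoff=1 it returns 0.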