Пример #1
0
 def to_relevance_examples(self,
                           index_path: str,
                           is_duo: bool = False) -> List[RelevanceExample]:
     """Convert the dataset's query/passage tuples into relevance examples.

     Tuples yielded by ``self.query_passage_tuples()`` are grouped by query
     id. For every tuple the candidates are loaded through a
     ``MsMarcoPassageLoader`` opened on ``index_path``; the first
     candidate's id, text and relevance flag are recorded together. A tuple
     whose passages cannot be loaded (``ValueError``) is skipped as a whole,
     so the id, text and relevance lists of a query always stay aligned.

     As a side effect, the mean over ``self.examples`` of several baseline
     statistics (random-ordering P@1 / R@1000 / MRR / MRR@10 and the MRR of
     the existing candidate order) is written with ``logging.info``.

     Args:
         index_path: Path handed to ``MsMarcoPassageLoader``.
         is_duo: When true, the summed relevance flags of a query are
             divided by ``len(candidates) - 1`` before being used as the
             number of relevant candidates.
             NOTE(review): presumably each candidate appears first in that
             many pairs in duo mode -- confirm against
             ``query_passage_tuples``.

     Returns:
         One ``RelevanceExample`` per query id seen, holding the query, the
         loaded candidate texts (each tagged with its ``docid``) and the
         list of per-candidate boolean relevance flags.
     """
     loader = MsMarcoPassageLoader(index_path)
     # qid -> [query text, candidate ids, candidate texts, relevance flags]
     example_map = {}
     for (qid, text, rel_cands), cands in self.query_passage_tuples():
         if qid not in example_map:
             example_map[qid] = [convert_to_unicode(text), [], [], []]
         first_cand = cands[0]
         try:
             passages = [loader.load_passage(cand) for cand in cands]
             texts = [
                 convert_to_unicode(passage.all_text)
                 for passage in passages
             ]
         except ValueError:
             # Log the candidate ids: the loaded passages do not exist when
             # loading is what failed.
             logging.warning(f'Skipping {cands}')
             continue
         # Append only after loading succeeded so that ids, texts and
         # relevance flags never get out of step with each other.
         example_map[qid][1].append(first_cand)
         example_map[qid][2].append(texts[0])
         example_map[qid][3].append(first_cand in rel_cands)

     mean_stats = defaultdict(list)
     for ex in self.examples:
         int_rels = np.array(list(map(int, example_map[ex.qid][3])))
         N = len(ex.candidates)
         rel_total = int(int_rels.sum())
         if is_duo and N > 1:
             # Keep p integral: it is used below as a bound for range().
             p = int(round(rel_total / (N - 1)))
         else:
             p = rel_total
         mean_stats['Random P@1'].append(np.mean(int_rels))
         n = N - p
         if N <= 1000:
             mean_stats['Random R@1000'].append(1 if 1 in int_rels else 0)
         # Term i weights reciprocal rank 1 / (i + 1); i == N is left out of
         # the comprehension because N - i would be zero there.
         numer = np.array(
             [sp.comb(n, i) / (N - i)
              for i in range(0, n + 1) if i != N]) * p
         if n == N:
             # Pad the omitted i == N term so numer lines up with denom.
             numer = np.append(numer, 0)
         denom = np.array([sp.comb(N, i) for i in range(0, n + 1)])
         rr = 1 / np.arange(1, n + 2)
         rmrr = np.sum(numer * rr / denom)
         mean_stats['Random MRR'].append(rmrr)
         rmrr10 = np.sum(numer[:10] * rr[:10] / denom[:10])
         mean_stats['Random MRR@10'].append(rmrr10)
         # Position of the best-ranked relevant candidate in the existing
         # order; N means that no relevant candidate is present.
         ex_index = min(
             (ex.candidates.index(rel_cand)
              for rel_cand in ex.relevant_candidates
              if rel_cand in ex.candidates),
             default=N)
         mean_stats['Existing MRR'].append(
             1 / (ex_index + 1) if ex_index < N else 0)
         mean_stats['Existing MRR@10'].append(
             1 / (ex_index + 1) if ex_index < 10 else 0)
     for k, v in mean_stats.items():
         logging.info(f'{k}: {np.mean(v)}')
     return [
         RelevanceExample(
             Query(text=query_text, id=qid),
             [
                 Text(doc_text, dict(docid=docid))
                 for docid, doc_text in zip(cands, cands_text)
             ],
             rel_cands)
         for qid, (query_text, cands, cands_text,
                   rel_cands) in example_map.items()
     ]
Пример #2
0
    def to_relevance_examples(self, index_path: str) -> List[RelevanceExample]:
        """Convert the dataset's query/document tuples into relevance examples.

        Tuples yielded by ``self.query_document_tuples()`` are grouped by
        query id. For every tuple the candidates are loaded through a
        ``Cord19AbstractLoader`` opened on ``index_path``; the first
        candidate's id, abstract, title and relevance flag are recorded
        together. A tuple whose documents cannot be loaded or converted
        (``ValueError``) is skipped as a whole, so the per-query lists always
        stay aligned.

        As a side effect, the mean over ``self.examples`` of several baseline
        statistics (expected P@1 / R@1000 / MRR / MRR@10 under a random
        ordering and the MRR of the existing candidate order) is written with
        ``logging.info``.

        Args:
            index_path: Path handed to ``Cord19AbstractLoader``.

        Returns:
            One ``RelevanceExample`` per query id seen, holding the query, the
            candidate abstracts (each tagged with its ``docid`` and title) and
            the list of per-candidate boolean relevance flags.
        """
        loader = Cord19AbstractLoader(index_path)
        # qid -> [query text, candidate ids, abstracts, relevance flags,
        #         titles]
        example_map = {}
        for (qid, text,
             rel_cands), cands in tqdm(self.query_document_tuples()):
            if qid not in example_map:
                example_map[qid] = [convert_to_unicode(text), [], [], [], []]
            first_cand = cands[0]
            try:
                passages = [loader.load_document(cand) for cand in cands]
                # Sometimes this abstract is empty.
                abstracts = [
                    convert_to_unicode(passage.abstract)
                    for passage in passages
                ]
                titles = [
                    convert_to_unicode(passage.title) for passage in passages
                ]
            except ValueError as e:
                logging.error(e)
                logging.warning('Skipping passages')
                continue
            # Append only after everything succeeded so that ids, abstracts,
            # titles and relevance flags never get out of step.
            example_map[qid][1].append(first_cand)
            example_map[qid][2].append(abstracts[0])
            example_map[qid][4].append(titles[0])
            example_map[qid][3].append(first_cand in rel_cands)
        mean_stats = defaultdict(list)

        for ex in self.examples:
            int_rels = np.array(list(map(int, example_map[ex.qid][3])))
            p = int(int_rels.sum())
            mean_stats['Expected P@1 for Random Ordering'].append(
                np.mean(int_rels))
            N = len(ex.candidates)
            n = N - p
            if N <= 1000:
                mean_stats['Expected R@1000 for Random Ordering'].append(
                    1 if 1 in int_rels else 0)
            # Term i weights reciprocal rank 1 / (i + 1); i == N is left out
            # of the comprehension because N - i would be zero there.
            numer = np.array(
                [sp.comb(n, i) / (N - i)
                 for i in range(0, n + 1) if i != N]) * p
            if n == N:
                # Pad the omitted i == N term so numer lines up with denom.
                numer = np.append(numer, 0)
            denom = np.array([sp.comb(N, i) for i in range(0, n + 1)])
            rr = 1 / np.arange(1, n + 2)
            rmrr = np.sum(numer * rr / denom)
            mean_stats['Expected MRR for Random Ordering'].append(rmrr)
            rmrr10 = np.sum(numer[:10] * rr[:10] / denom[:10])
            mean_stats['Expected MRR@10 for Random Ordering'].append(rmrr10)
            # Position of the best-ranked relevant candidate in the existing
            # order; N means that no relevant candidate is present.
            ex_index = min(
                (ex.candidates.index(rel_cand)
                 for rel_cand in ex.relevant_candidates
                 if rel_cand in ex.candidates),
                default=N)
            mean_stats['Existing MRR'].append(
                1 / (ex_index + 1) if ex_index < N else 0)
            mean_stats['Existing MRR@10'].append(
                1 / (ex_index + 1) if ex_index < 10 else 0)
        for k, v in mean_stats.items():
            logging.info(f'{k}: {np.mean(v)}')
        rel = [
            RelevanceExample(
                Query(text=query_text, id=qid),
                [
                    Text(doc_text, dict(docid=docid), title=doc_title)
                    for docid, doc_text, doc_title in zip(
                        cands, cands_text, title)
                ],
                rel_cands)
            for qid, (query_text, cands, cands_text, rel_cands,
                      title) in example_map.items()
        ]
        return rel