Example #1
    def test_qrels_bad_line(self):
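        # A malformed entry ("BAD LINE") should make qrels parsing fail with a RuntimeError.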
        mock_file = StringFile('''
Q0 0 D1 3
Q0 1 D2   2

Q0 0\tD3 3
Q0 1 D2 1
BAD LINE
Q1 0 D2 1
'''.lstrip())

        QREL_DEFS = {}

        qrels = TrecQrels(mock_file, QREL_DEFS)
        with self.assertRaises(RuntimeError):
            list(qrels.qrels_iter())
Example #2
    def test_qrels(self):
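        # Lines are parsed as (query_id, iteration, doc_id, relevance); blank lines,
        # repeated spaces and tabs between fields are all tolerated.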
        mock_file = StringFile('''
Q0 0 D1 3
Q0 1 D2   2

Q0 0\tD3 3
Q0 1 D2 1
Q1 0 D2 1
'''.lstrip())
        QREL_DEFS = {}
        expected_results = [
            TrecQrel('Q0', 'D1', 3, '0'),
            TrecQrel('Q0', 'D2', 2, '1'),
            TrecQrel('Q0', 'D3', 3, '0'),
            TrecQrel('Q0', 'D2', 1, '1'),
            TrecQrel('Q1', 'D2', 1, '0'),
        ]

        qrels = TrecQrels(mock_file, QREL_DEFS)
        self.assertEqual(qrels.qrels_path(), 'MOCK')
        self.assertEqual(qrels.qrels_defs(), QREL_DEFS)
        self.assertEqual(list(qrels.qrels_iter()), expected_results)
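
For context, the same TrecQrel tuples are what ir_datasets exposes through its public API. A minimal sketch (assuming the msmarco-passage/dev/small dataset ID, whose qrels are fetched on first use):

import ir_datasets

# Any dataset with relevance judgments yields namedtuples carrying
# query_id, doc_id, relevance and iteration.
dataset = ir_datasets.load('msmarco-passage/dev/small')
for qrel in dataset.qrels_iter():
    print(qrel.query_id, qrel.doc_id, qrel.relevance, qrel.iteration)
    break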
Example #3
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)

    subsets = {}

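    # Relevance judgments and training doc-pairs are shared across every language;
    # small_dev_qids lazily collects the query IDs in the small dev qrels so that
    # queries can later be filtered down to that subset.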
    train_qrels = ir_datasets.registry['msmarco-passage/train'].qrels_handler()
    train_docpairs = TsvDocPairs(dlc['train/triples'])
    dev_qrels = TrecQrels(dlc['dev/qrels'], QRELS_DEFS)
    dev_small_qrels = TrecQrels(dlc['dev/qrels-small'], QRELS_DEFS)
    small_dev_qids = Lazy(
        lambda: {q.query_id
                 for q in dev_small_qrels.qrels_iter()})

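    # v1: one document collection per language, paired with the shared qrels and
    # doc-pairs to form train, dev and dev/small subsets; zh and pt additionally
    # get v1.1 query variants.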
    for lang in ['es', 'fr', 'pt', 'it', 'id', 'de', 'ru', 'zh']:
        collection = TsvDocs(
            dlc[f'{lang}/docs'],
            namespace=f'mmarco/{lang}',
            lang=lang,
            count_hint=ir_datasets.util.count_hint(f'{NAME}/{lang}'))
        subsets[f'{lang}'] = Dataset(collection, documentation(f'{lang}'))
        subsets[f'{lang}/train'] = Dataset(
            collection,
            TsvQueries(dlc[f'{lang}/queries/train'],
                       namespace=f'mmarco/{lang}',
                       lang=lang), train_qrels, train_docpairs,
            documentation(f'{lang}/train'))
        subsets[f'{lang}/dev'] = Dataset(
            collection,
            TsvQueries(dlc[f'{lang}/queries/dev'],
                       namespace=f'mmarco/{lang}',
                       lang=lang), dev_qrels, documentation(f'{lang}/dev'))
        subsets[f'{lang}/dev/small'] = Dataset(
            collection,
            FilteredQueries(subsets[f'{lang}/dev'].queries_handler(),
                            small_dev_qids,
                            mode='include'), dev_small_qrels,
            TrecScoredDocs(dlc[f'{lang}/scoreddocs/dev'])
            if lang not in ('zh', 'pt') else None,
            documentation(f'{lang}/dev/small'))
        if lang in ('zh', 'pt'):
            subsets[f'{lang}/dev/v1.1'] = Dataset(
                collection,
                TsvQueries(dlc[f'{lang}/queries/dev/v1.1'],
                           namespace=f'mmarco/{lang}',
                           lang=lang), dev_qrels,
                documentation(f'{lang}/dev/v1.1'))
            subsets[f'{lang}/dev/small/v1.1'] = Dataset(
                collection,
                FilteredQueries(subsets[f'{lang}/dev/v1.1'].queries_handler(),
                                small_dev_qids,
                                mode='include'), dev_small_qrels,
                TrecScoredDocs(dlc[f'{lang}/scoreddocs/dev/v1.1']),
                documentation(f'{lang}/dev/small/v1.1'))
        if lang in ('pt', ):
            subsets[f'{lang}/train/v1.1'] = Dataset(
                collection,
                TsvQueries(dlc[f'{lang}/queries/train/v1.1'],
                           namespace=f'mmarco/{lang}',
                           lang=lang), train_qrels, train_docpairs,
                documentation(f'{lang}/train/v1.1'))

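    # v2: the same layout for the v2 release, which covers additional languages.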
    for lang in [
            'ar', 'zh', 'dt', 'fr', 'de', 'hi', 'id', 'it', 'ja', 'pt', 'ru',
            'es', 'vi'
    ]:
        collection = TsvDocs(
            dlc[f'v2/{lang}/docs'],
            namespace=f'mmarco/{lang}',
            lang=lang,
            count_hint=ir_datasets.util.count_hint(f'{NAME}/v2/{lang}'))
        subsets[f'v2/{lang}'] = Dataset(collection,
                                        documentation(f'v2/{lang}'))
        subsets[f'v2/{lang}/train'] = Dataset(
            collection,
            TsvQueries(dlc[f'v2/{lang}/queries/train'],
                       namespace=f'mmarco/v2/{lang}',
                       lang=lang), train_qrels, train_docpairs,
            documentation(f'v2/{lang}/train'))
        subsets[f'v2/{lang}/dev'] = Dataset(
            collection,
            TsvQueries(dlc[f'v2/{lang}/queries/dev'],
                       namespace=f'mmarco/v2/{lang}',
                       lang=lang), dev_qrels, documentation(f'v2/{lang}/dev'))
        subsets[f'v2/{lang}/dev/small'] = Dataset(
            collection,
            FilteredQueries(subsets[f'v2/{lang}/dev'].queries_handler(),
                            small_dev_qids,
                            mode='include'), dev_small_qrels,
            TrecScoredDocs(dlc[f'v2/{lang}/scoreddocs/dev'],
                           negate_score=True),
            documentation(f'v2/{lang}/dev/small'))

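    # Register the top-level dataset and every language subset under the NAME prefix.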
    ir_datasets.registry.register(NAME, Dataset(documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return collection, subsets
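
Once _init() has registered everything, the subsets can be loaded by ID like any other ir_datasets dataset. A minimal sketch, assuming NAME is 'mmarco' and the Spanish dev/small files can be downloaded:

import ir_datasets

# Hypothetical usage; the dataset ID assumes NAME == 'mmarco'.
dataset = ir_datasets.load('mmarco/es/dev/small')
for query in dataset.queries_iter():
    print(query.query_id, query.text)
    break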