def _init():
    """Build and register the ``nyt`` dataset and its ``train``/``valid`` subsets.

    Returns:
        A ``(base, subsets)`` tuple: the documents-only base dataset and a
        dict mapping each subset name to its ``Dataset``.
    """
    home = ir_datasets.util.home_path()/NAME
    downloads = DownloadConfig.context(NAME, home)
    docs_yaml = YamlDocumentation(f'docs/{NAME}.yaml')

    corpus = NytDocs(downloads['source'])
    base = Dataset(corpus, docs_yaml('_'))

    queries = NytQueries(corpus)
    qrels = NytQrels(corpus)
    # Both splits are filtered on the same ID set (VALID_IDS): 'valid' is built
    # with mode='include' and 'train' with mode='exclude'.
    valid_qids = Lazy(lambda: VALID_IDS)

    subsets = {}
    for split, mode in (('train', 'exclude'), ('valid', 'include')):
        subsets[split] = Dataset(
            FilteredQueries(queries, valid_qids, mode=mode),
            FilteredQrels(qrels, valid_qids, mode=mode),
            corpus,
            docs_yaml(split))

    ir_datasets.registry.register('nyt', base)
    for name in sorted(subsets):
        ir_datasets.registry.register(f'nyt/{name}', subsets[name])

    return base, subsets
def _init():
    """Build and register the base dataset named ``NAME`` plus one subset per fold.

    Returns:
        A ``(base, subsets)`` tuple: the full dataset (docs, queries, qrels)
        and a dict mapping each entry of ``FOLDS`` to its filtered ``Dataset``.
    """
    docs_yaml = YamlDocumentation(f'docs/{NAME}.yaml')
    home = ir_datasets.util.home_path() / NAME
    downloads = DownloadConfig.context(NAME, home, dua=DUA)

    corpus_globs = [
        '**/FBIS/FB*',
        '**/FR94/??/FR*',
        '**/FT/*/FT*',
        '**/LATIMES/LA*',
    ]
    corpus = TrecDocs(
        downloads['docs'],
        path_globs=corpus_globs,
        namespace=NAME,
        lang='en',
        expected_file_count=2295,
        count_hint=ir_datasets.util.count_hint(NAME))
    topics = TrecQueries(GzipExtract(downloads['queries']), namespace=NAME, lang='en')
    judgments = TrecQrels(downloads['qrels'], QREL_DEFS)
    base = Dataset(corpus, topics, judgments, docs_yaml('_'))

    def build_fold(fold):
        # Queries and qrels of one fold share a single filter from make_filter.
        fold_filter = make_filter(fold)
        return Dataset(
            FilteredQueries(topics, fold_filter),
            FilteredQrels(judgments, fold_filter),
            corpus,
            docs_yaml(fold))

    subsets = {}
    for fold in FOLDS:
        subsets[fold] = build_fold(fold)

    ir_datasets.registry.register(NAME, base)
    for name in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{name}', subsets[name])

    return base, subsets
def _init():
    """Build and register the ``nyt`` dataset family.

    Registers the base corpus as ``nyt`` together with the
    ``trec-core-2017``, ``wksup``, ``wksup/train`` and ``wksup/valid``
    subsets.

    Returns:
        A ``(base, subsets)`` tuple: the documents-only base dataset and a
        dict mapping each subset name to its ``Dataset``.
    """
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    migrator = Migrator(
        base_path/'irds_version.txt', 'v2',
        affected_files=[base_path/'nyt.tgz.pklz4'],
        message='Migrating nyt (extracting body text)')

    collection = migrator(NytDocs(dlc['source']))
    base = Dataset(collection, documentation('_'))

    # core17
    subsets['trec-core-2017'] = Dataset(
        TrecQueries(dlc['trec-core-2017/queries'], namespace='trec-core-2017', lang='en'),
        TrecQrels(dlc['trec-core-2017/qrels'], CORE_QREL_DEFS),
        collection,
        documentation('trec-core-2017'))

    # wksup
    all_queries = NytQueries(collection)
    all_qrels = NytQrels(collection)
    match_qids = Lazy(lambda: VALID_IDS)

    # The unfiltered subset uses its own documentation key. It previously
    # reused 'wksup/train', which attached the training-split docs to it.
    subsets['wksup'] = Dataset(
        all_queries,
        all_qrels,
        collection,
        documentation('wksup'))
    subsets['wksup/train'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='exclude'),
        FilteredQrels(all_qrels, match_qids, mode='exclude'),
        collection,
        documentation('wksup/train'))
    subsets['wksup/valid'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='include'),
        FilteredQrels(all_qrels, match_qids, mode='include'),
        collection,
        documentation('wksup/valid'))

    ir_datasets.registry.register('nyt', base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'nyt/{s}', subsets[s])

    return base, subsets
def _init():
    """Build and register the dataset named ``NAME`` (MS MARCO v2 passages) and its subsets.

    Returns:
        A ``(collection, subsets)`` tuple: the migrated passage collection
        and a dict mapping each subset name to its (undocumented) ``Dataset``;
        documentation is attached at registration time.
    """
    home = ir_datasets.util.home_path()/NAME
    docs_yaml = YamlDocumentation(f'docs/{NAME}.yaml')
    downloads = DownloadConfig.context(NAME, home, dua=DUA)
    subsets = {}

    docs_migrator = Migrator(
        home/'irds_version.txt', 'v2',
        affected_files=[home/'msmarco_v2_passage.tar.pklz4'],
        message='Cleaning up pklz4 lookup structure in favor of ID-based lookups')
    collection = docs_migrator(MsMarcoV2Passages(downloads['passages']))

    migrated_splits = ('train', 'dev1', 'dev2')
    qrels_migrator = Migrator(
        home/'qrels_version.txt', 'v2',
        affected_files=[home/split/'qrels.tsv' for split in migrated_splits],
        message='Updating qrels (task organizers removed duplicates)')

    # train / dev1 / dev2 are built identically; only the download keys differ.
    for split in migrated_splits:
        subsets[split] = Dataset(
            collection,
            TsvQueries(downloads[f'{split}/queries'], namespace='msmarco', lang='en'),
            qrels_migrator(TrecQrels(downloads[f'{split}/qrels'], QRELS_DEFS)),
            TrecScoredDocs(GzipExtract(downloads[f'{split}/scoreddocs'])),
        )

    dl21 = Dataset(
        collection,
        TsvQueries(downloads['trec-dl-2021/queries'], namespace='msmarco', lang='en'),
        TrecQrels(downloads['trec-dl-2021/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(downloads['trec-dl-2021/scoreddocs'])),
    )
    subsets['trec-dl-2021'] = dl21

    # Query IDs that occur in the trec-dl-2021 qrels, computed on first use.
    dl21_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2021'].qrels_iter()})
    subsets['trec-dl-2021/judged'] = Dataset(
        FilteredQueries(dl21.queries_handler(), dl21_judged),
        FilteredScoredDocs(dl21.scoreddocs_handler(), dl21_judged),
        dl21,
    )

    ir_datasets.registry.register(NAME, Dataset(collection, docs_yaml("_")))
    for name in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{name}', Dataset(subsets[name], docs_yaml(name)))

    return collection, subsets
def _init():
    """Build and register the dataset named ``NAME`` (MS MARCO passages) and all of its subsets.

    Returns:
        A ``(collection, subsets)`` tuple: the migrated passage collection
        and a dict mapping each subset name to its ``Dataset``. Documentation
        is attached when each subset is registered.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)

    migrator = Migrator(
        base_path / 'irds_version.txt', 'v2',
        affected_files=[base_path / 'collection.tsv', base_path / 'collection.tsv.pklz4'],
        message=f'Migrating {NAME} (fixing passage encoding)')

    collection_tsv = Cache(
        FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')),
        base_path / 'collection.tsv')
    collection = TsvDocs(
        collection_tsv,
        namespace='msmarco',
        lang='en',
        docstore_size_hint=14373971970,
        count_hint=ir_datasets.util.count_hint(NAME))
    collection = migrator(collection)

    subsets = {}

    def msmarco_queries(source):
        # All query sets in this function share namespace and language.
        return TsvQueries(source, namespace='msmarco', lang='en')

    def qidpid_run(source, cache_name):
        # Scored docs passed through ExtractQidPid and cached under base_path.
        return TrecScoredDocs(Cache(ExtractQidPid(source), base_path / cache_name))

    def judged_qids(subset_name):
        # Lazily-computed set of query IDs found in the named subset's qrels.
        return Lazy(lambda: {q.query_id for q in subsets[subset_name].qrels_iter()})

    subsets['train'] = Dataset(
        collection,
        msmarco_queries(Cache(TarExtract(dlc['queries'], 'queries.train.tsv'), base_path / 'train/queries.tsv')),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        qidpid_run(TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt'), 'train/ms.run'),
    )
    train = subsets['train']

    subsets['train/triples-v2'] = Dataset(
        collection,
        train.queries_handler(),
        train.qrels_handler(),
        TsvDocPairs(GzipExtract(dlc['train/docpairs/v2'])),
        train.scoreddocs_handler(),
    )

    subsets['train/triples-small'] = Dataset(
        collection,
        train.queries_handler(),
        train.qrels_handler(),
        TsvDocPairs(Cache(
            MapSmallTriplesQidPid(
                TarExtract(dlc['train/docpairs/small'], 'triples.train.small.tsv'),
                TarExtract(dlc['collectionandqueries'], 'collection.tsv'),
                train.queries_handler()),
            base_path / 'train/small.triples.qidpid.tsv')),
        train.scoreddocs_handler(),
    )

    subsets['dev'] = Dataset(
        collection,
        msmarco_queries(Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'), base_path / 'dev/queries.tsv')),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
    )

    subsets['dev/small'] = Dataset(
        collection,
        msmarco_queries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'),
            base_path / 'dev/small/queries.tsv')),
        TrecQrels(
            Cache(TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'), base_path / 'dev/small/qrels'),
            QRELS_DEFS),
        qidpid_run(TarExtract(dlc['dev/scoreddocs'], 'top1000.dev'), 'dev/ms.run'),
    )

    subsets['eval'] = Dataset(
        collection,
        msmarco_queries(Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'), base_path / 'eval/queries.tsv')),
    )

    subsets['eval/small'] = Dataset(
        collection,
        msmarco_queries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'),
            base_path / 'eval/small/queries.tsv')),
        qidpid_run(TarExtract(dlc['eval/scoreddocs'], 'top1000.eval'), 'eval/ms.run'),
    )

    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        msmarco_queries(Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path / 'trec-dl-2019/queries.tsv')),
        qidpid_run(GzipExtract(dlc['trec-dl-2019/scoreddocs']), 'trec-dl-2019/ms.run'),
    )

    subsets['trec-dl-2020'] = Dataset(
        collection,
        msmarco_queries(GzipExtract(dlc['trec-dl-2020/queries'])),
        TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
        qidpid_run(GzipExtract(dlc['trec-dl-2020/scoreddocs']), 'trec-dl-2020/ms.run'),
    )

    # A few subsets that are constrained to just the queries/qrels/docpairs that have at least
    # 1 relevance assessment
    train_judged = judged_qids('train')
    subsets['train/judged'] = Dataset(
        FilteredQueries(train.queries_handler(), train_judged),
        FilteredScoredDocs(train.scoreddocs_handler(), train_judged),
        train,
    )

    dev_judged = judged_qids('dev')
    subsets['dev/judged'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
        subsets['dev'],
    )

    for year in ('2019', '2020'):
        parent = subsets[f'trec-dl-{year}']
        year_judged = judged_qids(f'trec-dl-{year}')
        subsets[f'trec-dl-{year}/judged'] = Dataset(
            FilteredQueries(parent.queries_handler(), year_judged),
            FilteredScoredDocs(parent.scoreddocs_handler(), year_judged),
            parent,
        )

    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    for split_name, mode in (('train/split200-train', 'exclude'), ('train/split200-valid', 'include')):
        subsets[split_name] = Dataset(
            FilteredQueries(train.queries_handler(), split200, mode=mode),
            FilteredScoredDocs(train.scoreddocs_handler(), split200, mode=mode),
            FilteredQrels(train.qrels_handler(), split200, mode=mode),
            FilteredDocPairs(train.docpairs_handler(), split200, mode=mode),
            train,
        )

    # Medical subset
    def load_medical_qids():
        # One ID per line of the 'medmarco_ids' download, trailing whitespace removed.
        with dlc['medmarco_ids'].stream() as raw:
            reader = codecs.getreader('utf8')(raw)
            return {line.rstrip() for line in reader}
    train_med = Lazy(load_medical_qids)

    subsets['train/medical'] = Dataset(
        FilteredQueries(train.queries_handler(), train_med),
        FilteredScoredDocs(train.scoreddocs_handler(), train_med),
        FilteredDocPairs(train.docpairs_handler(), train_med),
        FilteredQrels(train.qrels_handler(), train_med),
        train,
    )

    # DL-Hard
    dl_hard_qrels_migrator = Migrator(
        base_path / 'trec-dl-hard' / 'irds_version.txt', 'v3',
        affected_files=[base_path / 'trec-dl-hard' / 'qrels'],
        message='Updating trec-dl-hard qrels')
    dl_hard_base_queries = msmarco_queries([
        Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path / 'trec-dl-2019/queries.tsv'),
        Cache(GzipExtract(dlc['trec-dl-2020/queries']), base_path / 'trec-dl-2020/queries.tsv'),
    ])

    all_hard_qids = Lazy(lambda: DL_HARD_QIDS)
    subsets['trec-dl-hard'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, all_hard_qids),
        dl_hard_qrels_migrator(TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard'))

    def fold_qids(fold):
        # Bind `fold` through a function scope so each Lazy sees its own fold key.
        return Lazy(lambda: DL_HARD_QIDS_BYFOLD[fold])

    for fold in ('1', '2', '3', '4', '5'):
        hard_qids = fold_qids(fold)
        subsets[f'trec-dl-hard/fold{fold}'] = Dataset(
            collection,
            FilteredQueries(dl_hard_base_queries, hard_qids),
            FilteredQrels(subsets['trec-dl-hard'], hard_qids),
            documentation(f'trec-dl-hard/fold{fold}'))

    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))

    return collection, subsets
def _init():
    """Build and register the ``msmarco-document`` dataset and its subsets.

    Returns:
        A ``(collection, subsets)`` tuple: the document collection and a dict
        mapping each subset name to its ``Dataset``. Documentation is attached
        when each subset is registered.
    """
    home = ir_datasets.util.home_path() / 'msmarco-document'
    docs_yaml = YamlDocumentation('docs/msmarco-document.yaml')
    downloads = DownloadConfig.context('msmarco-document', home, dua=DUA)
    subsets = {}

    def gz(key):
        # Every gzipped resource here is wrapped the same way.
        return GzipExtract(downloads[key])

    collection = MsMarcoTrecDocs(gz('docs'))

    # train and dev share the same layout of download keys.
    for split in ('train', 'dev'):
        subsets[split] = Dataset(
            collection,
            TsvQueries(gz(f'{split}/queries'), namespace='msmarco'),
            TrecQrels(gz(f'{split}/qrels'), QRELS_DEFS),
            TrecScoredDocs(gz(f'{split}/scoreddocs')),
        )

    subsets['eval'] = Dataset(
        collection,
        TsvQueries(gz('eval/queries'), namespace='msmarco'),
        TrecScoredDocs(gz('eval/scoreddocs')),
    )

    dl19 = Dataset(
        collection,
        TsvQueries(gz('trec-dl-2019/queries'), namespace='msmarco'),
        TrecQrels(downloads['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(gz('trec-dl-2019/scoreddocs')),
    )
    subsets['trec-dl-2019'] = dl19

    # No qrels component is attached to trec-dl-2020 in this function.
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(gz('trec-dl-2020/queries'), namespace='msmarco'),
        TrecScoredDocs(gz('trec-dl-2020/scoreddocs')),
    )

    subsets['orcas'] = Dataset(
        collection,
        TsvQueries(gz('orcas/queries'), namespace='orcas'),
        TrecQrels(gz('orcas/qrels'), ORCAS_QLRES_DEFS),
        TrecScoredDocs(gz('orcas/scoreddocs')),
    )

    # Query IDs that occur in the trec-dl-2019 qrels, computed on first use.
    dl19_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(dl19.queries_handler(), dl19_judged),
        FilteredScoredDocs(dl19.scoreddocs_handler(), dl19_judged),
        dl19,
    )

    ir_datasets.registry.register('msmarco-document', Dataset(collection, docs_yaml("_")))
    for name in sorted(subsets):
        ir_datasets.registry.register(f'msmarco-document/{name}', Dataset(subsets[name], docs_yaml(name)))

    return collection, subsets
def _init():
    """Build and register the dataset named ``NAME`` (MS MARCO v2 documents) and its subsets.

    Returns:
        A ``(collection, subsets)`` tuple: the document collection and a dict
        mapping each subset name to its ``Dataset``. Documentation is attached
        when each subset is registered.
    """
    home = ir_datasets.util.home_path() / NAME
    docs_yaml = YamlDocumentation(f'docs/{NAME}.yaml')
    downloads = DownloadConfig.context(NAME, home, dua=DUA)
    subsets = {}

    collection = MsMarcoV2Docs(downloads['docs'])

    def msmarco_queries(source):
        return TsvQueries(source, namespace='msmarco', lang='en')

    def judged_qids(subset_name):
        # Lazily-computed set of query IDs found in the named subset's qrels.
        return Lazy(lambda: {q.query_id for q in subsets[subset_name].qrels_iter()})

    # train / dev1 / dev2 use underscore-separated download keys.
    for split in ('train', 'dev1', 'dev2'):
        subsets[split] = Dataset(
            collection,
            msmarco_queries(downloads[f'{split}_queries']),
            TrecQrels(downloads[f'{split}_qrels'], QRELS_DEFS),
            TrecScoredDocs(GzipExtract(downloads[f'{split}_scoreddocs'])),
        )

    for year in ('2019', '2020'):
        subsets[f'trec-dl-{year}'] = Dataset(
            collection,
            msmarco_queries(GzipExtract(downloads[f'trec-dl-{year}/queries'])),
            TrecQrels(GzipExtract(downloads[f'trec_dl_{year}_qrels']), TREC_DL_QRELS_DEFS),
        )

    for year in ('2019', '2020'):
        parent = subsets[f'trec-dl-{year}']
        subsets[f'trec-dl-{year}/judged'] = Dataset(
            FilteredQueries(parent.queries_handler(), judged_qids(f'trec-dl-{year}')),
            parent,
        )

    dl21 = Dataset(
        collection,
        msmarco_queries(downloads['trec-dl-2021/queries']),
        TrecQrels(downloads['trec-dl-2021/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(downloads['trec-dl-2021/scoreddocs'])),
    )
    subsets['trec-dl-2021'] = dl21

    dl21_judged = judged_qids('trec-dl-2021')
    subsets['trec-dl-2021/judged'] = Dataset(
        FilteredQueries(dl21.queries_handler(), dl21_judged),
        FilteredScoredDocs(dl21.scoreddocs_handler(), dl21_judged),
        dl21,
    )

    anchor_source = Cache(GzipExtract(downloads['anchor-text']), home / "anchor-text.json")
    subsets['anchor-text'] = Dataset(
        MsMarcoV2AnchorTextDocs(anchor_source, count_hint=4821244),
        docs_yaml('anchor-text'))

    ir_datasets.registry.register(NAME, Dataset(collection, docs_yaml("_")))
    for name in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{name}', Dataset(subsets[name], docs_yaml(name)))

    return collection, subsets
def _init():
    """Build and register the dataset named ``NAME`` (mMARCO) and its per-language subsets.

    Two groups of languages are registered: the original group under
    ``<lang>/...`` and a second group under ``v2/<lang>/...``. All languages
    share the same training qrels (taken from the already-registered
    ``msmarco-passage/train``), training triples and dev qrels.

    Returns:
        A ``(collection, subsets)`` tuple. ``collection`` is the corpus built
        by the final loop iteration (``v2/vi``); ``subsets`` maps each subset
        name to its ``Dataset``.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    train_qrels = ir_datasets.registry['msmarco-passage/train'].qrels_handler()
    train_docpairs = TsvDocPairs(dlc['train/triples'])
    dev_qrels = TrecQrels(dlc['dev/qrels'], QRELS_DEFS)
    dev_small_qrels = TrecQrels(dlc['dev/qrels-small'], QRELS_DEFS)
    # Query IDs that occur in the small dev qrels, computed on first use.
    small_dev_qids = Lazy(lambda: {q.query_id for q in dev_small_qrels.qrels_iter()})

    for lang in ['es', 'fr', 'pt', 'it', 'id', 'de', 'ru', 'zh']:
        collection = TsvDocs(
            dlc[f'{lang}/docs'],
            namespace=f'mmarco/{lang}',
            lang=lang,
            count_hint=ir_datasets.util.count_hint(f'{NAME}/{lang}'))
        subsets[f'{lang}'] = Dataset(collection, documentation(f'{lang}'))
        subsets[f'{lang}/train'] = Dataset(
            collection,
            TsvQueries(dlc[f'{lang}/queries/train'], namespace=f'mmarco/{lang}', lang=lang),
            train_qrels,
            train_docpairs,
            documentation(f'{lang}/train'))
        subsets[f'{lang}/dev'] = Dataset(
            collection,
            TsvQueries(dlc[f'{lang}/queries/dev'], namespace=f'mmarco/{lang}', lang=lang),
            dev_qrels,
            documentation(f'{lang}/dev'))
        subsets[f'{lang}/dev/small'] = Dataset(
            collection,
            FilteredQueries(subsets[f'{lang}/dev'].queries_handler(), small_dev_qids, mode='include'),
            dev_small_qrels,
            # zh and pt get no scored docs here; theirs are attached to the v1.1 subsets below.
            TrecScoredDocs(dlc[f'{lang}/scoreddocs/dev']) if lang not in ('zh', 'pt') else None,
            documentation(f'{lang}/dev/small'))
        if lang in ('zh', 'pt'):
            subsets[f'{lang}/dev/v1.1'] = Dataset(
                collection,
                TsvQueries(dlc[f'{lang}/queries/dev/v1.1'], namespace=f'mmarco/{lang}', lang=lang),
                dev_qrels,
                documentation(f'{lang}/dev/v1.1'))
            subsets[f'{lang}/dev/small/v1.1'] = Dataset(
                collection,
                FilteredQueries(subsets[f'{lang}/dev/v1.1'].queries_handler(), small_dev_qids, mode='include'),
                dev_small_qrels,
                TrecScoredDocs(dlc[f'{lang}/scoreddocs/dev/v1.1']),
                # Fixed: this previously reused the '{lang}/dev/v1.1' documentation key.
                documentation(f'{lang}/dev/small/v1.1'))
        if lang in ('pt', ):
            subsets[f'{lang}/train/v1.1'] = Dataset(
                collection,
                TsvQueries(dlc[f'{lang}/queries/train/v1.1'], namespace=f'mmarco/{lang}', lang=lang),
                train_qrels,
                train_docpairs,
                documentation(f'{lang}/train/v1.1'))

    for lang in ['ar', 'zh', 'dt', 'fr', 'de', 'hi', 'id', 'it', 'ja', 'pt', 'ru', 'es', 'vi']:
        collection = TsvDocs(
            dlc[f'v2/{lang}/docs'],
            namespace=f'mmarco/{lang}',
            lang=lang,
            count_hint=ir_datasets.util.count_hint(f'{NAME}/v2/{lang}'))
        subsets[f'v2/{lang}'] = Dataset(collection, documentation(f'v2/{lang}'))
        # NOTE(review): the v2 namespaces are inconsistent ('mmarco/{lang}' for docs,
        # 'mmarco/v2/{lang}' for train queries, 'v2/mmarco/{lang}' for dev queries).
        # Left unchanged because downstream code may depend on them -- confirm intent.
        subsets[f'v2/{lang}/train'] = Dataset(
            collection,
            TsvQueries(dlc[f'v2/{lang}/queries/train'], namespace=f'mmarco/v2/{lang}', lang=lang),
            train_qrels,
            train_docpairs,
            documentation(f'v2/{lang}/train'))
        subsets[f'v2/{lang}/dev'] = Dataset(
            collection,
            TsvQueries(dlc[f'v2/{lang}/queries/dev'], namespace=f'v2/mmarco/{lang}', lang=lang),
            dev_qrels,
            documentation(f'v2/{lang}/dev'))
        subsets[f'v2/{lang}/dev/small'] = Dataset(
            collection,
            FilteredQueries(subsets[f'v2/{lang}/dev'].queries_handler(), small_dev_qids, mode='include'),
            dev_small_qrels,
            TrecScoredDocs(dlc[f'v2/{lang}/scoreddocs/dev'], negate_score=True),
            documentation(f'v2/{lang}/dev/small'))

    ir_datasets.registry.register(NAME, Dataset(documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return collection, subsets
def _init():
    """Build and register the dataset named ``NAME`` (MS MARCO documents) and its subsets.

    Returns:
        A ``(collection, subsets)`` tuple: the document collection and a dict
        mapping each subset name to its ``Dataset``. Documentation is attached
        when each subset is registered.
    """
    home = ir_datasets.util.home_path()/NAME
    docs_yaml = YamlDocumentation(f'docs/{NAME}.yaml')
    downloads = DownloadConfig.context(NAME, home, dua=DUA)
    subsets = {}

    def gz(key):
        # Every gzipped resource here is wrapped the same way.
        return GzipExtract(downloads[key])

    def msmarco_queries(source):
        return TsvQueries(source, namespace='msmarco', lang='en')

    def judged_qids(subset_name):
        # Lazily-computed set of query IDs found in the named subset's qrels.
        return Lazy(lambda: {q.query_id for q in subsets[subset_name].qrels_iter()})

    collection = MsMarcoTrecDocs(gz('docs'))

    for split in ('train', 'dev'):
        subsets[split] = Dataset(
            collection,
            msmarco_queries(gz(f'{split}/queries')),
            TrecQrels(gz(f'{split}/qrels'), QRELS_DEFS),
            TrecScoredDocs(gz(f'{split}/scoreddocs')),
        )

    subsets['eval'] = Dataset(
        collection,
        msmarco_queries(gz('eval/queries')),
        TrecScoredDocs(gz('eval/scoreddocs')),
    )

    for year in ('2019', '2020'):
        subsets[f'trec-dl-{year}'] = Dataset(
            collection,
            msmarco_queries(gz(f'trec-dl-{year}/queries')),
            TrecQrels(downloads[f'trec-dl-{year}/qrels'], TREC_DL_QRELS_DEFS),
            TrecScoredDocs(gz(f'trec-dl-{year}/scoreddocs')),
        )

    subsets['orcas'] = Dataset(
        collection,
        TsvQueries(gz('orcas/queries'), namespace='orcas', lang='en'),
        TrecQrels(gz('orcas/qrels'), ORCAS_QLRES_DEFS),
        TrecScoredDocs(gz('orcas/scoreddocs')),
    )

    for year in ('2019', '2020'):
        parent = subsets[f'trec-dl-{year}']
        year_judged = judged_qids(f'trec-dl-{year}')
        subsets[f'trec-dl-{year}/judged'] = Dataset(
            FilteredQueries(parent.queries_handler(), year_judged),
            FilteredScoredDocs(parent.scoreddocs_handler(), year_judged),
            parent,
        )

    # DL-Hard
    dl_hard_qrels_migrator = Migrator(
        home/'trec-dl-hard'/'irds_version.txt', 'v2',
        affected_files=[home/'trec-dl-hard'/'qrels'],
        message='Updating trec-dl-hard qrels')
    dl_hard_base_queries = msmarco_queries([
        Cache(gz('trec-dl-2019/queries'), home/'trec-dl-2019/queries.tsv'),
        Cache(gz('trec-dl-2020/queries'), home/'trec-dl-2020/queries.tsv'),
    ])

    all_hard_qids = Lazy(lambda: DL_HARD_QIDS)
    subsets['trec-dl-hard'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, all_hard_qids),
        dl_hard_qrels_migrator(TrecQrels(downloads['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        docs_yaml('trec-dl-hard')
    )

    def fold_qids(fold):
        # Bind `fold` through a function scope so each Lazy sees its own fold key.
        return Lazy(lambda: DL_HARD_QIDS_BYFOLD[fold])

    for fold in ('1', '2', '3', '4', '5'):
        hard_qids = fold_qids(fold)
        subsets[f'trec-dl-hard/fold{fold}'] = Dataset(
            collection,
            FilteredQueries(dl_hard_base_queries, hard_qids),
            FilteredQrels(subsets['trec-dl-hard'], hard_qids),
            docs_yaml(f'trec-dl-hard/fold{fold}')
        )

    anchor_source = Cache(gz('anchor-text'), home / "anchor-text.json")
    subsets['anchor-text'] = Dataset(
        MsMarcoAnchorTextDocs(anchor_source, count_hint=1703834),
        docs_yaml('anchor-text')
    )

    ir_datasets.registry.register(NAME, Dataset(collection, docs_yaml("_")))
    for name in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{name}', Dataset(subsets[name], docs_yaml(name)))

    return collection, subsets
def _init():
    """Build and register the ``msmarco-passage`` dataset and its subsets.

    Returns:
        A ``(collection, subsets)`` tuple: the passage collection and a dict
        mapping each subset name to its ``Dataset``. Documentation is attached
        when each subset is registered.
    """
    documentation = YamlDocumentation('docs/msmarco-passage.yaml')
    base_path = ir_datasets.util.home_path() / 'msmarco-passage'
    dlc = DownloadConfig.context('msmarco-passage', base_path, dua=DUA)

    collection_tsv = Cache(
        FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')),
        base_path / 'collection.tsv')
    collection = TsvDocs(collection_tsv, namespace='msmarco')
    subsets = {}

    def msmarco_queries(source):
        return TsvQueries(source, namespace='msmarco')

    def qidpid_run(source, cache_name):
        # Scored docs passed through ExtractQidPid and cached under base_path.
        return TrecScoredDocs(Cache(ExtractQidPid(source), base_path / cache_name))

    def judged_qids(subset_name):
        # Lazily-computed set of query IDs found in the named subset's qrels.
        return Lazy(lambda: {q.query_id for q in subsets[subset_name].qrels_iter()})

    subsets['train'] = Dataset(
        collection,
        msmarco_queries(Cache(TarExtract(dlc['queries'], 'queries.train.tsv'), base_path / 'train/queries.tsv')),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        qidpid_run(TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt'), 'train/ms.run'),
    )
    train = subsets['train']

    subsets['dev'] = Dataset(
        collection,
        msmarco_queries(Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'), base_path / 'dev/queries.tsv')),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
        qidpid_run(TarExtract(dlc['dev/scoreddocs'], 'top1000.dev'), 'dev/ms.run'),
    )

    subsets['dev/small'] = Dataset(
        collection,
        msmarco_queries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'),
            base_path / 'dev/small/queries.tsv')),
        TrecQrels(
            Cache(TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'), base_path / 'dev/small/qrels'),
            QRELS_DEFS),
    )

    subsets['eval'] = Dataset(
        collection,
        msmarco_queries(Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'), base_path / 'eval/queries.tsv')),
        qidpid_run(TarExtract(dlc['eval/scoreddocs'], 'top1000.eval'), 'eval/ms.run'),
    )

    subsets['eval/small'] = Dataset(
        collection,
        msmarco_queries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'),
            base_path / 'eval/small/queries.tsv')),
    )

    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        msmarco_queries(Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path / 'trec-dl-2019/queries.tsv')),
        qidpid_run(GzipExtract(dlc['trec-dl-2019/scoreddocs']), 'trec-dl-2019/ms.run'),
    )

    # No qrels component is attached to trec-dl-2020 in this function.
    subsets['trec-dl-2020'] = Dataset(
        collection,
        msmarco_queries(GzipExtract(dlc['trec-dl-2020/queries'])),
        qidpid_run(GzipExtract(dlc['trec-dl-2020/scoreddocs']), 'trec-dl-2020/ms.run'),
    )

    # A few subsets that are constrained to just the queries/qrels/docpairs that have at least
    # 1 relevance assessment
    for parent_name in ('train', 'dev', 'trec-dl-2019'):
        parent = subsets[parent_name]
        parent_judged = judged_qids(parent_name)
        subsets[f'{parent_name}/judged'] = Dataset(
            FilteredQueries(parent.queries_handler(), parent_judged),
            FilteredScoredDocs(parent.scoreddocs_handler(), parent_judged),
            parent,
        )

    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    for split_name, mode in (('train/split200-train', 'exclude'), ('train/split200-valid', 'include')):
        subsets[split_name] = Dataset(
            FilteredQueries(train.queries_handler(), split200, mode=mode),
            FilteredScoredDocs(train.scoreddocs_handler(), split200, mode=mode),
            FilteredQrels(train.qrels_handler(), split200, mode=mode),
            FilteredDocPairs(train.docpairs_handler(), split200, mode=mode),
            train,
        )

    # Medical subset
    def load_medical_qids():
        # One ID per line of the 'medmarco_ids' download, trailing whitespace removed.
        with dlc['medmarco_ids'].stream() as raw:
            reader = codecs.getreader('utf8')(raw)
            return {line.rstrip() for line in reader}
    train_med = Lazy(load_medical_qids)

    subsets['train/medical'] = Dataset(
        FilteredQueries(train.queries_handler(), train_med),
        FilteredScoredDocs(train.scoreddocs_handler(), train_med),
        FilteredDocPairs(train.docpairs_handler(), train_med),
        FilteredQrels(train.qrels_handler(), train_med),
        train,
    )

    ir_datasets.registry.register('msmarco-passage', Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'msmarco-passage/{s}', Dataset(subsets[s], documentation(s)))

    return collection, subsets
def _init():
    """Build and register the dataset named ``NAME`` (TREC CAsT) and its subsets.

    The document collections are assembled from other registered datasets
    (``wapo/v2``, ``msmarco-passage`` and ``car/v2.0``), with each source's
    document IDs given a source-specific prefix.

    Returns:
        A ``(base, subsets)`` tuple: the documentation-only base dataset and
        a dict mapping each subset name to its ``Dataset``.
    """
    subsets = {}
    home = ir_datasets.util.home_path() / NAME
    downloads = DownloadConfig.context(NAME, home)
    docs_yaml = YamlDocumentation(f'docs/{NAME}.yaml')

    def wapo_converter(dsid):
        """Return a generator function yielding one GenericDoc per WAPO paragraph."""
        def wrapped():
            BeautifulSoup = ir_datasets.lazy_libs.bs4().BeautifulSoup
            # NOTE: The rules below are deliberately narrow so that the output replicates the
            # official conversion script:
            # <https://github.com/grill-lab/trec-cast-tools/blob/8fa243a7e058ce4b1b378c99768c53546460c0fe/src/main/python/wapo_trecweb.py>
            # Skipping empty documents, keeping only the "paragraph" subtype, and numbering
            # paragraphs from 1 are all required to match that script exactly.
            # The script itself does NOT strip HTML markup; that is meant to happen at a later
            # stage (e.g., indexing). It is stripped here for user simplicity, so the text can be
            # fed directly to various models without further pre-processing (at the cost of
            # losing a bit of information).
            raw_docs = ir_datasets.load(dsid).docs_handler().docs_wapo_raw_iter()
            for wapo_doc in raw_docs:
                doc_id = wapo_doc['id']
                pid = itertools.count(1)  # paragraph index starts at 1
                for paragraph in wapo_doc['contents']:
                    if paragraph is None or paragraph.get('subtype') != 'paragraph':
                        continue
                    if paragraph['content'] == '':
                        continue
                    text = paragraph['content']
                    if paragraph.get('mime') == 'text/html':
                        soup = BeautifulSoup(f'<OUTER>{text}</OUTER>', 'lxml-xml')
                        text = soup.get_text()
                    yield GenericDoc(f'WAPO_{doc_id}-{next(pid)}', text)
        return wrapped

    def prefixer(dsid, prefix):
        """Return a generator function re-emitting ``dsid`` docs with ``prefix_`` added to IDs."""
        def wrapped():
            for doc in ir_datasets.load(dsid).docs_iter():
                yield GenericDoc(f'{prefix}_{doc.doc_id}', doc.text)
        return wrapped

    WAPO_v2 = wapo_converter('wapo/v2')
    MARCO = prefixer('msmarco-passage', 'MARCO')
    CAR = prefixer('car/v2.0', 'CAR')

    marco_source = ('MARCO', MARCO, downloads['marco_dupes'])
    car_source = ('CAR', CAR, None)
    wapo_source = ('WAPO', WAPO_v2, downloads['wapo_dupes'])

    # v0 includes the WAPO paragraphs; v1 is built from MARCO and CAR only.
    docs_v0 = CastDocs('docs_v0', [wapo_source, marco_source, car_source])
    docs_v1 = CastDocs('docs_v1', [marco_source, car_source])

    base = Dataset(docs_yaml('_'))

    subsets['v0'] = Dataset(docs_v0)

    train_v0 = Dataset(
        docs_v0,
        CastQueries(downloads['2019/train/queries'], Cast2019Query),
        TrecQrels(downloads['2019/train/qrels'], QRELS_DEFS_TRAIN),
        TrecScoredDocs(downloads['2019/train/scoreddocs']))
    subsets['v0/train'] = train_v0
    qids_train_v0 = Lazy(lambda: {q.query_id for q in subsets['v0/train'].qrels_iter()})
    subsets['v0/train/judged'] = Dataset(
        docs_v0,
        FilteredQueries(train_v0.queries_handler(), qids_train_v0),
        train_v0.qrels_handler(),
        FilteredScoredDocs(train_v0.scoreddocs_handler(), qids_train_v0),
    )

    subsets['v1'] = Dataset(docs_v1)

    eval_2019 = Dataset(
        docs_v1,
        CastQueries(downloads['2019/eval/queries'], Cast2019Query),
        TrecQrels(downloads['2019/eval/qrels'], QRELS_DEFS),
        TrecScoredDocs(downloads['2019/eval/scoreddocs']))
    subsets['v1/2019'] = eval_2019
    qids_2019 = Lazy(lambda: {q.query_id for q in subsets['v1/2019'].qrels_iter()})
    subsets['v1/2019/judged'] = Dataset(
        docs_v1,
        FilteredQueries(eval_2019.queries_handler(), qids_2019),
        eval_2019.qrels_handler(),
        FilteredScoredDocs(eval_2019.scoreddocs_handler(), qids_2019),
    )

    eval_2020 = Dataset(
        docs_v1,
        CastQueries(downloads['2020/queries'], Cast2020Query),
        TrecQrels(downloads['2020/qrels'], QRELS_DEFS),
    )
    subsets['v1/2020'] = eval_2020
    qids_2020 = Lazy(lambda: {q.query_id for q in subsets['v1/2020'].qrels_iter()})
    subsets['v1/2020/judged'] = Dataset(
        docs_v1,
        FilteredQueries(eval_2020.queries_handler(), qids_2020),
        eval_2020.qrels_handler(),
    )

    ir_datasets.registry.register(NAME, base)
    for name in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{name}', Dataset(subsets[name], docs_yaml(name)))

    return base, subsets
def _init():
    """Construct the BEIR datasets and register them with ir_datasets.

    Every benchmark gets a top-level dataset. A benchmark that lists exactly
    one qrels split attaches those qrels directly to its top-level dataset;
    a benchmark with several splits gets a qrels-free top-level dataset plus
    one ``<benchmark>/<split>`` subset per split, whose queries are filtered
    (``mode='include'``) with ``qid_filter`` applied to that split's qrels.
    Each CQADupStack forum is added as ``cqadupstack/<forum>`` with its
    ``test`` qrels.

    Returns:
        A ``(base, subsets)`` tuple: the documentation-only base dataset and
        a dict mapping each subset key to its ``Dataset``.
    """
    root = ir_datasets.util.home_path() / NAME
    downloads = ir_datasets.util.DownloadConfig.context(NAME, root)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base = Dataset(documentation('_'))
    subsets = {}

    def new_docs_migrator(data_dir, key):
        # One migrator per dataset directory, tracking its docs.pklz4 file.
        return Migrator(
            data_dir / 'irds_version.txt',
            'v2',
            affected_files=[f'{data_dir}/docs.pklz4'],
            message=f'Migrating {NAME}/{key} (structuring fields)')

    def load_queries(key, archive, zip_root, data_dir, query_type):
        # queries.jsonl is pulled from the zip and cached as queries.json.
        extracted = ZipExtract(archive, f'{zip_root}/queries.jsonl')
        cached = Cache(extracted, data_dir / 'queries.json')
        return BeirQueries(key, cached, query_type)

    def load_qrels(archive, zip_root, data_dir, split):
        # qrels/<split>.tsv is pulled from the zip and cached as <split>.qrels.
        extracted = ZipExtract(archive, f'{zip_root}/qrels/{split}.tsv')
        cached = Cache(extracted, data_dir / f'{split}.qrels')
        return BeirQrels(cached, qrels_defs={})

    # benchmark key -> (qrels splits, document type, query type)
    benchmarks = {
        'msmarco': (['train', 'dev', 'test'], GenericDoc, GenericQuery),
        'trec-covid': (['test'], BeirCordDoc, BeirCovidQuery),
        'nfcorpus': (['train', 'dev', 'test'], BeirTitleUrlDoc, BeirUrlQuery),
        'nq': (['test'], BeirTitleDoc, GenericQuery),
        'hotpotqa': (['train', 'dev', 'test'], BeirTitleUrlDoc, GenericQuery),
        'fiqa': (['train', 'dev', 'test'], GenericDoc, GenericQuery),
        'arguana': (['test'], BeirTitleDoc, GenericQuery),
        'webis-touche2020': (['test'], BeirToucheDoc, BeirToucheQuery),
        'webis-touche2020/v2': (['test'], BeirToucheDoc, BeirToucheQuery),
        'quora': (['dev', 'test'], GenericDoc, GenericQuery),
        'dbpedia-entity': (['dev', 'test'], BeirTitleUrlDoc, GenericQuery),
        'scidocs': (['test'], BeirSciDoc, BeirSciQuery),
        'fever': (['train', 'dev', 'test'], BeirTitleDoc, GenericQuery),
        'climate-fever': (['test'], BeirTitleDoc, GenericQuery),
        'scifact': (['train', 'test'], BeirTitleDoc, GenericQuery),
    }

    for name, (splits, doc_type, query_type) in benchmarks.items():
        archive = downloads[name]
        # Paths inside the zip use only the first segment of the key
        # (e.g. 'webis-touche2020/v2' -> 'webis-touche2020').
        zip_root = name.split('/')[0]
        data_dir = root / name

        migrate = new_docs_migrator(data_dir, name)
        docs = migrate(
            BeirDocs(
                name,
                ZipExtract(archive, f'{zip_root}/corpus.jsonl'),
                doc_type))
        queries = load_queries(name, archive, zip_root, data_dir, query_type)

        if len(splits) == 1:
            # Single split: its qrels belong to the top-level dataset.
            subsets[name] = Dataset(
                docs,
                queries,
                load_qrels(archive, zip_root, data_dir, splits[0]),
                documentation(name))
            continue

        # Several splits: qrels-free parent plus one subset per split.
        subsets[name] = Dataset(docs, queries, documentation(name))
        for split in splits:
            split_qrels = load_qrels(archive, zip_root, data_dir, split)
            keep_qids = qid_filter(split_qrels)
            subsets[f'{name}/{split}'] = Dataset(
                docs,
                FilteredQueries(queries, keep_qids, mode='include'),
                split_qrels,
                documentation(f'{name}/{split}'))

    forums = [
        'android', 'english', 'gaming', 'gis', 'mathematica', 'physics',
        'programmers', 'stats', 'tex', 'unix', 'webmasters', 'wordpress',
    ]
    cqa_archive = downloads['cqadupstack']
    for forum in forums:
        key = f'cqadupstack/{forum}'
        data_dir = root / 'cqadupstack' / forum

        migrate = new_docs_migrator(data_dir, key)
        docs = migrate(
            BeirDocs(
                key,
                ZipExtract(cqa_archive, f'{key}/corpus.jsonl'),
                BeirCqaDoc))
        queries = load_queries(key, cqa_archive, key, data_dir, BeirCqaQuery)
        qrels = load_qrels(cqa_archive, key, data_dir, 'test')
        subsets[key] = Dataset(docs, queries, qrels, documentation(key))

    ir_datasets.registry.register(NAME, base)
    for key in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{key}', subsets[key])

    return base, subsets