def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base = Dataset(documentation('_'))  # dummy top level ds
    for lang in ['zh', 'fa', 'ru']:
        lang_docs = HC4Docs(dlc[f'{lang}/docs'], subset_lang=lang)
        subsets[lang] = Dataset(
            lang_docs,
            documentation(lang)
        )
        for sep in ['train', 'dev', 'test']:
            subsets[f'{lang}/{sep}'] = Dataset(
                lang_docs,
                HC4Queries(dlc[f'{sep}/topics'], subset_lang=lang),
                TrecQrels(dlc[f'{lang}/{sep}/qrels'], QREL_DEFS),
                documentation(f'{lang}/{sep}'),
            )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
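# Usage sketch (not part of the module above): how a dataset registered by
# _init() is typically consumed through the ir_datasets public API. The id
# 'hc4/zh/train' assumes NAME == 'hc4'; substitute whatever NAME resolves to.
import ir_datasets

dataset = ir_datasets.load('hc4/zh/train')
for query in dataset.queries_iter():
    ...  # namedtuple; fields depend on the query class (here HC4Queries)
for qrel in dataset.qrels_iter():
    ...  # carries query_id, doc_id, and relevance fields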
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    en_noclean_tr_collection = C4Docs(
        GzipExtract(dlc['en-noclean/sources']),
        TarExtractAll(dlc['en-noclean/checkpoints'], base_path / 'en.noclean.checkpoints'),
        base_path,
        source_name_filter=r'en\.noclean\.c4-train',
        filter_name='train')  # exclude validation files (only include train)
    base = Dataset(documentation('_'))
    subsets['en-noclean-tr'] = Dataset(en_noclean_tr_collection, documentation('en-noclean-tr'))
    subsets['en-noclean-tr/trec-misinfo-2021'] = Dataset(
        en_noclean_tr_collection,
        TrecXmlQueries(dlc['trec-misinfo-2021/queries'], qtype=MisinfoQuery, qtype_map=misinfo_map, namespace='trec-misinfo', lang='en'),
        documentation('en-noclean-tr/trec-misinfo-2021'))
    ir_datasets.registry.register(NAME, base)
    for subset in subsets:
        ir_datasets.registry.register(f'{NAME}/{subset}', subsets[subset])
    return base, subsets
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    collection = HighwireDocs(dlc, dlc['legalspans'])
    base = Dataset(collection, documentation('_'))
    subsets['trec-genomics-2006'] = Dataset(
        collection,
        TrecGenomicsQueries(dlc['trec-genomics-2006/queries']),
        HighwireQrels(dlc['trec-genomics-2006/qrels'], QREL_DEFS_06),
        documentation('trec-genomics-2006'),
    )
    subsets['trec-genomics-2007'] = Dataset(
        collection,
        TrecGenomicsQueries(dlc['trec-genomics-2007/queries']),
        HighwireQrels(dlc['trec-genomics-2007/qrels'], QREL_DEFS_07),
        documentation('trec-genomics-2007'),
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = TrecDocs(dlc['docs'], encoding='utf8', path_globs=[
        'aquaint_comp/apw/*/*.gz',
        'aquaint_comp/nyt/*/*.gz',
        'aquaint_comp/xie/*/*.gz'
    ], namespace=NAME, lang='en', count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(collection, documentation('_'))
    subsets['trec-robust-2005'] = Dataset(
        TrecQueries(dlc['trec-robust-2005/queries'], qtype_map=QTYPE_MAP, namespace='trec-robust', lang='en'),
        TrecQrels(dlc['trec-robust-2005/qrels'], QREL_DEFS),
        collection,
        documentation('trec-robust-2005'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    base_path = ir_datasets.util.home_path()/NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    manager = NqManager(dlc, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = DocstoreBackedDocs(manager.docs_store, docs_cls=NqPassageDoc, namespace=NAME, lang='en')
    base = Dataset(
        collection,
        documentation('_'))
    subsets = {}
    subsets['train'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('train.queries.tsv'), namespace=NAME, lang='en'),
        NqQrels(manager.file_ref('train.qrels.jsonl')),
        NqScoredDocs(manager.file_ref('train.scoreddocs.tsv')),
        documentation('train'),
    )
    subsets['dev'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('dev.queries.tsv'), namespace=NAME, lang='en'),
        NqQrels(manager.file_ref('dev.qrels.jsonl')),
        NqScoredDocs(manager.file_ref('dev.scoreddocs.tsv')),
        documentation('dev'),
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
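# Usage sketch: because the collection above is docstore-backed, documents also
# support random access by doc_id (the id below is a made-up placeholder, and
# the dataset id assumes NAME == 'natural-questions').
import ir_datasets

dataset = ir_datasets.load('natural-questions/dev')
docstore = dataset.docs_store()
doc = docstore.get('some_doc_id')  # hypothetical id; returns an NqPassageDoc namedtuple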
def _init():
    subsets = {}
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    collection = TrecDocs(dlc['docs'], encoding='GB18030', path_globs=['**/xinhua/x*', '**/peoples-daily/pd*'], namespace=NAME, lang='zh', count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(collection, documentation('_'))
    subsets['trec5'] = Dataset(
        TrecQueries(GzipExtract(dlc['trec5/queries']), qtype=TrecMandarinQuery, qtype_map=QTYPE_MAP, encoding='GBK', namespace=NAME, lang=None),  # queries have multiple languages
        TrecQrels(GzipExtract(dlc['trec5/qrels']), QREL_DEFS),
        collection,
        documentation('trec5'))
    subsets['trec6'] = Dataset(
        TrecQueries(GzipExtract(dlc['trec6/queries']), qtype=TrecMandarinQuery, qtype_map=QTYPE_MAP, encoding='GBK', namespace=NAME, lang=None),  # queries have multiple languages
        TrecQrels(GzipExtract(dlc['trec6/qrels']), QREL_DEFS),
        collection,
        documentation('trec6'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = NytDocs(dlc['source'])
    base = Dataset(collection, documentation('_'))
    all_queries = NytQueries(collection)
    all_qrels = NytQrels(collection)
    match_qids = Lazy(lambda: VALID_IDS)
    subsets['train'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='exclude'),
        FilteredQrels(all_qrels, match_qids, mode='exclude'),
        collection,
        documentation('train'))
    subsets['valid'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='include'),
        FilteredQrels(all_qrels, match_qids, mode='include'),
        collection,
        documentation('valid'))
    ir_datasets.registry.register('nyt', base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'nyt/{s}', subsets[s])
    return base, subsets
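# A minimal sketch of the Lazy wrapper used for match_qids above, under the
# assumed semantics that the wrapped callable runs once, on first use, and the
# result is cached; this is an illustration, not the library's implementation.
class LazySketch:
    def __init__(self, fn):
        self._fn = fn
        self._result = None
        self._loaded = False

    def __call__(self):
        if not self._loaded:
            self._result = self._fn()  # defer the (possibly expensive) work
            self._loaded = True
        return self._result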
def _init():
    base_path = ir_datasets.util.home_path()/NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = FairTrecDocs(GzipExtract(dlc["docs"]), GzipExtract(dlc["metadata"]))
    base = Dataset(
        collection,
        documentation('_'))
    subsets = {}
    train_topics = GzipExtract(dlc["train/topics"])
    subsets['train'] = Dataset(
        collection,
        FairTrecQueries(train_topics, FairTrecQuery),
        FairTrecQrels(train_topics),
        documentation('train'))
    subsets['eval'] = Dataset(
        collection,
        FairTrecQueries(GzipExtract(dlc['eval/topics']), FairTrecEvalQuery),
        documentation('eval'))
    ir_datasets.registry.register(NAME, base)
    for s in subsets:
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    collection = TrecDocs(dlc['docs'], path_globs=[
        '**/FBIS/FB*',
        '**/FR94/??/FR*',
        '**/FT/*/FT*',
        '**/LATIMES/LA*'
    ], namespace=NAME, lang='en', expected_file_count=2295, count_hint=ir_datasets.util.count_hint(NAME))
    queries = TrecQueries(GzipExtract(dlc['queries']), namespace=NAME, lang='en')
    qrels = TrecQrels(dlc['qrels'], QREL_DEFS)
    base = Dataset(collection, queries, qrels, documentation('_'))
    for fold in FOLDS:
        qid_filter = make_filter(fold)
        subsets[fold] = Dataset(
            FilteredQueries(queries, qid_filter),
            FilteredQrels(qrels, qid_filter),
            collection,
            documentation(fold))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
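# Usage sketch: each fold registered above becomes its own dataset id, so
# cross-validation code can load folds directly (assuming NAME == 'trec-robust04'
# and FOLDS contains ids like 'fold1').
import ir_datasets

fold = ir_datasets.load('trec-robust04/fold1')
fold_qids = {q.query_id for q in fold.queries_iter()}  # restricted by the fold's qid filter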
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = TrecDocs(
        dlc['docs'],
        encoding='utf8',
        path_globs=['arabic_newswire_a/transcripts/*/*.sgm.gz'],
        namespace=NAME)
    base = Dataset(collection, documentation('_'))
    subsets['ar2001'] = Dataset(
        TrecQueries(dlc['ar2001/queries'], qtype_map=QTYPE_MAP, encoding='ISO-8859-6', namespace=NAME),
        TrecQrels(dlc['ar2001/qrels'], QREL_DEFS),
        collection,
        documentation('ar2001'))
    subsets['ar2002'] = Dataset(
        TrecQueries(dlc['ar2002/queries'], qtype_map=QTYPE_MAP, encoding='ISO-8859-6', namespace=NAME),
        TrecQrels(dlc['ar2002/qrels'], QREL_DEFS),
        collection,
        documentation('ar2002'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    base_path = ir_datasets.util.home_path() / NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = TsvDocs(GzipExtract(dlc['docs']), doc_cls=DprW100Doc, namespace=NAME, lang='en', skip_first_line=True, docstore_size_hint=12827215492, count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(collection, documentation('_'))
    subsets = {}
    nq_dev_manager = DprW100Manager(GzipExtract(dlc['nq-dev']), base_path / 'nq-dev')
    subsets['natural-questions/dev'] = Dataset(
        collection,
        DprW100Queries(nq_dev_manager.file_ref('queries.tsv')),
        TrecQrels(nq_dev_manager.file_ref('qrels'), QREL_DEFS),
        documentation('natural-questions/dev'))
    nq_train_manager = DprW100Manager(GzipExtract(dlc['nq-train']), base_path / 'nq-train')
    subsets['natural-questions/train'] = Dataset(
        collection,
        DprW100Queries(nq_train_manager.file_ref('queries.tsv')),
        TrecQrels(nq_train_manager.file_ref('qrels'), QREL_DEFS),
        documentation('natural-questions/train'))
    tqa_dev_manager = DprW100Manager(GzipExtract(dlc['tqa-dev']), base_path / 'tqa-dev', passage_id_key='psg_id')
    subsets['trivia-qa/dev'] = Dataset(
        collection,
        DprW100Queries(tqa_dev_manager.file_ref('queries.tsv')),
        TrecQrels(tqa_dev_manager.file_ref('qrels'), QREL_DEFS),
        documentation('trivia-qa/dev'))
    tqa_train_manager = DprW100Manager(GzipExtract(dlc['tqa-train']), base_path / 'tqa-train', passage_id_key='psg_id')
    subsets['trivia-qa/train'] = Dataset(
        collection,
        DprW100Queries(tqa_train_manager.file_ref('queries.tsv')),
        TrecQrels(tqa_train_manager.file_ref('qrels'), QREL_DEFS),
        documentation('trivia-qa/train'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _initializer(dsid, args, dlc_context=None):
    docs_lang, queries_lang, split = args
    docs = _docs_initializer(docs_lang)
    components = [docs]
    if queries_lang:  # queries & split are optional
        # base_path, metadata, _dlc, and _docs_initializer are module-level helpers
        dlc = _dlc().context(dlc_context, base_path)
        dlc_key = f'queries/{queries_lang}_{docs_lang}/{split}'
        qrel_dlc = GzipExtract(dlc[dlc_key])
        qrels = CLIRMatrixQrels(qrel_dlc, QRELS_DEFS)
        queries = CLIRMatrixQueries(qrel_dlc, queries_lang)
        components += [queries, qrels]
    result = Dataset(*components)
    result = Dataset(MetadataComponent(dsid, result, metadata), result)
    return result
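# Hypothetical call, assuming args is the (docs_lang, queries_lang, split)
# triple unpacked above; the dsid string here is illustrative only.
ds = _initializer('clirmatrix/zh/en/train', ('zh', 'en', 'train'))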
def _init():
    base_path = home_path() / NAME
    documentation = YamlDocumentation(f"docs/{NAME}.yaml")
    download_config = DownloadConfig.context(NAME, base_path)
    base = Dataset(documentation('_'))
    # Arguments that can be loaded from Zenodo.
    arguments: Dict[str, ArgsMeDocs] = {
        name: ArgsMeDocs(
            Cache(ZipExtract(download_config[name], zip_path), base_path / f"{name}.json"),
            namespace=f"{NAME}/{name}",
            language=language,
            count_hint=count_hint)
        for name, (count_hint, language, zip_path) in SUBSETS.items()
    }
    # Arguments that are combined versions of other subsets.
    combined_arguments: Dict[str, ArgsMeCombinedArguments] = {
        name: ArgsMeCombinedArguments(
            base_path / f"{name}.json",
            [arguments[subset_name] for subset_name in subset_names],
            namespace=f"{NAME}/{name}",
            language=language,
            count_hint=count_hint)
        for name, (subset_names, count_hint, language) in COMBINED_SUBSETS.items()
    }
    # Wrap in datasets with documentation.
    datasets = {
        name: Dataset(arguments, documentation(name))
        for name, arguments in chain(arguments.items(), combined_arguments.items())
    }
    # NOTE: the following datasets are defined in touche.py:
    # - argsme/1.0/touche-2020-task-1/uncorrected
    # - argsme/2020-04-01/touche-2020-task-1
    # - argsme/2020-04-01/touche-2020-task-1/uncorrected
    # - argsme/2020-04-01/touche-2021-task-1
    # Register datasets.
    registry.register(NAME, base)
    for name, arguments in datasets.items():
        registry.register(f'{NAME}/{name}', arguments)
    return base, datasets
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    langs = ['python', 'java', 'go', 'php', 'ruby', 'javascript']
    dlcs = {lang: ZipExtractCache(dlc[lang], base_path / lang) for lang in langs}
    all_dlcs = [dlcs[lang] for lang in langs]
    base = Dataset(
        CodeSearchNetDocs(all_dlcs),
        documentation('_'),
    )
    subsets['train'] = Dataset(
        CodeSearchNetDocs(all_dlcs),
        CodeSearchNetQueries(all_dlcs, 'train'),
        CodeSearchNetQrels(all_dlcs, 'train'),
        documentation('train'),
    )
    subsets['valid'] = Dataset(
        CodeSearchNetDocs(all_dlcs),
        CodeSearchNetQueries(all_dlcs, 'valid'),
        CodeSearchNetQrels(all_dlcs, 'valid'),
        documentation('valid'),
    )
    subsets['test'] = Dataset(
        CodeSearchNetDocs(all_dlcs),
        CodeSearchNetQueries(all_dlcs, 'test'),
        CodeSearchNetQrels(all_dlcs, 'test'),
        documentation('test'),
    )
    challenge_queries = CodeSearchNetChallengeQueries(dlc['challenge/queries'])
    subsets['challenge'] = Dataset(
        CodeSearchNetDocs(all_dlcs),
        challenge_queries,
        CodeSearchNetChallengeQrels(dlc['challenge/qrels'], challenge_queries),
        documentation('challenge'),
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    subsets = {}
    sources = [
        ('en1k', 'wikIR1k'),
        ('en59k', 'wikIR59k'),
        ('en78k', 'enwikIR'),
        ('ens78k', 'enwikIRS'),
        ('fr14k', 'FRwikIR14k'),
        ('es13k', 'ESwikIR13k'),
        ('it16k', 'ITwikIR16k'),
    ]
    for source, zip_dir_name in sources:
        source_dlc = ZipExtractCache(dlc[source], base_path / source)
        docs = CsvDocs(
            RelativePath(source_dlc, f"{zip_dir_name}/documents.csv"),
            namespace=source,
            lang=source[:2],
            count_hint=ir_datasets.util.count_hint(f'{NAME}/{source}'),
            docstore_path=ir_datasets.util.home_path() / NAME / f'{source}.pklz4')
        subsets[source] = Dataset(docs, documentation(source))
        for split in ['training', 'validation', 'test']:
            subsets[f'{source}/{split}'] = Dataset(
                docs,
                CsvQueries(RelativePath(source_dlc, f"{zip_dir_name}/{split}/queries.csv"), lang=source[:2]),
                TrecQrels(RelativePath(source_dlc, f"{zip_dir_name}/{split}/qrels"), qrels_defs=QRELS_DEFS),
                TrecScoredDocs(RelativePath(source_dlc, f"{zip_dir_name}/{split}/BM25.res")),
                documentation(f'{source}/{split}'))
    base = Dataset(documentation('_'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    docs_dlc = dlc['docs']
    docs_chk_dlc = TarExtractAll(dlc['docs.chk'], base_path/'corpus.chk')
    b13_dlc = Bz2Extract(Cache(TarExtract(dlc['cw12b-info'], 'ClueWeb12-CreateB13/software/CreateClueWeb12B13Dataset.jar'), base_path/'CreateClueWeb12B13Dataset.jar'))
    collection = ClueWeb12Docs(docs_dlc, docs_chk_dlc)
    collection_b13 = ClueWeb12Docs(ClueWeb12b13Extractor(docs_dlc, b13_dlc))
    base = Dataset(collection, documentation('_'))
    subsets['b13'] = Dataset(collection_b13, documentation('b13'))
    subsets['trec-web-2013'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2013/queries'], qtype=TrecWebTrackQuery, namespace=NAME),
        TrecQrels(dlc['trec-web-2013/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2013'))
    subsets['trec-web-2014'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2014/queries'], qtype=TrecWebTrackQuery, namespace=NAME),
        TrecQrels(dlc['trec-web-2014/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2014'))
    subsets['b13/ntcir-www-1'] = Dataset(
        collection_b13,
        TrecXmlQueries(Cache(ZipExtract(dlc['ntcir-www-1/queries'], 'eng.queries.xml'), base_path/'ntcir-www-1'/'queries.xml'), qtype=GenericQuery, qtype_map={'qid': 'query_id', 'content': 'text'}, namespace=NAME),
        NtcirQrels(dlc['ntcir-www-1/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-1'))
    subsets['b13/ntcir-www-2'] = Dataset(
        collection_b13,
        TrecXmlQueries(Cache(ZipExtract(dlc['ntcir-www-2/queries'], 'qEng.xml'), base_path/'ntcir-www-2'/'queries.xml'), qtype=NtcirQuery, qtype_map=ntcir_map, namespace=NAME),
        NtcirQrels(dlc['ntcir-www-2/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-2'))
    subsets['b13/ntcir-www-3'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['ntcir-www-3/queries'], qtype=NtcirQuery, qtype_map=ntcir_map, namespace=NAME),
        documentation('ntcir-www-3'))
    subsets['b13/trec-misinfo-2019'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['trec-misinfo-2019/queries'], qtype=MisinfoQuery, qtype_map=misinfo_map, namespace=NAME),
        MsinfoQrels(dlc['trec-misinfo-2019/qrels'], MISINFO_QREL_DEFS),
        documentation('trec-misinfo-2019'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[base_path/'nyt.tgz.pklz4'],
        message='Migrating nyt (extracting body text)')
    collection = migrator(NytDocs(dlc['source']))
    base = Dataset(collection, documentation('_'))
    # core17
    subsets['trec-core-2017'] = Dataset(
        TrecQueries(dlc['trec-core-2017/queries'], namespace='trec-core-2017', lang='en'),
        TrecQrels(dlc['trec-core-2017/qrels'], CORE_QREL_DEFS),
        collection,
        documentation('trec-core-2017'))
    # wksup
    all_queries = NytQueries(collection)
    all_qrels = NytQrels(collection)
    match_qids = Lazy(lambda: VALID_IDS)
    subsets['wksup'] = Dataset(
        all_queries,
        all_qrels,
        collection,
        documentation('wksup/train'))
    subsets['wksup/train'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='exclude'),
        FilteredQrels(all_qrels, match_qids, mode='exclude'),
        collection,
        documentation('wksup/train'))
    subsets['wksup/valid'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='include'),
        FilteredQrels(all_qrels, match_qids, mode='include'),
        collection,
        documentation('wksup/valid'))
    ir_datasets.registry.register('nyt', base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'nyt/{s}', subsets[s])
    return base, subsets
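# Sanity-check sketch: the complementary exclude/include modes above partition
# the weak-supervision queries, so train and valid should share no query_ids
# (dataset ids assume this module registers under 'nyt' as written).
import ir_datasets

train_ids = {q.query_id for q in ir_datasets.load('nyt/wksup/train').queries_iter()}
valid_ids = {q.query_id for q in ir_datasets.load('nyt/wksup/valid').queries_iter()}
assert train_ids.isdisjoint(valid_ids)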
def _init():
    base_path = ir_datasets.util.home_path() / NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_dlc = TarExtractAll(dlc['source'], base_path / 'lotte_extracted')
    base = Dataset(documentation('_'))
    subsets = {}
    domains = [
        ('lifestyle',),
        ('recreation',),
        ('science',),
        ('technology',),
        ('writing',),
        ('pooled',),
    ]
    for (domain,) in domains:
        for split in ['dev', 'test']:
            corpus = TsvDocs(RelativePath(base_dlc, f'lotte/{domain}/{split}/collection.tsv'), lang='en')
            subsets[f'{domain}/{split}'] = Dataset(corpus, documentation(f'{domain}/{split}'))
            for qtype in ['search', 'forum']:
                subsets[f'{domain}/{split}/{qtype}'] = Dataset(
                    corpus,
                    TsvQueries(RelativePath(base_dlc, f'lotte/{domain}/{split}/questions.{qtype}.tsv'), lang='en'),
                    LotteQrels(RelativePath(base_dlc, f'lotte/{domain}/{split}/qas.{qtype}.jsonl')),
                    documentation(f'{domain}/{split}/{qtype}'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    manager = MsMarcoQnAManager(GzipExtract(dlc['train']), GzipExtract(dlc['dev']), GzipExtract(dlc['eval']), base_path)
    migrator = Migrator(base_path/'irds_version.txt', 'v2', affected_files=[
        base_path/'docs.pklz4',
        base_path/'train.run', base_path/'train.qrels',
        base_path/'dev.run', base_path/'dev.qrels',
        base_path/'eval.run',
    ], message='Migrating msmarco-qna (correcting doc_ids)')
    collection = DocstoreBackedDocs(manager.docs_store, docs_cls=MsMarcoQnADoc, namespace=NAME, lang='en')
    collection = migrator(collection)
    subsets = {}
    subsets['train'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('train.queries.tsv'), query_cls=MsMarcoQnAQuery, namespace='msmarco', lang='en'),
        migrator(TrecQrels(manager.file_ref('train.qrels'), QRELS_DEFS)),
        migrator(TrecScoredDocs(manager.file_ref('train.run'))),
    )
    subsets['dev'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('dev.queries.tsv'), query_cls=MsMarcoQnAQuery, namespace='msmarco', lang='en'),
        migrator(TrecQrels(manager.file_ref('dev.qrels'), QRELS_DEFS)),
        migrator(TrecScoredDocs(manager.file_ref('dev.run'))),
    )
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('eval.queries.tsv'), query_cls=MsMarcoQnAEvalQuery, namespace='msmarco', lang='en'),
        migrator(TrecScoredDocs(manager.file_ref('eval.run'))),
    )
    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return collection, subsets
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = TrecDocs(
        dlc['docs'],
        encoding='ISO-8859-1',
        path_globs=['**/afp_text/af*', '**/infosel_data/ism_*'],
        namespace=NAME,
        lang='es',
        count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(collection, documentation('_'))
    subsets['trec3'] = Dataset(
        TrecSpanishTranslateQueries(
            TrecQueries(GzipExtract(dlc['trec3/queries']), qtype_map=QTYPE_MAP_3, encoding='ISO-8859-1', namespace=NAME, lang=None),
            TrecSpanish3Query),
        TrecQrels(GzipExtract(dlc['trec3/qrels']), QREL_DEFS),
        collection,
        documentation('trec3'))
    subsets['trec4'] = Dataset(
        TrecSpanishTranslateQueries(
            TrecQueries(GzipExtract(dlc['trec4/queries']), qtype=TrecDescOnlyQuery, qtype_map=QTYPE_MAP_4, encoding='ISO-8859-1', namespace=NAME, lang=None),
            TrecSpanish4Query),
        TrecQrels(GzipExtract(dlc['trec4/qrels']), QREL_DEFS),
        collection,
        documentation('trec4'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = Cord19Docs(dlc['docs/2020-07-16'], base_path/'2020-07-16', '2020-07-16')
    base = Dataset(collection, documentation('_'))
    subsets['trec-covid'] = Dataset(
        TrecXmlQueries(dlc['trec-covid/queries'], qtype_map={'query': 'title', 'question': 'description', 'narrative': 'narrative'}, namespace=NAME),
        TrecQrels(dlc['trec-covid/qrels'], QRELS_DEFS),
        collection,
        documentation('trec-covid'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection_v2 = WapoDocs(dlc['v2'])
    base = Dataset(documentation('_'))
    subsets['v2'] = Dataset(collection_v2, documentation('v2'))
    subsets['v2/trec-core-2018'] = Dataset(
        collection_v2,
        TrecQueries(dlc['trec-core-2018/queries'], namespace='trec-core-2018', lang='en', remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-core-2018/qrels'], CORE_QREL_DEFS),
        documentation('v2/trec-core-2018'))
    subsets['v2/trec-news-2018'] = Dataset(
        collection_v2,
        TrecQueries(dlc['trec-news-2018/queries'], namespace='trec-news-2018', lang='en', qtype=TrecBackgroundLinkingQuery, qtype_map=BL_MAP, remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-news-2018/qrels'], BL_QREL_DEFS),
        documentation('v2/trec-news-2018'))
    subsets['v2/trec-news-2019'] = Dataset(
        collection_v2,
        TrecQueries(dlc['trec-news-2019/queries'], namespace='trec-news-2019', lang='en', qtype=TrecBackgroundLinkingQuery, qtype_map=BL_MAP, remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-news-2019/qrels'], BL_QREL_DEFS),
        documentation('v2/trec-news-2019'))
    subsets['v3/trec-news-2020'] = Dataset(
        TrecQueries(dlc['trec-news-2020/queries'], namespace='trec-news-2020', lang='en', qtype=TrecBackgroundLinkingQuery, qtype_map=BL_MAP, remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-news-2020/qrels'], BL_QREL_DEFS),
        documentation('v3/trec-news-2020'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    collection = MedlineDocs([
        GzipExtract(dlc['docs/a']),
        GzipExtract(dlc['docs/b']),
        GzipExtract(dlc['docs/c']),
        GzipExtract(dlc['docs/d']),
    ])
    base = Dataset(collection, documentation('_'))
    subsets['trec-genomics-2004'] = Dataset(
        collection,
        TrecXmlQueries(ZipExtract(dlc['trec-genomics-2004/queries'], 'Official.xml'), qtype=TrecGenomicsQuery, qtype_map=TREC04_XML_MAP, namespace='trec-genomics'),
        TrecQrels(dlc['trec-genomics-2004/qrels'], QREL_DEFS),
        documentation('trec-genomics-2004'),
    )
    subsets['trec-genomics-2005'] = Dataset(
        collection,
        TrecGenomicsQueries(dlc['trec-genomics-2005/queries']),
        TrecQrels(dlc['trec-genomics-2005/qrels'], QREL_DEFS),
        documentation('trec-genomics-2005'),
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    base_path = ir_datasets.util.home_path()/NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[base_path/'msmarco_v2_passage.tar.pklz4'],
        message='Cleaning up pklz4 lookup structure in favor of ID-based lookups')
    collection = MsMarcoV2Passages(dlc['passages'])
    collection = migrator(collection)
    qrels_migrator = Migrator(base_path/'qrels_version.txt', 'v2',
        affected_files=[base_path/'train'/'qrels.tsv', base_path/'dev1'/'qrels.tsv', base_path/'dev2'/'qrels.tsv'],
        message='Updating qrels (task organizers removed duplicates)')
    subsets['train'] = Dataset(
        collection,
        TsvQueries(dlc['train/queries'], namespace='msmarco', lang='en'),
        qrels_migrator(TrecQrels(dlc['train/qrels'], QRELS_DEFS)),
        TrecScoredDocs(GzipExtract(dlc['train/scoreddocs'])),
    )
    subsets['dev1'] = Dataset(
        collection,
        TsvQueries(dlc['dev1/queries'], namespace='msmarco', lang='en'),
        qrels_migrator(TrecQrels(dlc['dev1/qrels'], QRELS_DEFS)),
        TrecScoredDocs(GzipExtract(dlc['dev1/scoreddocs'])),
    )
    subsets['dev2'] = Dataset(
        collection,
        TsvQueries(dlc['dev2/queries'], namespace='msmarco', lang='en'),
        qrels_migrator(TrecQrels(dlc['dev2/qrels'], QRELS_DEFS)),
        TrecScoredDocs(GzipExtract(dlc['dev2/scoreddocs'])),
    )
    subsets['trec-dl-2021'] = Dataset(
        collection,
        TsvQueries(dlc['trec-dl-2021/queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2021/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2021/scoreddocs'])),
    )
    dl21_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2021'].qrels_iter()})
    subsets['trec-dl-2021/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2021'].queries_handler(), dl21_judged),
        FilteredScoredDocs(subsets['trec-dl-2021'].scoreddocs_handler(), dl21_judged),
        subsets['trec-dl-2021'],
    )
    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return collection, subsets
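# Usage sketch of the 'judged' pattern above: the filtered variant keeps only
# queries that appear in the qrels (assumes NAME == 'msmarco-passage-v2').
import ir_datasets

full = ir_datasets.load('msmarco-passage-v2/trec-dl-2021')
judged = ir_datasets.load('msmarco-passage-v2/trec-dl-2021/judged')
judged_qids = {qrel.query_id for qrel in full.qrels_iter()}
assert all(q.query_id in judged_qids for q in judged.queries_iter())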
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    base = Dataset(documentation('_'))
    collection04 = MedlineDocs('2004', [
        GzipExtract(dlc['2004/a']),
        GzipExtract(dlc['2004/b']),
        GzipExtract(dlc['2004/c']),
        GzipExtract(dlc['2004/d']),
    ], count_hint=ir_datasets.util.count_hint(f'{NAME}/2004'))
    subsets['2004'] = Dataset(collection04, documentation('2004'))
    subsets['2004/trec-genomics-2004'] = Dataset(
        collection04,
        TrecXmlQueries(ZipExtract(dlc['trec-genomics-2004/queries'], 'Official.xml'), qtype=TrecGenomicsQuery, qtype_map=TREC04_XML_MAP, namespace='trec-genomics', lang='en'),
        TrecQrels(dlc['trec-genomics-2004/qrels'], QREL_DEFS),
        documentation('trec-genomics-2004'),
    )
    subsets['2004/trec-genomics-2005'] = Dataset(
        collection04,
        TrecGenomicsQueries(dlc['trec-genomics-2005/queries']),
        TrecQrels(dlc['trec-genomics-2005/qrels'], QREL_DEFS),
        documentation('trec-genomics-2005'),
    )
    collection17 = ConcatDocs([
        AacrAscoDocs(dlc['2017/aacr_asco_extra']),
        MedlineDocs('2017', [dlc['2017/part1'], dlc['2017/part2'], dlc['2017/part3'], dlc['2017/part4'], dlc['2017/part5']]),
    ], count_hint=ir_datasets.util.count_hint(f'{NAME}/2017'))
    subsets['2017'] = Dataset(collection17, documentation('2017'))
    subsets['2017/trec-pm-2017'] = Dataset(
        collection17,
        TrecXmlQueries(dlc['trec-pm-2017/queries'], qtype=TrecPm2017Query, namespace='trec-pm-2017', lang='en'),
        TrecQrels(dlc['trec-pm-2017/qrels'], QREL_DEFS),
        documentation('trec-pm-2017'),
    )
    subsets['2017/trec-pm-2018'] = Dataset(
        collection17,
        TrecXmlQueries(dlc['trec-pm-2018/queries'], qtype=TrecPmQuery, namespace='trec-pm-2018', lang='en'),
        TrecQrels(dlc['trec-pm-2018/qrels'], QREL_DEFS),
        documentation('trec-pm-2018'),
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    docs_v15 = CarDocs(TarExtract(dlc['docs'], 'paragraphcorpus/paragraphcorpus.cbor', compression='xz'))
    base = Dataset(documentation('_'))
    subsets['v1.5'] = Dataset(docs_v15, documentation('v1.5'))
    subsets['v1.5/trec-y1'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(dlc['trec-y1/queries'], 'benchmarkY1test.public/test.benchmarkY1test.cbor.outlines', compression='xz')))
    subsets['v1.5/trec-y1/manual'] = Dataset(
        subsets['v1.5/trec-y1'],
        TrecQrels(TarExtract(dlc['trec-y1/qrels'], 'TREC_CAR_2017_qrels/manual.benchmarkY1test.cbor.hierarchical.qrels'), MANUAL_QRELS))
    subsets['v1.5/trec-y1/auto'] = Dataset(
        subsets['v1.5/trec-y1'],
        TrecQrels(TarExtract(dlc['trec-y1/qrels'], 'TREC_CAR_2017_qrels/automatic.benchmarkY1test.cbor.hierarchical.qrels'), AUTO_QRELS))
    subsets['v1.5/test200'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(dlc['test200'], 'test200/train.test200.cbor.outlines', compression='xz')),
        TrecQrels(TarExtract(dlc['test200'], 'test200/train.test200.cbor.hierarchical.qrels', compression='xz'), AUTO_QRELS))
    train_data = ReTar(dlc['train'], base_path/'train.smaller.tar.xz', ['train/train.fold?.cbor.outlines', 'train/train.fold?.cbor.hierarchical.qrels'], compression='xz')
    subsets['v1.5/train/fold0'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(train_data, 'train/train.fold0.cbor.outlines', compression='xz')),
        TrecQrels(TarExtract(train_data, 'train/train.fold0.cbor.hierarchical.qrels', compression='xz'), AUTO_QRELS))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return base, subsets
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    main_dlc = dlc['main']
    base = Dataset(
        VaswaniDocs(Cache(TarExtract(main_dlc, 'doc-text'), base_path / 'docs.txt')),
        VaswaniQueries(Cache(TarExtract(main_dlc, 'query-text'), base_path / 'queries.txt')),
        VaswaniQrels(Cache(TarExtract(main_dlc, 'rlv-ass'), base_path / 'qrels.txt')),
        documentation('_'),
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    collection = GovDocs(dlc['docs'])
    base = Dataset(collection, documentation('_'))
    subsets['trec-web-2002'] = Dataset(
        collection,
        TrecQueries(GzipExtract(dlc['trec-web-2002/queries']), namespace='gov/trec-web-2002', lang='en'),
        TrecQrels(GzipExtract(dlc['trec-web-2002/qrels']), QREL_DEFS),
        documentation('trec-web-2002'))
    subsets['trec-web-2002/named-page'] = Dataset(
        collection,
        TrecQueries(GzipExtract(dlc['trec-web-2002/named-page/queries']), qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace='gov/trec-web-2002/named-page', lang='en'),
        TrecQrels(GzipExtract(dlc['trec-web-2002/named-page/qrels']), NAMED_PAGE_QREL_DEFS),
        documentation('trec-web-2002/named-page'))
    subsets['trec-web-2003'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2003/queries'], qtype=GovWeb02Query, qtype_map=WEB03_QTYPE_MAP, namespace='gov/trec-web-2003', lang='en'),
        TrecQrels(dlc['trec-web-2003/qrels'], QREL_DEFS),
        documentation('trec-web-2003'))
    subsets['trec-web-2003/named-page'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2003/named-page/queries'], qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace='gov/trec-web-2003/named-page', lang='en'),
        TrecQrels(dlc['trec-web-2003/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-web-2003/named-page'))
    subsets['trec-web-2004'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2004/queries'], qtype=GenericQuery, qtype_map=WEB04_QTYPE_MAP, namespace='gov/trec-web-2004', lang='en'),
        TrecQrels(dlc['trec-web-2004/qrels'], QREL_DEFS),
        documentation('trec-web-2004'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    main_dlc = dlc['main']
    base = Dataset(
        CranfieldDocs(Cache(TarExtract(main_dlc, 'cran.all.1400'), base_path / 'docs.txt')),
        CranfieldQueries(Cache(TarExtract(main_dlc, 'cran.qry'), base_path / 'queries.txt')),
        CranfieldQrels(Cache(TarExtract(main_dlc, 'cranqrel'), base_path / 'qrels.txt')),
        documentation('_'),
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    migrator = Migrator(base_path / 'irds_version.txt', 'v2',
        affected_files=[
            base_path / 'collection.tsv',
            base_path / 'collection.tsv.pklz4',
        ],
        message=f'Migrating {NAME} (fixing passage encoding)')
    collection = TsvDocs(
        Cache(
            FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')),
            base_path / 'collection.tsv'),
        namespace='msmarco',
        lang='en',
        docstore_size_hint=14373971970,
        count_hint=ir_datasets.util.count_hint(NAME))
    collection = migrator(collection)
    subsets = {}
    subsets['train'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.train.tsv'), base_path / 'train/queries.tsv'), namespace='msmarco', lang='en'),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt')),
                base_path / 'train/ms.run')),
    )
    subsets['train/triples-v2'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(GzipExtract(dlc['train/docpairs/v2'])),
        subsets['train'].scoreddocs_handler(),
    )
    subsets['train/triples-small'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(
            Cache(
                MapSmallTriplesQidPid(
                    TarExtract(dlc['train/docpairs/small'], 'triples.train.small.tsv'),
                    TarExtract(dlc['collectionandqueries'], 'collection.tsv'),
                    subsets['train'].queries_handler()),
                base_path / 'train/small.triples.qidpid.tsv')),
        subsets['train'].scoreddocs_handler(),
    )
    subsets['dev'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'), base_path / 'dev/queries.tsv'), namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
    )
    subsets['dev/small'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'), base_path / 'dev/small/queries.tsv'), namespace='msmarco', lang='en'),
        TrecQrels(Cache(TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'), base_path / 'dev/small/qrels'), QRELS_DEFS),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['dev/scoreddocs'], 'top1000.dev')),
                base_path / 'dev/ms.run')),
    )
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'), base_path / 'eval/queries.tsv'), namespace='msmarco', lang='en'),
    )
    subsets['eval/small'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'), base_path / 'eval/small/queries.tsv'), namespace='msmarco', lang='en'),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['eval/scoreddocs'], 'top1000.eval')),
                base_path / 'eval/ms.run')),
    )
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TsvQueries(Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path / 'trec-dl-2019/queries.tsv'), namespace='msmarco', lang='en'),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
                base_path / 'trec-dl-2019/ms.run')),
    )
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']), namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
                base_path / 'trec-dl-2020/ms.run')),
    )
    # A few subsets that are constrained to just the queries/qrels/docpairs that
    # have at least 1 relevance assessment
    train_judged = Lazy(lambda: {q.query_id for q in subsets['train'].qrels_iter()})
    subsets['train/judged'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_judged),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_judged),
        subsets['train'],
    )
    dev_judged = Lazy(lambda: {q.query_id for q in subsets['dev'].qrels_iter()})
    subsets['dev/judged'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
        subsets['dev'],
    )
    dl19_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(), dl19_judged),
        subsets['trec-dl-2019'],
    )
    dl20_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(), dl20_judged),
        FilteredScoredDocs(subsets['trec-dl-2020'].scoreddocs_handler(), dl20_judged),
        subsets['trec-dl-2020'],
    )
    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), split200, mode='exclude'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), split200, mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(), split200, mode='exclude'),
        FilteredDocPairs(subsets['train'].docpairs_handler(), split200, mode='exclude'),
        subsets['train'],
    )
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), split200, mode='include'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), split200, mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(), split200, mode='include'),
        FilteredDocPairs(subsets['train'].docpairs_handler(), split200, mode='include'),
        subsets['train'],
    )
    # Medical subset
    def train_med():
        with dlc['medmarco_ids'].stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}
    train_med = Lazy(train_med)
    subsets['train/medical'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_med),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_med),
        FilteredDocPairs(subsets['train'].docpairs_handler(), train_med),
        FilteredQrels(subsets['train'].qrels_handler(), train_med),
        subsets['train'],
    )
    # DL-Hard
    dl_hard_qrels_migrator = Migrator(
        base_path / 'trec-dl-hard' / 'irds_version.txt', 'v3',
        affected_files=[base_path / 'trec-dl-hard' / 'qrels'],
        message='Updating trec-dl-hard qrels')
    hard_qids = Lazy(lambda: DL_HARD_QIDS)
    dl_hard_base_queries = TsvQueries([
        Cache(GzipExtract(dlc['trec-dl-2019/queries']), base_path / 'trec-dl-2019/queries.tsv'),
        Cache(GzipExtract(dlc['trec-dl-2020/queries']), base_path / 'trec-dl-2020/queries.tsv'),
    ], namespace='msmarco', lang='en')
    subsets['trec-dl-hard'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        dl_hard_qrels_migrator(TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['1'])
    subsets['trec-dl-hard/fold1'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold1'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['2'])
    subsets['trec-dl-hard/fold2'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold2'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['3'])
    subsets['trec-dl-hard/fold3'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold3'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['4'])
    subsets['trec-dl-hard/fold4'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold4'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['5'])
    subsets['trec-dl-hard/fold5'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold5'))
    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return collection, subsets
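# Usage sketch: the scoreddocs wired up above expose the shipped top-1000
# candidate rankings (assumes NAME == 'msmarco-passage').
import ir_datasets

dl19 = ir_datasets.load('msmarco-passage/trec-dl-2019/judged')
for sd in dl19.scoreddocs_iter():
    sd.query_id, sd.doc_id, sd.score  # one candidate per row
    break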