def _init():
    """Build and register the Arabic newswire datasets (base corpus + ar2001/ar2002)."""
    home = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, home)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = TrecDocs(
        dlc['docs'],
        encoding='utf8',
        path_globs=['arabic_newswire_a/transcripts/*/*.sgm.gz'],
        namespace=NAME)
    base = Dataset(collection, documentation('_'))
    subsets = {}
    # Both TREC Arabic years share the corpus; only topics/qrels differ.
    # Topic files are ISO-8859-6 (Arabic) encoded.
    for year in ('ar2001', 'ar2002'):
        subsets[year] = Dataset(
            TrecQueries(dlc[f'{year}/queries'], qtype_map=QTYPE_MAP, encoding='ISO-8859-6', namespace=NAME),
            TrecQrels(dlc[f'{year}/qrels'], QREL_DEFS),
            collection,
            documentation(year))
    ir_datasets.registry.register(NAME, base)
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', subsets[subset_id])
    return base, subsets
def _init():
    """Build and register the TREC Mandarin datasets (corpus + trec5/trec6 subsets)."""
    subsets = {}
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    # Docs are GB18030-encoded; sources are Xinhua and People's Daily files.
    collection = TrecDocs(dlc['docs'], encoding='GB18030', path_globs=['**/xinhua/x*', '**/peoples-daily/pd*'], namespace=NAME, lang='zh', count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(collection, documentation('_'))
    subsets['trec5'] = Dataset(
        TrecQueries(GzipExtract(dlc['trec5/queries']), qtype=TrecMandarinQuery, qtype_map=QTYPE_MAP, encoding='GBK', namespace=NAME, lang=None),  # queries have multiple languages
        TrecQrels(GzipExtract(dlc['trec5/qrels']), QREL_DEFS),
        collection,
        documentation('trec5'))
    subsets['trec6'] = Dataset(
        TrecQueries(GzipExtract(dlc['trec6/queries']), qtype=TrecMandarinQuery, qtype_map=QTYPE_MAP, encoding='GBK', namespace=NAME, lang=None),  # queries have multiple languages
        TrecQrels(GzipExtract(dlc['trec6/qrels']), QREL_DEFS),
        collection,
        documentation('trec6'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    """Build and register the Washington Post (wapo) datasets."""
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection_v2 = WapoDocs(dlc['v2'])
    # The top-level entry has no docs of its own; corpora live on the versioned subsets.
    base = Dataset(documentation('_'))
    subsets['v2'] = Dataset(collection_v2, documentation('v2'))
    subsets['v2/trec-core-2018'] = Dataset(
        collection_v2,
        TrecQueries(dlc['trec-core-2018/queries'], namespace='trec-core-2018', lang='en', remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-core-2018/qrels'], CORE_QREL_DEFS),
        documentation('v2/trec-core-2018'))
    subsets['v2/trec-news-2018'] = Dataset(
        collection_v2,
        TrecQueries(dlc['trec-news-2018/queries'], namespace='trec-news-2018', lang='en', qtype=TrecBackgroundLinkingQuery, qtype_map=BL_MAP, remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-news-2018/qrels'], BL_QREL_DEFS),
        documentation('v2/trec-news-2018'))
    subsets['v2/trec-news-2019'] = Dataset(
        collection_v2,
        TrecQueries(dlc['trec-news-2019/queries'], namespace='trec-news-2019', lang='en', qtype=TrecBackgroundLinkingQuery, qtype_map=BL_MAP, remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-news-2019/qrels'], BL_QREL_DEFS),
        documentation('v2/trec-news-2019'))
    # NOTE(review): no collection is attached for v3 — presumably the v3 corpus
    # is not available through this module; confirm this is intentional.
    subsets['v3/trec-news-2020'] = Dataset(
        TrecQueries(dlc['trec-news-2020/queries'], namespace='trec-news-2020', lang='en', qtype=TrecBackgroundLinkingQuery, qtype_map=BL_MAP, remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-news-2020/qrels'], BL_QREL_DEFS),
        documentation('v3/trec-news-2020'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    """Build and register the ClueWeb12 datasets (full corpus + B13 subset and tracks)."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    docs_dlc = dlc['docs']
    docs_chk_dlc = TarExtractAll(dlc['docs.chk'], base_path/'corpus.chk')
    # The B13 subset is derived via a jar shipped inside the cw12b-info package.
    b13_dlc = Bz2Extract(
        Cache(
            TarExtract(dlc['cw12b-info'], 'ClueWeb12-CreateB13/software/CreateClueWeb12B13Dataset.jar'),
            base_path/'CreateClueWeb12B13Dataset.jar'))
    collection = ClueWeb12Docs(docs_dlc, docs_chk_dlc)
    collection_b13 = ClueWeb12Docs(ClueWeb12b13Extractor(docs_dlc, b13_dlc))
    base = Dataset(collection, documentation('_'))
    subsets['b13'] = Dataset(collection_b13, documentation('b13'))
    subsets['trec-web-2013'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2013/queries'], qtype=TrecWebTrackQuery, namespace=NAME),
        TrecQrels(dlc['trec-web-2013/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2013'))
    subsets['trec-web-2014'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2014/queries'], qtype=TrecWebTrackQuery, namespace=NAME),
        TrecQrels(dlc['trec-web-2014/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2014'))
    subsets['b13/ntcir-www-1'] = Dataset(
        collection_b13,
        TrecXmlQueries(
            Cache(ZipExtract(dlc['ntcir-www-1/queries'], 'eng.queries.xml'), base_path/'ntcir-www-1'/'queries.xml'),
            qtype=GenericQuery, qtype_map={'qid': 'query_id', 'content': 'text'}, namespace=NAME),
        NtcirQrels(dlc['ntcir-www-1/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-1'))
    subsets['b13/ntcir-www-2'] = Dataset(
        collection_b13,
        TrecXmlQueries(
            Cache(ZipExtract(dlc['ntcir-www-2/queries'], 'qEng.xml'), base_path/'ntcir-www-2'/'queries.xml'),
            qtype=NtcirQuery, qtype_map=ntcir_map, namespace=NAME),
        NtcirQrels(dlc['ntcir-www-2/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-2'))
    # WWW-3 provides queries only (no qrels constructed here).
    subsets['b13/ntcir-www-3'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['ntcir-www-3/queries'], qtype=NtcirQuery, qtype_map=ntcir_map, namespace=NAME),
        documentation('ntcir-www-3'))
    subsets['b13/trec-misinfo-2019'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['trec-misinfo-2019/queries'], qtype=MisinfoQuery, qtype_map=misinfo_map, namespace=NAME),
        # NOTE(review): class spelled "MsinfoQrels" while the query class is
        # "MisinfoQuery" — confirm the name matches its definition.
        MsinfoQrels(dlc['trec-misinfo-2019/qrels'], MISINFO_QREL_DEFS),
        documentation('trec-misinfo-2019'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    """Build and register the DPR Wikipedia-100 (dpr-w100) datasets."""
    base_path = ir_datasets.util.home_path() / NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = TsvDocs(
        GzipExtract(dlc['docs']),
        doc_cls=DprW100Doc,
        namespace=NAME,
        lang='en',
        skip_first_line=True,  # first line is skipped (presumably a header row — confirm)
        docstore_size_hint=12827215492,
        count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(collection, documentation('_'))
    subsets = {}
    nq_dev_manager = DprW100Manager(GzipExtract(dlc['nq-dev']), base_path / 'nq-dev')
    subsets['natural-questions/dev'] = Dataset(
        collection,
        DprW100Queries(nq_dev_manager.file_ref('queries.tsv')),
        TrecQrels(nq_dev_manager.file_ref('qrels'), QREL_DEFS),
        documentation('natural-questions/dev'))
    nq_train_manager = DprW100Manager(GzipExtract(dlc['nq-train']), base_path / 'nq-train')
    subsets['natural-questions/train'] = Dataset(
        collection,
        DprW100Queries(nq_train_manager.file_ref('queries.tsv')),
        TrecQrels(nq_train_manager.file_ref('qrels'), QREL_DEFS),
        documentation('natural-questions/train'))
    # TriviaQA source files use a different passage-id key ('psg_id').
    tqa_dev_manager = DprW100Manager(GzipExtract(dlc['tqa-dev']), base_path / 'tqa-dev', passage_id_key='psg_id')
    subsets['trivia-qa/dev'] = Dataset(
        collection,
        DprW100Queries(tqa_dev_manager.file_ref('queries.tsv')),
        TrecQrels(tqa_dev_manager.file_ref('qrels'), QREL_DEFS),
        documentation('trivia-qa/dev'))
    tqa_train_manager = DprW100Manager(GzipExtract(dlc['tqa-train']), base_path / 'tqa-train', passage_id_key='psg_id')
    subsets['trivia-qa/train'] = Dataset(
        collection,
        DprW100Queries(tqa_train_manager.file_ref('queries.tsv')),
        TrecQrels(tqa_train_manager.file_ref('qrels'), QREL_DEFS),
        documentation('trivia-qa/train'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    """Build and register the MS MARCO v2 passage datasets."""
    base_path = ir_datasets.util.home_path()/NAME
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    subsets = {}
    # One-time migration for caches written by older package versions.
    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[base_path/'msmarco_v2_passage.tar.pklz4'],
        message='Cleaning up pklz4 lookup structure in favor of ID-based lookups')
    collection = MsMarcoV2Passages(dlc['passages'])
    collection = migrator(collection)
    # Separate migration for the qrels files (deduplicated upstream).
    qrels_migrator = Migrator(base_path/'qrels_version.txt', 'v2',
        affected_files=[base_path/'train'/'qrels.tsv', base_path/'dev1'/'qrels.tsv', base_path/'dev2'/'qrels.tsv'],
        message='Updating qrels (task organizers removed duplicates)')
    subsets['train'] = Dataset(
        collection,
        TsvQueries(dlc['train/queries'], namespace='msmarco', lang='en'),
        qrels_migrator(TrecQrels(dlc['train/qrels'], QRELS_DEFS)),
        TrecScoredDocs(GzipExtract(dlc['train/scoreddocs'])),
    )
    subsets['dev1'] = Dataset(
        collection,
        TsvQueries(dlc['dev1/queries'], namespace='msmarco', lang='en'),
        qrels_migrator(TrecQrels(dlc['dev1/qrels'], QRELS_DEFS)),
        TrecScoredDocs(GzipExtract(dlc['dev1/scoreddocs'])),
    )
    subsets['dev2'] = Dataset(
        collection,
        TsvQueries(dlc['dev2/queries'], namespace='msmarco', lang='en'),
        qrels_migrator(TrecQrels(dlc['dev2/qrels'], QRELS_DEFS)),
        TrecScoredDocs(GzipExtract(dlc['dev2/scoreddocs'])),
    )
    subsets['trec-dl-2021'] = Dataset(
        collection,
        TsvQueries(dlc['trec-dl-2021/queries'], namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2021/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(GzipExtract(dlc['trec-dl-2021/scoreddocs'])),
    )
    # Lazily computed set of query_ids that actually have judgments.
    dl21_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2021'].qrels_iter()})
    subsets['trec-dl-2021/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2021'].queries_handler(), dl21_judged),
        FilteredScoredDocs(subsets['trec-dl-2021'].scoreddocs_handler(), dl21_judged),
        subsets['trec-dl-2021'],
    )
    ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return collection, subsets
def _init():
    """Build and register the Medline datasets (2004 and 2017 corpora + tracks)."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    base = Dataset(documentation('_'))
    # 2004 corpus: four gzip'd parts.
    collection04 = MedlineDocs('2004', [GzipExtract(dlc['2004/a']), GzipExtract(dlc['2004/b']), GzipExtract(dlc['2004/c']), GzipExtract(dlc['2004/d'])], count_hint=ir_datasets.util.count_hint(f'{NAME}/2004'))
    subsets['2004'] = Dataset(collection04, documentation('2004'))
    subsets['2004/trec-genomics-2004'] = Dataset(
        collection04,
        TrecXmlQueries(ZipExtract(dlc['trec-genomics-2004/queries'], 'Official.xml'), qtype=TrecGenomicsQuery, qtype_map=TREC04_XML_MAP, namespace='trec-genomics', lang='en'),
        TrecQrels(dlc['trec-genomics-2004/qrels'], QREL_DEFS),
        documentation('trec-genomics-2004'),
    )
    subsets['2004/trec-genomics-2005'] = Dataset(
        collection04,
        TrecGenomicsQueries(dlc['trec-genomics-2005/queries']),
        TrecQrels(dlc['trec-genomics-2005/qrels'], QREL_DEFS),
        documentation('trec-genomics-2005'),
    )
    # 2017 corpus: AACR/ASCO extra abstracts concatenated with five Medline parts.
    collection17 = ConcatDocs([
        AacrAscoDocs(dlc['2017/aacr_asco_extra']),
        MedlineDocs('2017', [dlc['2017/part1'], dlc['2017/part2'], dlc['2017/part3'], dlc['2017/part4'], dlc['2017/part5']]),
    ], count_hint=ir_datasets.util.count_hint(f'{NAME}/2017'))
    subsets['2017'] = Dataset(collection17, documentation('2017'))
    subsets['2017/trec-pm-2017'] = Dataset(
        collection17,
        TrecXmlQueries(dlc['trec-pm-2017/queries'], qtype=TrecPm2017Query, namespace='trec-pm-2017', lang='en'),
        TrecQrels(dlc['trec-pm-2017/qrels'], QREL_DEFS),
        documentation('trec-pm-2017'),
    )
    subsets['2017/trec-pm-2018'] = Dataset(
        collection17,
        TrecXmlQueries(dlc['trec-pm-2018/queries'], qtype=TrecPmQuery, namespace='trec-pm-2018', lang='en'),
        TrecQrels(dlc['trec-pm-2018/qrels'], QREL_DEFS),
        documentation('trec-pm-2018'),
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    """Assemble and register the TREC Robust 2004 dataset and its folds."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    corpus = TrecDocs(
        dlc['docs'],
        path_globs=['**/FBIS/FB*', '**/FR94/??/FR*', '**/FT/*/FT*', '**/LATIMES/LA*'],
        namespace=NAME,
        lang='en',
        expected_file_count=2295,
        count_hint=ir_datasets.util.count_hint(NAME))
    topics = TrecQueries(GzipExtract(dlc['queries']), namespace=NAME, lang='en')
    judgments = TrecQrels(dlc['qrels'], QREL_DEFS)
    base = Dataset(corpus, topics, judgments, documentation('_'))
    subsets = {}
    # Each fold restricts queries/qrels to its query-id subset; docs are shared.
    for fold in FOLDS:
        fold_qids = make_filter(fold)
        subsets[fold] = Dataset(
            FilteredQueries(topics, fold_qids),
            FilteredQrels(judgments, fold_qids),
            corpus,
            documentation(fold))
    ir_datasets.registry.register(NAME, base)
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', subsets[subset_id])
    return base, subsets
def _init():
    """Assemble and register the HC4 datasets (zh/fa/ru, each with train/dev/test)."""
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base = Dataset(documentation('_'))  # dummy top level ds
    subsets = {}
    for language in ('zh', 'fa', 'ru'):
        corpus = HC4Docs(dlc[f'{language}/docs'], subset_lang=language)
        subsets[language] = Dataset(corpus, documentation(language))
        for split in ('train', 'dev', 'test'):
            key = f'{language}/{split}'
            subsets[key] = Dataset(
                corpus,
                # Topic download is keyed by split only (no language component);
                # HC4Queries receives subset_lang — presumably it selects the
                # per-language content from the shared topic file.
                HC4Queries(dlc[f'{split}/topics'], subset_lang=language),
                TrecQrels(dlc[f'{language}/{split}/qrels'], QREL_DEFS),
                documentation(key),
            )
    ir_datasets.registry.register(NAME, base)
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', subsets[subset_id])
    return base, subsets
def test_qrels_bad_line(self):
    """Consuming qrels containing a malformed record must raise RuntimeError."""
    # NOTE(review): the record separators inside this literal appear to have been
    # collapsed to spaces by reformatting; qrels are line-oriented, so each record
    # was presumably on its own line originally — confirm against history.
    mock_file = StringFile(''' Q0 0 D1 3 Q0 1 D2 2 Q0 0\tD3 3 Q0 1 D2 1 BAD LINE Q1 0 D2 1 '''.lstrip())
    QREL_DEFS = {}
    qrels = TrecQrels(mock_file, QREL_DEFS)
    # The error surfaces lazily, only when the iterator is consumed.
    with self.assertRaises(RuntimeError):
        list(qrels.qrels_iter())
def _init():
    """Construct and register the AQUAINT collection and its Robust 2005 subset."""
    home = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, home)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    # AP, NYT and Xinhua newswire portions of the AQUAINT distribution.
    news_globs = [
        'aquaint_comp/apw/*/*.gz',
        'aquaint_comp/nyt/*/*.gz',
        'aquaint_comp/xie/*/*.gz',
    ]
    corpus = TrecDocs(
        dlc['docs'],
        encoding='utf8',
        path_globs=news_globs,
        namespace=NAME,
        lang='en',
        count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(corpus, documentation('_'))
    robust05 = Dataset(
        TrecQueries(dlc['trec-robust-2005/queries'], qtype_map=QTYPE_MAP, namespace='trec-robust', lang='en'),
        TrecQrels(dlc['trec-robust-2005/qrels'], QREL_DEFS),
        corpus,
        documentation('trec-robust-2005'))
    subsets = {'trec-robust-2005': robust05}
    ir_datasets.registry.register(NAME, base)
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', subsets[subset_id])
    return base, subsets
def _init():
    """Build and register the ANTIQUE datasets (train/test + derived splits).

    Fixes the misspelled local names ``disllow_list``/``disllow_qids`` →
    ``disallow_*`` (local-only rename; the dlc key 'disallow_list' and all
    registered dataset ids are unchanged).
    """
    documentation = YamlDocumentation('docs/antique.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    collection = TsvDocs(dlc['docs'], namespace=NAME, lang='en', count_hint=ir_datasets.util.count_hint(NAME))
    subsets = {}
    for subset in ('train', 'test'):
        qrels = TrecQrels(dlc[f'{subset}/qrels'], QREL_DEFS)
        queries = TsvQueries(dlc[f'{subset}/queries'], namespace=NAME, lang='en')
        subsets[subset] = Dataset(collection, queries, qrels)

    # Split the training data into training and validation data
    validation_qids = Lazy(lambda: VALIDATION_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), validation_qids, mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(), validation_qids, mode='exclude'),
        subsets['train'])
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), validation_qids, mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(), validation_qids, mode='include'),
        subsets['train'])

    # Separate test set removing the "offensive (and noisy)" questions
    disallow_list = dlc['disallow_list']
    def _read_disallow_qids():
        # One query_id per line in the disallow list.
        with disallow_list.stream() as stream:
            stream = io.TextIOWrapper(stream)
            return {l.rstrip() for l in stream}
    disallow_qids = Lazy(_read_disallow_qids)
    subsets['test/non-offensive'] = Dataset(
        FilteredQueries(subsets['test'].queries_handler(), disallow_qids, mode='exclude'),
        FilteredQrels(subsets['test'].qrels_handler(), disallow_qids, mode='exclude'),
        subsets['test'])

    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return collection, subsets
def _init():
    """Build and register the GOV datasets (TREC Web Track 2002-2004 subsets)."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    collection = GovDocs(dlc['docs'])
    base = Dataset(collection, documentation('_'))
    subsets['trec-web-2002'] = Dataset(
        collection,
        TrecQueries(GzipExtract(dlc['trec-web-2002/queries']), namespace='gov/trec-web-2002', lang='en'),
        TrecQrels(GzipExtract(dlc['trec-web-2002/qrels']), QREL_DEFS),
        documentation('trec-web-2002')
    )
    # named-page runs use a generic (qid, text) query type and their own qrel defs.
    subsets['trec-web-2002/named-page'] = Dataset(
        collection,
        TrecQueries(GzipExtract(dlc['trec-web-2002/named-page/queries']), qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace='gov/trec-web-2002/named-page', lang='en'),
        TrecQrels(GzipExtract(dlc['trec-web-2002/named-page/qrels']), NAMED_PAGE_QREL_DEFS),
        documentation('trec-web-2002/named-page')
    )
    subsets['trec-web-2003'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2003/queries'], qtype=GovWeb02Query, qtype_map=WEB03_QTYPE_MAP, namespace='gov/trec-web-2003', lang='en'),
        TrecQrels(dlc['trec-web-2003/qrels'], QREL_DEFS),
        documentation('trec-web-2003')
    )
    subsets['trec-web-2003/named-page'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2003/named-page/queries'], qtype=GenericQuery, qtype_map=NAMED_PAGE_QTYPE_MAP, namespace='gov/trec-web-2003/named-page', lang='en'),
        TrecQrels(dlc['trec-web-2003/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-web-2003/named-page')
    )
    subsets['trec-web-2004'] = Dataset(
        collection,
        TrecQueries(dlc['trec-web-2004/queries'], qtype=GenericQuery, qtype_map=WEB04_QTYPE_MAP, namespace='gov/trec-web-2004', lang='en'),
        TrecQrels(dlc['trec-web-2004/qrels'], QREL_DEFS),
        documentation('trec-web-2004')
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    """Build and register the MS MARCO QnA datasets (train/dev/eval)."""
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    # Manager materializes docs/queries/qrels/run files from the three source dumps.
    manager = MsMarcoQnAManager(GzipExtract(dlc['train']), GzipExtract(dlc['dev']), GzipExtract(dlc['eval']), base_path)
    # One-time migration for caches written with the old (incorrect) doc_ids.
    migrator = Migrator(base_path/'irds_version.txt', 'v2', affected_files=[
        base_path/'docs.pklz4',
        base_path/'train.run', base_path/'train.qrels',
        base_path/'dev.run', base_path/'dev.qrels',
        base_path/'eval.run',
    ], message='Migrating msmarco-qna (correcting doc_ids)')
    collection = DocstoreBackedDocs(manager.docs_store, docs_cls=MsMarcoQnADoc, namespace=NAME, lang='en')
    collection = migrator(collection)
    subsets = {}
    subsets['train'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('train.queries.tsv'), query_cls=MsMarcoQnAQuery, namespace='msmarco', lang='en'),
        migrator(TrecQrels(manager.file_ref('train.qrels'), QRELS_DEFS)),
        migrator(TrecScoredDocs(manager.file_ref('train.run'))),
    )
    subsets['dev'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('dev.queries.tsv'), query_cls=MsMarcoQnAQuery, namespace='msmarco', lang='en'),
        migrator(TrecQrels(manager.file_ref('dev.qrels'), QRELS_DEFS)),
        migrator(TrecScoredDocs(manager.file_ref('dev.run'))),
    )
    # eval: no qrels here (held-out split) — only queries and scored docs.
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(manager.file_ref('eval.queries.tsv'), query_cls=MsMarcoQnAEvalQuery, namespace='msmarco', lang='en'),
        migrator(TrecScoredDocs(manager.file_ref('eval.run'))),
    )
    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return collection, subsets
def _init():
    """Build and register the TREC Spanish datasets (trec3/trec4 subsets)."""
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = TrecDocs(
        dlc['docs'],
        encoding='ISO-8859-1',
        path_globs=['**/afp_text/af*', '**/infosel_data/ism_*'],
        namespace=NAME,
        lang='es',
        count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(collection, documentation('_'))
    # lang=None on the inner TrecQueries: the raw topic files mix languages;
    # the TrecSpanishTranslateQueries wrapper presumably resolves this — confirm.
    subsets['trec3'] = Dataset(
        TrecSpanishTranslateQueries(
            TrecQueries(GzipExtract(dlc['trec3/queries']), qtype_map=QTYPE_MAP_3, encoding='ISO-8859-1', namespace=NAME, lang=None),
            TrecSpanish3Query),
        TrecQrels(GzipExtract(dlc['trec3/qrels']), QREL_DEFS),
        collection,
        documentation('trec3'))
    subsets['trec4'] = Dataset(
        TrecSpanishTranslateQueries(
            TrecQueries(GzipExtract(dlc['trec4/queries']), qtype=TrecDescOnlyQuery, qtype_map=QTYPE_MAP_4, encoding='ISO-8859-1', namespace=NAME, lang=None),
            TrecSpanish4Query),
        TrecQrels(GzipExtract(dlc['trec4/qrels']), QREL_DEFS),
        collection,
        documentation('trec4'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def test_qrels(self):
    """Qrels parse into TrecQrel tuples: (query_id, doc_id, relevance, iteration)."""
    # NOTE(review): the record separators inside this literal appear to have been
    # collapsed to spaces by reformatting; qrels are line-oriented, so each record
    # was presumably on its own line originally — confirm against history.
    mock_file = StringFile(''' Q0 0 D1 3 Q0 1 D2 2 Q0 0\tD3 3 Q0 1 D2 1 Q1 0 D2 1 '''.lstrip())
    QREL_DEFS = {}
    expected_results = [
        TrecQrel('Q0', 'D1', 3, '0'),
        TrecQrel('Q0', 'D2', 2, '1'),
        TrecQrel('Q0', 'D3', 3, '0'),  # a tab-separated record is also accepted
        TrecQrel('Q0', 'D2', 1, '1'),
        TrecQrel('Q1', 'D2', 1, '0'),
    ]
    qrels = TrecQrels(mock_file, QREL_DEFS)
    self.assertEqual(qrels.qrels_path(), 'MOCK')
    self.assertEqual(qrels.qrels_defs(), QREL_DEFS)
    self.assertEqual(list(qrels.qrels_iter()), expected_results)
def _init():
    """Build and register the TREC CAR (v1.5) datasets."""
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    docs_v15 = CarDocs(TarExtract(dlc['docs'], 'paragraphcorpus/paragraphcorpus.cbor', compression='xz'))
    base = Dataset(documentation('_'))
    subsets['v1.5'] = Dataset(docs_v15, documentation('v1.5'))
    subsets['v1.5/trec-y1'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(dlc['trec-y1/queries'], 'benchmarkY1test.public/test.benchmarkY1test.cbor.outlines', compression='xz')),)
    # Y1 judgments exist in two variants: manual and automatic.
    subsets['v1.5/trec-y1/manual'] = Dataset(
        subsets['v1.5/trec-y1'],
        TrecQrels(TarExtract(dlc['trec-y1/qrels'], 'TREC_CAR_2017_qrels/manual.benchmarkY1test.cbor.hierarchical.qrels'), MANUAL_QRELS))
    subsets['v1.5/trec-y1/auto'] = Dataset(
        subsets['v1.5/trec-y1'],
        TrecQrels(TarExtract(dlc['trec-y1/qrels'], 'TREC_CAR_2017_qrels/automatic.benchmarkY1test.cbor.hierarchical.qrels'), AUTO_QRELS))
    subsets['v1.5/test200'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(dlc['test200'], 'test200/train.test200.cbor.outlines', compression='xz')),
        TrecQrels(TarExtract(dlc['test200'], 'test200/train.test200.cbor.hierarchical.qrels', compression='xz'), AUTO_QRELS))
    # ReTar re-packages only the outline/qrel members out of the (large) train tarball.
    train_data = ReTar(dlc['train'], base_path/'train.smaller.tar.xz', ['train/train.fold?.cbor.outlines', 'train/train.fold?.cbor.hierarchical.qrels'], compression='xz')
    subsets['v1.5/train/fold0'] = Dataset(
        docs_v15,
        CarQueries(TarExtract(train_data, 'train/train.fold0.cbor.outlines', compression='xz')),
        TrecQrels(TarExtract(train_data, 'train/train.fold0.cbor.hierarchical.qrels', compression='xz'), AUTO_QRELS))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
    return base, subsets
def _init():
    """Build and register the Medline datasets (genomics 2004/2005 subsets)."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    collection = MedlineDocs([
        GzipExtract(dlc['docs/a']),
        GzipExtract(dlc['docs/b']),
        GzipExtract(dlc['docs/c']),
        GzipExtract(dlc['docs/d'])
    ])
    base = Dataset(collection, documentation('_'))
    subsets['trec-genomics-2004'] = Dataset(
        collection,
        # NOTE(review): this TrecXmlQueries omits lang='en', unlike the otherwise
        # identical construction in the 2004-corpus module — confirm whether the
        # omission is intentional.
        TrecXmlQueries(ZipExtract(dlc['trec-genomics-2004/queries'], 'Official.xml'), qtype=TrecGenomicsQuery, qtype_map=TREC04_XML_MAP, namespace='trec-genomics'),
        TrecQrels(dlc['trec-genomics-2004/qrels'], QREL_DEFS),
        documentation('trec-genomics-2004'),
    )
    subsets['trec-genomics-2005'] = Dataset(
        collection,
        TrecGenomicsQueries(dlc['trec-genomics-2005/queries']),
        TrecQrels(dlc['trec-genomics-2005/qrels'], QREL_DEFS),
        documentation('trec-genomics-2005'),
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    """Build and register the wikIR datasets (one corpus per language/size variant)."""
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    subsets = {}
    # (subset id, directory name inside the source zip)
    sources = [
        ('en1k', 'wikIR1k'),
        ('en59k', 'wikIR59k'),
        ('en78k', 'enwikIR'),
        ('ens78k', 'enwikIRS'),
        ('fr14k', 'FRwikIR14k'),
        ('es13k', 'ESwikIR13k'),
        ('it16k', 'ITwikIR16k'),
    ]
    for source, zip_dir_name in sources:
        source_dlc = ZipExtractCache(dlc[source], base_path / source)
        docs = CsvDocs(
            RelativePath(source_dlc, f"{zip_dir_name}/documents.csv"),
            namespace=source,
            lang=source[:2],  # first two chars of the subset id are the language code
            count_hint=ir_datasets.util.count_hint(f'{NAME}/{source}'),
            docstore_path=ir_datasets.util.home_path() / NAME / f'{source}.pklz4')
        subsets[source] = Dataset(docs, documentation(source))
        for split in ['training', 'validation', 'test']:
            subsets[f'{source}/{split}'] = Dataset(
                docs,
                CsvQueries(RelativePath(source_dlc, f"{zip_dir_name}/{split}/queries.csv"), lang=source[:2]),
                TrecQrels(RelativePath(source_dlc, f"{zip_dir_name}/{split}/qrels"), qrels_defs=QRELS_DEFS),
                # BM25.res: run file shipped with the dataset, exposed as scoreddocs.
                TrecScoredDocs(
                    RelativePath(source_dlc, f"{zip_dir_name}/{split}/BM25.res")),
                documentation(f'{source}/{split}'))
    base = Dataset(documentation('_'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    """Build and register the NYT datasets (core17 + weak-supervision splits).

    Fix: register under the module's ``NAME`` constant instead of the
    hard-coded string ``'nyt'``, for consistency with the other dataset
    modules (``NAME`` is already used for the docs yaml and download context).
    """
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    # One-time migration for caches written by older package versions.
    migrator = Migrator(base_path/'irds_version.txt', 'v2',
        affected_files=[base_path/'nyt.tgz.pklz4'],
        message='Migrating nyt (extracting body text)')
    collection = migrator(NytDocs(dlc['source']))
    base = Dataset(collection, documentation('_'))
    # core17
    subsets['trec-core-2017'] = Dataset(
        TrecQueries(dlc['trec-core-2017/queries'], namespace='trec-core-2017', lang='en'),
        TrecQrels(dlc['trec-core-2017/qrels'], CORE_QREL_DEFS),
        collection,
        documentation('trec-core-2017'))
    # wksup: queries/qrels derived from the collection itself; VALID_IDS
    # selects the validation portion.
    all_queries = NytQueries(collection)
    all_qrels = NytQrels(collection)
    match_qids = Lazy(lambda: VALID_IDS)
    subsets['wksup'] = Dataset(
        all_queries,
        all_qrels,
        collection,
        # NOTE(review): doc key 'wksup/train' for the 'wksup' subset looks like
        # a copy-paste slip ('wksup' would be expected) — left as-is, confirm.
        documentation('wksup/train'))
    subsets['wksup/train'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='exclude'),
        FilteredQrels(all_qrels, match_qids, mode='exclude'),
        collection,
        documentation('wksup/train'))
    subsets['wksup/valid'] = Dataset(
        FilteredQueries(all_queries, match_qids, mode='include'),
        FilteredQrels(all_qrels, match_qids, mode='include'),
        collection,
        documentation('wksup/valid'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    """Construct and register the CORD-19 dataset and its TREC-COVID subset."""
    home = ir_datasets.util.home_path()/NAME
    download_config = DownloadConfig.context(NAME, home)
    docs_yaml = YamlDocumentation(f'docs/{NAME}.yaml')
    corpus = Cord19Docs(download_config['docs/2020-07-16'], home/'2020-07-16', '2020-07-16')
    base = Dataset(corpus, docs_yaml('_'))
    # Map TREC-COVID topic XML tags onto the query fields.
    topic_field_map = {'query': 'title', 'question': 'description', 'narrative': 'narrative'}
    subsets = {
        'trec-covid': Dataset(
            TrecXmlQueries(download_config['trec-covid/queries'], qtype_map=topic_field_map, namespace=NAME),
            TrecQrels(download_config['trec-covid/qrels'], QRELS_DEFS),
            corpus,
            docs_yaml('trec-covid')),
    }
    ir_datasets.registry.register(NAME, base)
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', subsets[subset_id])
    return base, subsets
def _init():
    """Build and register the ClueWeb09 datasets (per-language slices + TREC tracks)."""
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    docs_dlc = dlc['docs']
    chk_dlc = TarExtractAll(dlc['docs.chk'], base_path / 'corpus.chk')
    # Full corpus plus per-language slices, each selected by source directory list.
    collection = ClueWeb09Docs(docs_dlc, chk_dlc, lang=None)  # multiple langs
    collection_ar = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Arabic_1'], lang='ar')
    collection_zh = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Chinese_1', 'ClueWeb09_Chinese_2', 'ClueWeb09_Chinese_3', 'ClueWeb09_Chinese_4'], lang='zh')
    collection_en = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_English_1', 'ClueWeb09_English_2', 'ClueWeb09_English_3', 'ClueWeb09_English_4', 'ClueWeb09_English_5', 'ClueWeb09_English_6', 'ClueWeb09_English_7', 'ClueWeb09_English_8', 'ClueWeb09_English_9', 'ClueWeb09_English_10'], lang='en')
    collection_fr = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_French_1'], lang='fr')
    collection_de = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_German_1'], lang='de')
    collection_it = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Italian_1'], lang='it')
    collection_ja = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Japanese_1', 'ClueWeb09_Japanese_2'], lang='ja')
    collection_ko = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Korean_1'], lang='ko')
    collection_pt = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Portuguese_1'], lang='pt')
    collection_es = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_Spanish_1', 'ClueWeb09_Spanish_2'], lang='es')
    # catb: first English segment only (presumably the "Category B" subset — confirm).
    collection_catb = ClueWeb09Docs(docs_dlc, chk_dlc, dirs=['ClueWeb09_English_1'], lang='en')
    base = Dataset(collection, documentation('_'))
    subsets['ar'] = Dataset(collection_ar, documentation('ar'))
    subsets['zh'] = Dataset(collection_zh, documentation('zh'))
    subsets['en'] = Dataset(collection_en, documentation('en'))
    subsets['fr'] = Dataset(collection_fr, documentation('fr'))
    subsets['de'] = Dataset(collection_de, documentation('de'))
    subsets['it'] = Dataset(collection_it, documentation('it'))
    subsets['ja'] = Dataset(collection_ja, documentation('ja'))
    subsets['ko'] = Dataset(collection_ko, documentation('ko'))
    subsets['pt'] = Dataset(collection_pt, documentation('pt'))
    subsets['es'] = Dataset(collection_es, documentation('es'))
    subsets['catb'] = Dataset(collection_catb, documentation('catb'))
    # 2009 judgments are prels (TrecPrels); later years use standard qrels.
    subsets['en/trec-web-2009'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2009/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        TrecPrels(GzipExtract(dlc['trec-web-2009/qrels.adhoc']), QREL_DEFS_09),
        documentation('trec-web-2009'))
    subsets['en/trec-web-2010'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2010/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-web-2010/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2010'))
    subsets['en/trec-web-2011'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2011/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-web-2011/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2011'))
    subsets['en/trec-web-2012'] = Dataset(
        collection_en,
        TrecXmlQueries(dlc['trec-web-2012/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-web-2012/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2012'))
    # catb variants reuse the same topics; CatBQrelFilter presumably restricts
    # judgments to documents in the catb slice — confirm its semantics.
    subsets['catb/trec-web-2009'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2009/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        CatBQrelFilter(TrecPrels(GzipExtract(dlc['trec-web-2009/qrels.adhoc']), QREL_DEFS_09)),
        documentation('trec-web-2009'))
    subsets['catb/trec-web-2010'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2010/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        CatBQrelFilter(TrecQrels(dlc['trec-web-2010/qrels.adhoc'], QREL_DEFS)),
        documentation('trec-web-2010'))
    subsets['catb/trec-web-2011'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2011/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        CatBQrelFilter(TrecQrels(dlc['trec-web-2011/qrels.adhoc'], QREL_DEFS)),
        documentation('trec-web-2011'))
    subsets['catb/trec-web-2012'] = Dataset(
        collection_catb,
        TrecXmlQueries(dlc['trec-web-2012/queries'], qtype=TrecWebTrackQuery, namespace=NAME, lang='en'),
        CatBQrelFilter(TrecQrels(dlc['trec-web-2012/qrels.adhoc'], QREL_DEFS)),
        documentation('trec-web-2012'))
    subsets['trec-mq-2009'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-mq-2009/queries']), encoding='latin1', lang='en'),
        TrecPrels(GzipExtract(dlc['trec-mq-2009/qrels']), QREL_DEFS_09),
        documentation('trec-mq-2009'))
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
# What to the relevance levels in qrels mean? QREL_DEFS = { 1: 'relevant', 0: 'not relevant', } # Specify where to find the content. Here it's just from the repository, but it could be anywhere. DL_DOCS = ir_datasets.util.RequestsDownload( 'https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/docs.tsv' ) DL_QUERIES = ir_datasets.util.RequestsDownload( 'https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/queries.tsv' ) DL_QRELS = ir_datasets.util.RequestsDownload( 'https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/qrels' ) # where the content is cached base_path = ir_datasets.util.home_path() / NAME # Dataset definition: it provides docs, queries, and qrels dataset = ir_datasets.Dataset( TsvDocs(ir_datasets.util.Cache(DL_DOCS, base_path / 'docs.tsv')), TsvQueries(ir_datasets.util.Cache(DL_QUERIES, base_path / 'queries.tsv')), TrecQrels(ir_datasets.util.Cache(DL_QRELS, base_path / 'qrels'), QREL_DEFS), ) # Register the dataset with ir_datasets ir_datasets.registry.register(NAME, dataset)
def _init():
    """Build and register the NFCorpus dataset and its train/dev/test subsets.

    Returns (collection, subsets). Every dataset is also registered with the
    global ir_datasets registry under NAME and f'{NAME}/{subset}'.
    """
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    # A single tar archive ('main') provides the corpus, all query variants,
    # all qrels, and the id-list files used for filtering below.
    main_dlc = dlc['main']
    collection = TsvDocs(Cache(
        TarExtract(main_dlc, 'nfcorpus/raw/doc_dump.txt'),
        base_path / 'collection.tsv'), doc_cls=NfCorpusDoc, namespace=NAME)
    subsets = {}

    def read_lines(file):
        # Extract an id-list file from the archive (cached locally) and return
        # the set of its lines with trailing whitespace stripped.
        # NOTE: `file` is rebound from the file name to the Cache handle.
        file = Cache(TarExtract(main_dlc, f'nfcorpus/raw/{file}'), base_path / file)
        with file.stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}

    # Lazy: the archive is only downloaded/extracted if a filter is used.
    nontopic_qid_filter = Lazy(lambda: read_lines('nontopics.ids'))
    video_qid_filter = Lazy(lambda: read_lines('all_videos.ids'))

    subsets['train'] = Dataset(
        collection,
        # Merges the title-only and "all" query files into NfCorpusQuery
        # records; (i, j) pairs presumably map (source file, field) — confirm
        # against ZipQueries.
        ZipQueries([
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/train.titles.queries'),
                base_path / 'train/queries.titles.tsv'), namespace=NAME),
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/train.all.queries'),
                base_path / 'train/queries.all.tsv'), namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(
            Cache(TarExtract(main_dlc, 'nfcorpus/train.3-2-1.qrel'),
                  base_path / 'train/qrels'), QRELS_DEFS),
        documentation('train'),
    )
    subsets['train/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(main_dlc, 'nfcorpus/train.nontopic-titles.queries'),
            base_path / 'train/nontopic/queries.tsv'), namespace=NAME),
        # Reuses the train qrels, restricted to the non-topic query ids.
        FilteredQrels(subsets['train'].qrels_handler(), nontopic_qid_filter,
                      mode='include'),
        documentation('train/nontopic'),
    )
    subsets['train/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/train.vid-titles.queries'),
                base_path / 'train/video/queries.titles.tsv'), namespace=NAME),
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/train.vid-desc.queries'),
                base_path / 'train/video/queries.desc.tsv'), namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        # NOTE(review): this second queries handler reads the *nontopic*-titles
        # file yet caches it as video/queries.tsv, and appears to shadow the
        # ZipQueries handler above — confirm which handler Dataset keeps.
        TsvQueries(Cache(
            TarExtract(main_dlc, 'nfcorpus/train.nontopic-titles.queries'),
            base_path / 'train/video/queries.tsv'),
            NfCorpusVideoQuery, namespace=NAME),
        FilteredQrels(subsets['train'].qrels_handler(), video_qid_filter,
                      mode='include'),
        documentation('train/video'),
    )
    # dev mirrors the structure of train.
    subsets['dev'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/dev.titles.queries'),
                base_path / 'dev/queries.titles.tsv'), namespace=NAME),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/dev.all.queries'),
                             base_path / 'dev/queries.all.tsv'), namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(
            Cache(TarExtract(main_dlc, 'nfcorpus/dev.3-2-1.qrel'),
                  base_path / 'dev/qrels'), QRELS_DEFS),
        documentation('dev'),
    )
    subsets['dev/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(main_dlc, 'nfcorpus/dev.nontopic-titles.queries'),
            base_path / 'dev/nontopic/queries.tsv'), namespace=NAME),
        FilteredQrels(subsets['dev'].qrels_handler(), nontopic_qid_filter,
                      mode='include'),
        documentation('dev/nontopic'),
    )
    subsets['dev/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/dev.vid-titles.queries'),
                base_path / 'dev/video/queries.titles.tsv'), namespace=NAME),
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/dev.vid-desc.queries'),
                base_path / 'dev/video/queries.desc.tsv'), namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        # NOTE(review): same apparent shadowing as train/video — confirm.
        TsvQueries(Cache(
            TarExtract(main_dlc, 'nfcorpus/dev.nontopic-titles.queries'),
            base_path / 'dev/video/queries.tsv'),
            NfCorpusVideoQuery, namespace=NAME),
        FilteredQrels(subsets['dev'].qrels_handler(), video_qid_filter,
                      mode='include'),
        documentation('dev/video'),
    )
    # test mirrors the structure of train/dev.
    subsets['test'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/test.titles.queries'),
                base_path / 'test/queries.titles.tsv'), namespace=NAME),
            TsvQueries(Cache(TarExtract(main_dlc, 'nfcorpus/test.all.queries'),
                             base_path / 'test/queries.all.tsv'), namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
        TrecQrels(
            Cache(TarExtract(main_dlc, 'nfcorpus/test.3-2-1.qrel'),
                  base_path / 'test/qrels'), QRELS_DEFS),
        documentation('test'),
    )
    subsets['test/nontopic'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(main_dlc, 'nfcorpus/test.nontopic-titles.queries'),
            base_path / 'test/nontopic/queries.tsv'), namespace=NAME),
        FilteredQrels(subsets['test'].qrels_handler(), nontopic_qid_filter,
                      mode='include'),
        documentation('test/nontopic'),
    )
    subsets['test/video'] = Dataset(
        collection,
        ZipQueries([
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/test.vid-titles.queries'),
                base_path / 'test/video/queries.titles.tsv'), namespace=NAME),
            TsvQueries(Cache(
                TarExtract(main_dlc, 'nfcorpus/test.vid-desc.queries'),
                base_path / 'test/video/queries.desc.tsv'), namespace=NAME),
        ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
        # NOTE(review): same apparent shadowing as train/video — confirm.
        TsvQueries(Cache(
            TarExtract(main_dlc, 'nfcorpus/test.nontopic-titles.queries'),
            base_path / 'test/video/queries.tsv'),
            NfCorpusVideoQuery, namespace=NAME),
        FilteredQrels(subsets['test'].qrels_handler(), video_qid_filter,
                      mode='include'),
        documentation('test/video'),
    )
    # Register the base dataset (docs only) and each subset.
    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return collection, subsets
def _init():
    """Build and register the msmarco-document collection and its subsets.

    Returns (collection, subsets); each dataset is also registered with the
    global ir_datasets registry.
    """
    base_path = ir_datasets.util.home_path() / 'msmarco-document'
    documentation = YamlDocumentation('docs/msmarco-document.yaml')
    dlc = DownloadConfig.context('msmarco-document', base_path, dua=DUA)
    subsets = {}

    def _gz(key):
        # Most source files for this dataset ship gzip-compressed.
        return GzipExtract(dlc[key])

    collection = MsMarcoTrecDocs(_gz('docs'))

    subsets['train'] = Dataset(
        collection,
        TsvQueries(_gz('train/queries'), namespace='msmarco'),
        TrecQrels(_gz('train/qrels'), QRELS_DEFS),
        TrecScoredDocs(_gz('train/scoreddocs')),
    )
    subsets['dev'] = Dataset(
        collection,
        TsvQueries(_gz('dev/queries'), namespace='msmarco'),
        TrecQrels(_gz('dev/qrels'), QRELS_DEFS),
        TrecScoredDocs(_gz('dev/scoreddocs')),
    )
    # eval has no public qrels.
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(_gz('eval/queries'), namespace='msmarco'),
        TrecScoredDocs(_gz('eval/scoreddocs')),
    )
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TsvQueries(_gz('trec-dl-2019/queries'), namespace='msmarco'),
        # The 2019 qrels file is distributed uncompressed — no gzip wrapper.
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(_gz('trec-dl-2019/scoreddocs')),
    )
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(_gz('trec-dl-2020/queries'), namespace='msmarco'),
        TrecScoredDocs(_gz('trec-dl-2020/scoreddocs')),
    )
    subsets['orcas'] = Dataset(
        collection,
        TsvQueries(_gz('orcas/queries'), namespace='orcas'),
        TrecQrels(_gz('orcas/qrels'), ORCAS_QLRES_DEFS),
        TrecScoredDocs(_gz('orcas/scoreddocs')),
    )
    # Restrict trec-dl-2019 to the queries that received relevance judgments.
    dl19_judged = Lazy(
        lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(),
                           dl19_judged),
        subsets['trec-dl-2019'],
    )
    ir_datasets.registry.register('msmarco-document',
                                  Dataset(collection, documentation("_")))
    for subset_name in sorted(subsets):
        ir_datasets.registry.register(
            f'msmarco-document/{subset_name}',
            Dataset(subsets[subset_name], documentation(subset_name)))
    return collection, subsets
def _init():
    """Build and register the GOV2 collection and its TREC subsets.

    Covers the TREC Terabyte tracks (2004-2006, plus named-page and
    efficiency variants) and the Million Query tracks (2007-2008).
    Returns (base, subsets); all datasets are registered globally.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    docs_dlc = dlc['docs']
    # Cached per-file document counts for the corpus.
    doccount_dlc = Gov2DocCountFile(os.path.join(base_path, 'corpus.doccounts'),
                                    docs_dlc)
    collection = Gov2Docs(docs_dlc, doccount_dlc)
    base = Dataset(collection, documentation('_'))
    subsets['trec-tb-2004'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2004/queries'], namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2004/qrels'], QREL_DEFS),
        documentation('trec-tb-2004')
    )
    subsets['trec-tb-2005'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2005/queries'], namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2005/qrels'], QREL_DEFS),
        documentation('trec-tb-2005')
    )
    subsets['trec-tb-2005/named-page'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2005/named-page/queries'], qtype=GenericQuery,
                    qtype_map=NAMED_PAGE_QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2005/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-tb-2005/named-page')
    )
    subsets['trec-tb-2005/efficiency'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-tb-2005/efficiency/queries']),
                         encoding='latin1', namespace=NAME, lang='en'),
        # Efficiency-track query ids are remapped onto the adhoc qrels.
        RewriteQids(TrecQrels(dlc['trec-tb-2005/qrels'], QREL_DEFS), EFF_MAP_05),
        documentation('trec-tb-2005/efficiency')
    )
    subsets['trec-tb-2006'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2006/queries'], namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS),
        documentation('trec-tb-2006')
    )
    subsets['trec-tb-2006/named-page'] = Dataset(
        collection,
        TrecQueries(dlc['trec-tb-2006/named-page/queries'], qtype=GenericQuery,
                    qtype_map=NAMED_PAGE_QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(dlc['trec-tb-2006/named-page/qrels'], NAMED_PAGE_QREL_DEFS),
        documentation('trec-tb-2006/named-page')
    )
    subsets['trec-tb-2006/efficiency'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'],
                                    '06.efficiency_topics.all'),
                         encoding='latin1', namespace=NAME, lang='en'),
        RewriteQids(TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS), EFF_MAP_06),
        documentation('trec-tb-2006/efficiency')
    )
    subsets['trec-tb-2006/efficiency/10k'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'],
                                    '06.efficiency_topics.10k'),
                         encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/10k')
    )
    subsets['trec-tb-2006/efficiency/stream1'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'],
                                    '06.efficiency_topics.stream-1'),
                         encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/stream1')
    )
    subsets['trec-tb-2006/efficiency/stream2'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'],
                                    '06.efficiency_topics.stream-2'),
                         encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/stream2')
    )
    # Only stream3 has associated judgments (via the qid remapping).
    subsets['trec-tb-2006/efficiency/stream3'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'],
                                    '06.efficiency_topics.stream-3'),
                         encoding='latin1', namespace=NAME, lang='en'),
        RewriteQids(TrecQrels(dlc['trec-tb-2006/qrels'], QREL_DEFS), EFF_MAP_06),
        documentation('trec-tb-2006/efficiency/stream3')
    )
    subsets['trec-tb-2006/efficiency/stream4'] = Dataset(
        collection,
        TrecColonQueries(TarExtract(dlc['trec-tb-2006/efficiency/queries'],
                                    '06.efficiency_topics.stream-4'),
                         encoding='latin1', namespace=NAME, lang='en'),
        documentation('trec-tb-2006/efficiency/stream4')
    )
    subsets['trec-mq-2007'] = Dataset(
        collection,
        # FIX: namespace/lang were previously omitted here, inconsistent with
        # the parallel trec-mq-2008 subset below.
        TrecColonQueries(GzipExtract(dlc['trec-mq-2007/queries']),
                         encoding='latin1', namespace='trec-mq', lang='en'),
        TrecPrels(dlc['trec-mq-2007/qrels'], QREL_DEFS),
        documentation('trec-mq-2007')
    )
    subsets['trec-mq-2008'] = Dataset(
        collection,
        TrecColonQueries(GzipExtract(dlc['trec-mq-2008/queries']),
                         encoding='latin1', namespace='trec-mq', lang='en'),
        TrecPrels(TarExtract(dlc['trec-mq-2008/qrels'], '2008.RC1/prels'),
                  QREL_DEFS),
        documentation('trec-mq-2008')
    )
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    """Build and register the ClueWeb12 collection, its B13 subset, and the
    benchmark subsets (TREC Web, NTCIR WWW, TREC Misinfo, CLEF eHealth).

    Returns (base, subsets); all datasets are registered globally.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}
    docs_dlc = dlc['docs']
    # Checksum files accompanying the corpus archives.
    docs_chk_dlc = TarExtractAll(dlc['docs.chk'], base_path / 'corpus.chk')
    # Tool used to carve the B13 subset out of the full corpus.
    b13_dlc = Bz2Extract(
        Cache(
            TarExtract(
                dlc['cw12b-info'],
                'ClueWeb12-CreateB13/software/CreateClueWeb12B13Dataset.jar'),
            base_path / 'CreateClueWeb12B13Dataset.jar'))
    collection = ClueWeb12Docs(docs_dlc, docs_chk_dlc)
    collection_b13 = ClueWeb12Docs(ClueWeb12b13Extractor(docs_dlc, b13_dlc))
    base = Dataset(collection, documentation('_'))
    subsets['b13'] = Dataset(collection_b13, documentation('b13'))
    subsets['trec-web-2013'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2013/queries'], qtype=TrecWebTrackQuery,
                       namespace='trec-web', lang='en'),
        TrecQrels(dlc['trec-web-2013/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2013'))
    subsets['trec-web-2014'] = Dataset(
        collection,
        TrecXmlQueries(dlc['trec-web-2014/queries'], qtype=TrecWebTrackQuery,
                       namespace='trec-web', lang='en'),
        TrecQrels(dlc['trec-web-2014/qrels.adhoc'], QREL_DEFS),
        documentation('trec-web-2014'))
    # NTCIR WWW tracks run against the B13 subset only.
    subsets['b13/ntcir-www-1'] = Dataset(
        collection_b13,
        TrecXmlQueries(Cache(
            ZipExtract(dlc['ntcir-www-1/queries'], 'eng.queries.xml'),
            base_path / 'ntcir-www-1' / 'queries.xml'),
            qtype=GenericQuery,
            qtype_map={ 'qid': 'query_id', 'content': 'text' },
            namespace='ntcir-www', lang='en'),
        NtcirQrels(dlc['ntcir-www-1/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-1'))
    subsets['b13/ntcir-www-2'] = Dataset(
        collection_b13,
        TrecXmlQueries(Cache(
            ZipExtract(dlc['ntcir-www-2/queries'], 'qEng.xml'),
            base_path / 'ntcir-www-2' / 'queries.xml'),
            qtype=NtcirQuery, qtype_map=ntcir_map,
            namespace='ntcir-www', lang='en'),
        NtcirQrels(dlc['ntcir-www-2/qrels'], NTCIR_QREL_DEFS),
        documentation('ntcir-www-2'))
    # ntcir-www-3 provides queries only (no qrels handler here).
    subsets['b13/ntcir-www-3'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['ntcir-www-3/queries'], qtype=NtcirQuery,
                       qtype_map=ntcir_map,
                       namespace='ntcir-www', lang='en'),
        documentation('ntcir-www-3'))
    subsets['b13/trec-misinfo-2019'] = Dataset(
        collection_b13,
        TrecXmlQueries(dlc['trec-misinfo-2019/queries'], qtype=MisinfoQuery,
                       qtype_map=misinfo_map, namespace='trec-misinfo-2019',
                       lang='en'),
        # NOTE(review): 'MsinfoQrels' (vs. MisinfoQuery) looks like a possible
        # misspelling — confirm the class name as declared elsewhere in file.
        MsinfoQrels(dlc['trec-misinfo-2019/qrels'], MISINFO_QREL_DEFS),
        documentation('trec-misinfo-2019'))
    # CLEF eHealth: qrels combine the 2016+2017 relevance, trust, and
    # understandability/readability assessment files.
    subsets['b13/clef-ehealth'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries']), qtype=GenericQuery,
                       qtype_map=ehealth_map, namespace='clef-ehealth',
                       lang='en'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS),
        documentation('clef-ehealth'))
    # Translated-query variants; query ids carry a language suffix.
    subsets['b13/clef-ehealth/cs'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/cs']),
                       qtype=GenericQuery, qtype_map=ehealth_map,
                       namespace='clef-ehealth', lang='cs'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS, query_id_suffix='-cs'),
        documentation('clef-ehealth/cs'))
    subsets['b13/clef-ehealth/de'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/de']),
                       qtype=GenericQuery, qtype_map=ehealth_map,
                       namespace='clef-ehealth', lang='de'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS, query_id_suffix='-de'),
        documentation('clef-ehealth/de'))
    subsets['b13/clef-ehealth/fr'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/fr']),
                       qtype=GenericQuery, qtype_map=ehealth_map,
                       namespace='clef-ehealth', lang='fr'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS, query_id_suffix='-fr'),
        documentation('clef-ehealth/fr'))
    subsets['b13/clef-ehealth/hu'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/hu']),
                       qtype=GenericQuery, qtype_map=ehealth_map,
                       namespace='clef-ehealth', lang='hu'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS, query_id_suffix='-hu'),
        documentation('clef-ehealth/hu'))
    subsets['b13/clef-ehealth/pl'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/pl']),
                       qtype=GenericQuery, qtype_map=ehealth_map,
                       namespace='clef-ehealth', lang='pl'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS, query_id_suffix='-pl'),
        documentation('clef-ehealth/pl'))
    subsets['b13/clef-ehealth/sv'] = Dataset(
        collection_b13,
        TrecXmlQueries(FixAmp(dlc['clef-ehealth/queries/sv']),
                       qtype=GenericQuery, qtype_map=ehealth_map,
                       namespace='clef-ehealth', lang='sv'),
        EhealthQrels(
            [dlc['clef-ehealth/2016.qrels'], dlc['clef-ehealth/2017.qrels']],
            [dlc['clef-ehealth/2016.qtrust'], dlc['clef-ehealth/2017.qtrust']],
            [dlc['clef-ehealth/2016.qunder'], dlc['clef-ehealth/2017.qreads']],
            EHEALTH_QREL_DEFS, query_id_suffix='-sv'),
        documentation('clef-ehealth/sv'))
    # NOTE: the following datasets are defined in touche.py:
    #  - clueweb12/touche-2020-task-2
    #  - clueweb12/touche-2021-task-2
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    """Build and register the PMC article collections (v1/v2) and the
    TREC CDS 2014-2016 subsets.

    Returns (base, subsets); all datasets are registered globally.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    # Each corpus version ships as four source archives.
    v1_collection = PmcDocs(
        [dlc[f'v1/source{i}'] for i in range(4)],
        base_path / 'v1' / 'corpus',
        duplicate_dlcs=[dlc['v1/dup1'], dlc['v1/dup2']],
        count_hint=ir_datasets.util.count_hint(f'{NAME}/v1'))
    v2_collection = PmcDocs(
        [dlc[f'v2/source{i}'] for i in range(4)],
        base_path / 'v2' / 'corpus',
        count_hint=ir_datasets.util.count_hint(f'{NAME}/v2'))

    base = Dataset(documentation('_'))
    subsets['v1'] = Dataset(v1_collection, documentation('v1'))
    subsets['v2'] = Dataset(v2_collection, documentation('v2'))

    # CDS 2014/2015 ran on v1 of the corpus; 2016 on v2 (with its own
    # query type).
    cds_tracks = (
        ('v1', '2014', v1_collection, TrecCdsQuery),
        ('v1', '2015', v1_collection, TrecCdsQuery),
        ('v2', '2016', v2_collection, TrecCds2016Query),
    )
    for version, year, coll, qtype in cds_tracks:
        track = f'trec-cds-{year}'
        subsets[f'{version}/{track}'] = Dataset(
            coll,
            TrecXmlQueries(dlc[f'{track}/queries'], qtype, QUERY_FILE_MAP,
                           namespace=track, lang='en'),
            TrecQrels(dlc[f'{track}/qrels'], QREL_DEFS),
            documentation(f'{version}/{track}'),
        )

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
    return base, subsets
def _init():
    """Build and register the msmarco-passage collection and its many subsets.

    Includes the train/dev/eval splits (plus small/judged/medical/split200
    variants), the TREC DL 2019/2020 tracks, and TREC DL-Hard with its folds.
    Returns (collection, subsets); all datasets are registered globally.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    # One-time migration of previously-cached files to the fixed encoding.
    migrator = Migrator(base_path / 'irds_version.txt', 'v2',
        affected_files=[
            base_path / 'collection.tsv',
            base_path / 'collection.tsv.pklz4'
        ],
        message=f'Migrating {NAME} (fixing passage encoding)')
    collection = TsvDocs(Cache(
        FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')),
        base_path / 'collection.tsv'),
        namespace='msmarco', lang='en',
        docstore_size_hint=14373971970,
        count_hint=ir_datasets.util.count_hint(NAME))
    collection = migrator(collection)
    subsets = {}
    subsets['train'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.train.tsv'),
                         base_path / 'train/queries.tsv'),
                   namespace='msmarco', lang='en'),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt')),
                base_path / 'train/ms.run')),
    )
    # Same as train but with the v2 docpairs file.
    subsets['train/triples-v2'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(GzipExtract(dlc['train/docpairs/v2'])),
        subsets['train'].scoreddocs_handler(),
    )
    # The "small" triples ship as raw text; map them back to qids/pids.
    subsets['train/triples-small'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(
            Cache(
                MapSmallTriplesQidPid(
                    TarExtract(dlc['train/docpairs/small'],
                               'triples.train.small.tsv'),
                    TarExtract(dlc['collectionandqueries'], 'collection.tsv'),
                    subsets['train'].queries_handler()),
                base_path / 'train/small.triples.qidpid.tsv')),
        subsets['train'].scoreddocs_handler(),
    )
    subsets['dev'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'),
                         base_path / 'dev/queries.tsv'),
                   namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
    )
    subsets['dev/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'),
            base_path / 'dev/small/queries.tsv'),
            namespace='msmarco', lang='en'),
        TrecQrels(
            Cache(
                TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'),
                base_path / 'dev/small/qrels'), QRELS_DEFS),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['dev/scoreddocs'], 'top1000.dev')),
                base_path / 'dev/ms.run')),
    )
    # eval has no public qrels.
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'),
                         base_path / 'eval/queries.tsv'),
                   namespace='msmarco', lang='en'),
    )
    subsets['eval/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'),
            base_path / 'eval/small/queries.tsv'),
            namespace='msmarco', lang='en'),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['eval/scoreddocs'], 'top1000.eval')),
                base_path / 'eval/ms.run')),
    )
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TsvQueries(Cache(GzipExtract(dlc['trec-dl-2019/queries']),
                         base_path / 'trec-dl-2019/queries.tsv'),
                   namespace='msmarco', lang='en'),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
                  base_path / 'trec-dl-2019/ms.run')),
    )
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']),
                   namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
                  base_path / 'trec-dl-2020/ms.run')),
    )
    # A few subsets that are constrained to just the queries/qrels/docpairs
    # that have at least 1 relevance assessment
    train_judged = Lazy(
        lambda: {q.query_id for q in subsets['train'].qrels_iter()})
    subsets['train/judged'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_judged),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_judged),
        subsets['train'],
    )
    dev_judged = Lazy(
        lambda: {q.query_id for q in subsets['dev'].qrels_iter()})
    subsets['dev/judged'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
        subsets['dev'],
    )
    dl19_judged = Lazy(
        lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(),
                           dl19_judged),
        subsets['trec-dl-2019'],
    )
    dl20_judged = Lazy(
        lambda: {q.query_id for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(), dl20_judged),
        FilteredScoredDocs(subsets['trec-dl-2020'].scoreddocs_handler(),
                           dl20_judged),
        subsets['trec-dl-2020'],
    )
    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), split200,
                        mode='exclude'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), split200,
                           mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(), split200,
                      mode='exclude'),
        FilteredDocPairs(subsets['train'].docpairs_handler(), split200,
                         mode='exclude'),
        subsets['train'],
    )
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), split200,
                        mode='include'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), split200,
                           mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(), split200,
                      mode='include'),
        FilteredDocPairs(subsets['train'].docpairs_handler(), split200,
                         mode='include'),
        subsets['train'],
    )
    # Medical subset
    def train_med():
        # Read the MedMARCO query-id list; one id per line.
        with dlc['medmarco_ids'].stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}
    # NOTE: rebinds the function name to its Lazy wrapper.
    train_med = Lazy(train_med)
    subsets['train/medical'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_med),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_med),
        FilteredDocPairs(subsets['train'].docpairs_handler(), train_med),
        FilteredQrels(subsets['train'].qrels_handler(), train_med),
        subsets['train'],
    )
    # DL-Hard
    dl_hard_qrels_migrator = Migrator(
        base_path / 'trec-dl-hard' / 'irds_version.txt', 'v3',
        affected_files=[base_path / 'trec-dl-hard' / 'qrels'],
        message='Updating trec-dl-hard qrels')
    hard_qids = Lazy(lambda: DL_HARD_QIDS)
    # DL-Hard draws its queries from both DL 2019 and DL 2020.
    dl_hard_base_queries = TsvQueries([
        Cache(GzipExtract(dlc['trec-dl-2019/queries']),
              base_path / 'trec-dl-2019/queries.tsv'),
        Cache(GzipExtract(dlc['trec-dl-2020/queries']),
              base_path / 'trec-dl-2020/queries.tsv')
    ], namespace='msmarco', lang='en')
    subsets['trec-dl-hard'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        dl_hard_qrels_migrator(
            TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard'))
    # NOTE(review): the fold datasets below pass the whole trec-dl-hard
    # Dataset to FilteredQrels, whereas every other call in this function
    # passes a .qrels_handler() — presumably Dataset forwards qrels access;
    # confirm against FilteredQrels.
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['1'])
    subsets['trec-dl-hard/fold1'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold1'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['2'])
    subsets['trec-dl-hard/fold2'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold2'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['3'])
    subsets['trec-dl-hard/fold3'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold3'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['4'])
    subsets['trec-dl-hard/fold4'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold4'))
    hard_qids = Lazy(lambda: DL_HARD_QIDS_BYFOLD['5'])
    subsets['trec-dl-hard/fold5'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        FilteredQrels(subsets['trec-dl-hard'], hard_qids),
        documentation('trec-dl-hard/fold5'))
    ir_datasets.registry.register(NAME,
                                  Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}',
                                      Dataset(subsets[s], documentation(s)))
    return collection, subsets
def _init():
    """Build, register, and return the TripClick dataset hierarchy.

    Constructs the shared document collection, the click-log subset, and the
    train/val/test (head/torso/tail) benchmark subsets, registers each under
    the ``ir_datasets`` registry as ``NAME`` / ``NAME/<subset>``, and returns
    ``(base, subsets)``.

    Fix vs. previous revision: ``subsets['test/head']`` was registered with
    ``documentation('val/head')`` (copy-paste error); it now uses
    ``documentation('test/head')`` consistent with every other subset.
    """
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    # Shared corpus: all subsets below re-use this single collection.
    collection = TrecDocs(
        dlc['benchmark'],
        parser='tut',
        path_globs=['**/docs_grp_*.txt'],
        namespace=NAME,
        lang='en',
        count_hint=ir_datasets.util.count_hint(NAME))

    # Extracted archives holding topics/qrels and the BM25 baseline runs.
    topics_and_qrels = TarExtractAll(
        dlc['benchmark'],
        base_path / "topics_and_qrels",
        path_globs=['**/topics.*.txt', '**/qrels.*.txt'])
    val_runs = TarExtractAll(
        dlc['dlfiles'],
        base_path / "val_runs",
        path_globs=['**/run.trip.BM25.*.val.txt'])
    test_runs = TarExtractAll(
        dlc['dlfiles_runs_test'],
        base_path / "test_runs",
        path_globs=['**/run.trip.BM25.*.test.txt'])

    base = Dataset(collection, documentation('_'))

    # Click-log subset: article metadata (repaired via FixAllarticles) plus
    # the raw JSON query logs.
    subsets['logs'] = Dataset(
        TsvDocs(
            Cache(
                FixAllarticles(TarExtract(dlc['logs'], 'logs/allarticles.txt')),
                base_path / 'allarticles-fixed.tsv'),
            doc_cls=TripClickPartialDoc,
            lang='en',
            count_hint=ir_datasets.util.count_hint(f'{NAME}/logs')),
        TripClickQlogs(
            TarExtractAll(dlc['logs'], base_path / 'logs', path_globs=['**/*.json'])),
        documentation('logs'))

    ### Train
    subsets['train/head'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.head.train.txt'), qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.head.train.txt'), QREL_DEFS),
        documentation('train/head'))
    # DCTR click-model qrels layered over the raw head/train subset.
    subsets['train/head/dctr'] = Dataset(
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.dctr.head.train.txt'), QREL_DCTR_DEFS),
        subsets['train/head'],
        documentation('train/head/dctr'))
    subsets['train/torso'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.torso.train.txt'), qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.torso.train.txt'), QREL_DEFS),
        documentation('train/torso'))
    subsets['train/tail'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.tail.train.txt'), qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.tail.train.txt'), QREL_DEFS),
        documentation('train/tail'))

    # Combined train split: concatenation of head/torso/tail queries & qrels,
    # plus generated doc pairs for training.
    train_queries = ConcatQueries([
        subsets['train/head'].queries_handler(),
        subsets['train/torso'].queries_handler(),
        subsets['train/tail'].queries_handler(),
    ])
    train_docpairs = DocPairGenerator(
        TarExtract(dlc['dlfiles'], 'dlfiles/triples.train.tsv'),
        collection,
        train_queries,
        base_path / 'train.docpairs')
    subsets['train'] = Dataset(
        collection,
        train_queries,
        ConcatQrels([
            subsets['train/head'].qrels_handler(),
            subsets['train/torso'].qrels_handler(),
            subsets['train/tail'].qrels_handler(),
        ]),
        TsvDocPairs(train_docpairs),
        documentation('train'))
    # Alternative training triples (Hofstätter et al.), same queries/qrels.
    subsets['train/hofstaetter-triples'] = Dataset(
        collection,
        train_queries,
        subsets['train'].qrels_handler(),
        TsvDocPairs(dlc['hofstaetter-triples']),
        documentation('train/hofstaetter-triples'))

    ### Val
    subsets['val/head'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.head.val.txt'), qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.head.val.txt'), QREL_DEFS),
        TrecScoredDocs(RelativePath(val_runs, 'dlfiles/run.trip.BM25.head.val.txt')),
        documentation('val/head'))
    subsets['val/head/dctr'] = Dataset(
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.dctr.head.val.txt'), QREL_DCTR_DEFS),
        subsets['val/head'],
        documentation('val/head/dctr'))
    subsets['val/torso'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.torso.val.txt'), qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.torso.val.txt'), QREL_DEFS),
        TrecScoredDocs(RelativePath(val_runs, 'dlfiles/run.trip.BM25.torso.val.txt')),
        documentation('val/torso'))
    subsets['val/tail'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.tail.val.txt'), qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecQrels(RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.raw.tail.val.txt'), QREL_DEFS),
        TrecScoredDocs(RelativePath(val_runs, 'dlfiles/run.trip.BM25.tail.val.txt')),
        documentation('val/tail'))
    subsets['val'] = Dataset(
        collection,
        ConcatQueries([
            subsets['val/head'].queries_handler(),
            subsets['val/torso'].queries_handler(),
            subsets['val/tail'].queries_handler(),
        ]),
        ConcatQrels([
            subsets['val/head'].qrels_handler(),
            subsets['val/torso'].qrels_handler(),
            subsets['val/tail'].qrels_handler(),
        ]),
        ConcatScoreddocs([
            subsets['val/head'].scoreddocs_handler(),
            subsets['val/torso'].scoreddocs_handler(),
            subsets['val/tail'].scoreddocs_handler(),
        ]),
        documentation('val'))

    ### Test (no qrels distributed; queries + BM25 runs only)
    subsets['test/head'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.head.test.txt'), qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecScoredDocs(RelativePath(test_runs, 'runs_test/run.trip.BM25.head.test.txt')),
        documentation('test/head'))  # fixed: previously documentation('val/head')
    subsets['test/torso'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.torso.test.txt'), qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecScoredDocs(RelativePath(test_runs, 'runs_test/run.trip.BM25.torso.test.txt')),
        documentation('test/torso'))
    subsets['test/tail'] = Dataset(
        collection,
        TrecQueries(RelativePath(topics_and_qrels, 'benchmark/topics/topics.tail.test.txt'), qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en'),
        TrecScoredDocs(RelativePath(test_runs, 'runs_test/run.trip.BM25.tail.test.txt')),
        documentation('test/tail'))
    subsets['test'] = Dataset(
        collection,
        ConcatQueries([
            subsets['test/head'].queries_handler(),
            subsets['test/torso'].queries_handler(),
            subsets['test/tail'].queries_handler(),
        ]),
        ConcatScoreddocs([
            subsets['test/head'].scoreddocs_handler(),
            subsets['test/torso'].scoreddocs_handler(),
            subsets['test/tail'].scoreddocs_handler(),
        ]),
        documentation('test'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets