def test_core(self):
    """Well-formed TSV rows parse into exactly one record per line, with
    field content (including surrounding whitespace) preserved verbatim
    across all three TSV adapters."""
    data_type = namedtuple('data_type', ['doc_id', 'field1', 'field2'])
    mock_file = StringFile('''
123\tsome field\tanother field
123\t repeated entry \tshouldn't filter
456\tanother query\tsomething
'''.lstrip())
    records = [
        data_type('123', 'some field', 'another field'),
        data_type('123', ' repeated entry ', 'shouldn\'t filter'),
        data_type('456', 'another query', 'something'),
    ]
    # The same mock file should be readable through the queries, docs,
    # and docpairs adapters alike.
    queries = TsvQueries(mock_file, data_type)
    self.assertEqual(queries.queries_path(), 'MOCK')
    self.assertEqual(list(queries.queries_iter()), records)
    docs = TsvDocs(mock_file, data_type)
    self.assertEqual(docs.docs_path(), 'MOCK')
    self.assertEqual(list(docs.docs_iter()), records)
    docpairs = TsvDocPairs(mock_file, data_type)
    self.assertEqual(docpairs.docpairs_path(), 'MOCK')
    self.assertEqual(list(docpairs.docpairs_iter()), records)
def test_flex_columns(self):
    """When the record type's final field is ``Tuple[str, ...]``, rows with
    fewer or more columns than the fixed fields are absorbed by that
    variable-length tail instead of raising."""
    class data_type(NamedTuple):
        doc_id: str
        field1: str
        field2: Tuple[str, ...]
    mock_file = StringFile('''
123\tsome field\tanother field
123\ttoo few fields
456\tanother query\tsomething
456\tanother query\tsomething\ttoo many fields\teven more
'''.strip())
    records = [
        data_type('123', 'some field', ('another field', )),
        data_type('123', 'too few fields', ()),  # short row -> empty tail
        data_type('456', 'another query', ('something', )),
        data_type('456', 'another query', ('something', 'too many fields', 'even more')),
    ]
    queries = TsvQueries(mock_file, data_type)
    self.assertEqual(queries.queries_path(), 'MOCK')
    self.assertEqual(list(queries.queries_iter()), records)
    docs = TsvDocs(mock_file, data_type)
    self.assertEqual(docs.docs_path(), 'MOCK')
    self.assertEqual(list(docs.docs_iter()), records)
    docpairs = TsvDocPairs(mock_file, data_type)
    self.assertEqual(docpairs.docpairs_path(), 'MOCK')
    self.assertEqual(list(docpairs.docpairs_iter()), records)
def _init():
    """Build and register the ANTIQUE dataset.

    Registers the base collection plus train/test subsets, a 200-query
    train/validation split, and a test variant with the published list of
    offensive/noisy questions removed.

    Returns:
        (collection, subsets) for use by the module.
    """
    documentation = YamlDocumentation('docs/antique.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
    collection = TsvDocs(dlc['docs'], namespace=NAME, lang='en', count_hint=ir_datasets.util.count_hint(NAME))
    subsets = {}
    for subset in ('train', 'test'):
        qrels = TrecQrels(dlc[f'{subset}/qrels'], QREL_DEFS)
        queries = TsvQueries(dlc[f'{subset}/queries'], namespace=NAME, lang='en')
        subsets[subset] = Dataset(collection, queries, qrels)

    # Split the training data into training and validation data
    validation_qids = Lazy(lambda: VALIDATION_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), validation_qids, mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(), validation_qids, mode='exclude'),
        subsets['train'])
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), validation_qids, mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(), validation_qids, mode='include'),
        subsets['train'])

    # Separate test set removing the "offensive (and noisy)" questions
    # (fixed local-name typo: was `disllow_qids`)
    disallow_list = dlc['disallow_list']
    def disallow_qids():
        with disallow_list.stream() as stream:
            stream = io.TextIOWrapper(stream)
            return {l.rstrip() for l in stream}
    disallow_qids = Lazy(disallow_qids)
    subsets['test/non-offensive'] = Dataset(
        FilteredQueries(subsets['test'].queries_handler(), disallow_qids, mode='exclude'),
        FilteredQrels(subsets['test'].qrels_handler(), disallow_qids, mode='exclude'),
        subsets['test'])

    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))

    return collection, subsets
def _docs_initializer(lang_code):
    """Return the (memoized) TSV docs handler for ``lang_code``,
    constructing it on first use."""
    try:
        return _docs_cache[lang_code]
    except KeyError:
        dlc = _dlc().context("clirmatrix_docs", base_path)
        handler = TsvDocs(
            GzipExtract(dlc[f'docs/{lang_code}']),
            namespace=f'{NAME}/{lang_code}',
            lang=lang_code)
        _docs_cache[lang_code] = handler
        return handler
def test_too_few_columns(self):
    """With a fixed-arity record type, a row carrying too few columns is an
    error: every adapter must raise RuntimeError during iteration."""
    data_type = namedtuple('data_type', ['doc_id', 'field1', 'field2'])
    mock_file = StringFile('''
123\tsome field\tanother field
123\ttoo few fields
456\tanother query\tsomething
'''.strip())
    # Each adapter is lazy: the error surfaces only when iterated.
    queries = TsvQueries(mock_file, data_type)
    with self.assertRaises(RuntimeError):
        list(queries.queries_iter())
    docs = TsvDocs(mock_file, data_type)
    with self.assertRaises(RuntimeError):
        list(docs.docs_iter())
    docpairs = TsvDocPairs(mock_file, data_type)
    with self.assertRaises(RuntimeError):
        list(docpairs.docpairs_iter())
def _init():
    """Assemble and register the DPR-W100 collection and its four
    query/qrels subsets (Natural Questions and TriviaQA, dev and train).

    Returns:
        (base, subsets) for use by the module.
    """
    base_path = ir_datasets.util.home_path() / NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = TsvDocs(
        GzipExtract(dlc['docs']),
        doc_cls=DprW100Doc,
        namespace=NAME,
        lang='en',
        skip_first_line=True,
        docstore_size_hint=12827215492,
        count_hint=ir_datasets.util.count_hint(NAME))
    base = Dataset(collection, documentation('_'))

    subsets = {}
    # (subset name, download key, extra DprW100Manager kwargs).
    # The TriviaQA files label the passage-id column 'psg_id'.
    specs = [
        ('natural-questions/dev', 'nq-dev', {}),
        ('natural-questions/train', 'nq-train', {}),
        ('trivia-qa/dev', 'tqa-dev', {'passage_id_key': 'psg_id'}),
        ('trivia-qa/train', 'tqa-train', {'passage_id_key': 'psg_id'}),
    ]
    for subset_name, key, mgr_kwargs in specs:
        manager = DprW100Manager(GzipExtract(dlc[key]), base_path / key, **mgr_kwargs)
        subsets[subset_name] = Dataset(
            collection,
            DprW100Queries(manager.file_ref('queries.tsv')),
            TrecQrels(manager.file_ref('qrels'), QREL_DEFS),
            documentation(subset_name))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
def _init():
    """Register the LoTTE corpora: a dev and test collection per domain,
    each with 'search' and 'forum' query/qrels variants.

    Returns:
        (base, subsets) for use by the module.
    """
    base_path = ir_datasets.util.home_path() / NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_dlc = TarExtractAll(dlc['source'], base_path / 'lotte_extracted')
    base = Dataset(documentation('_'))

    subsets = {}
    domains = ['lifestyle', 'recreation', 'science', 'technology', 'writing', 'pooled']
    for domain in domains:
        for split in ['dev', 'test']:
            corpus = TsvDocs(
                RelativePath(base_dlc, f'lotte/{domain}/{split}/collection.tsv'),
                lang='en')
            subsets[f'{domain}/{split}'] = Dataset(
                corpus, documentation(f'{domain}/{split}'))
            for qtype in ['search', 'forum']:
                subsets[f'{domain}/{split}/{qtype}'] = Dataset(
                    corpus,
                    TsvQueries(
                        RelativePath(base_dlc, f'lotte/{domain}/{split}/questions.{qtype}.tsv'),
                        lang='en'),
                    LotteQrels(
                        RelativePath(base_dlc, f'lotte/{domain}/{split}/qas.{qtype}.jsonl')),
                    documentation(f'{domain}/{split}/{qtype}'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
def _init():
    """Build and register the MS MARCO passage ranking dataset family.

    Registers the shared passage collection plus train/dev/eval splits,
    TREC DL 2019/2020, judged/split200/medical variants, and TREC DL-Hard.

    Returns:
        (collection, subsets) for use by the module.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)

    # v2 migration: the collection file is re-extracted with corrected
    # passage encoding, invalidating any previously cached copies.
    migrator = Migrator(
        base_path / 'irds_version.txt', 'v2',
        affected_files=[
            base_path / 'collection.tsv',
            base_path / 'collection.tsv.pklz4'
        ],
        message=f'Migrating {NAME} (fixing passage encoding)')
    collection = TsvDocs(
        Cache(
            FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')),
            base_path / 'collection.tsv'),
        namespace='msmarco',
        lang='en',
        docstore_size_hint=14373971970,
        count_hint=ir_datasets.util.count_hint(NAME))
    collection = migrator(collection)

    subsets = {}

    subsets['train'] = Dataset(
        collection,
        TsvQueries(
            Cache(TarExtract(dlc['queries'], 'queries.train.tsv'),
                  base_path / 'train/queries.tsv'),
            namespace='msmarco', lang='en'),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt')),
                base_path / 'train/ms.run')),
    )
    # Alternative training triples; queries/qrels/scoreddocs shared with 'train'.
    subsets['train/triples-v2'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(GzipExtract(dlc['train/docpairs/v2'])),
        subsets['train'].scoreddocs_handler(),
    )
    subsets['train/triples-small'] = Dataset(
        collection,
        subsets['train'].queries_handler(),
        subsets['train'].qrels_handler(),
        TsvDocPairs(
            Cache(
                MapSmallTriplesQidPid(
                    TarExtract(dlc['train/docpairs/small'], 'triples.train.small.tsv'),
                    TarExtract(dlc['collectionandqueries'], 'collection.tsv'),
                    subsets['train'].queries_handler()),
                base_path / 'train/small.triples.qidpid.tsv')),
        subsets['train'].scoreddocs_handler(),
    )
    subsets['dev'] = Dataset(
        collection,
        TsvQueries(
            Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'),
                  base_path / 'dev/queries.tsv'),
            namespace='msmarco', lang='en'),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
    )
    subsets['dev/small'] = Dataset(
        collection,
        TsvQueries(
            Cache(TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'),
                  base_path / 'dev/small/queries.tsv'),
            namespace='msmarco', lang='en'),
        TrecQrels(
            Cache(TarExtract(dlc['collectionandqueries'], 'qrels.dev.small.tsv'),
                  base_path / 'dev/small/qrels'),
            QRELS_DEFS),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['dev/scoreddocs'], 'top1000.dev')),
                base_path / 'dev/ms.run')),
    )
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(
            Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'),
                  base_path / 'eval/queries.tsv'),
            namespace='msmarco', lang='en'),
    )
    subsets['eval/small'] = Dataset(
        collection,
        TsvQueries(
            Cache(TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'),
                  base_path / 'eval/small/queries.tsv'),
            namespace='msmarco', lang='en'),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['eval/scoreddocs'], 'top1000.eval')),
                base_path / 'eval/ms.run')),
    )
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TsvQueries(
            Cache(GzipExtract(dlc['trec-dl-2019/queries']),
                  base_path / 'trec-dl-2019/queries.tsv'),
            namespace='msmarco', lang='en'),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
                  base_path / 'trec-dl-2019/ms.run')),
    )
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']), namespace='msmarco', lang='en'),
        TrecQrels(dlc['trec-dl-2020/qrels'], TREC_DL_QRELS_DEFS),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
                  base_path / 'trec-dl-2020/ms.run')),
    )

    # Subsets constrained to just the queries/qrels/docpairs that have at
    # least one relevance assessment.
    train_judged = Lazy(lambda: {q.query_id for q in subsets['train'].qrels_iter()})
    subsets['train/judged'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_judged),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_judged),
        subsets['train'],
    )
    dev_judged = Lazy(lambda: {q.query_id for q in subsets['dev'].qrels_iter()})
    subsets['dev/judged'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
        subsets['dev'],
    )
    dl19_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(), dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(), dl19_judged),
        subsets['trec-dl-2019'],
    )
    dl20_judged = Lazy(lambda: {q.query_id for q in subsets['trec-dl-2020'].qrels_iter()})
    subsets['trec-dl-2020/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2020'].queries_handler(), dl20_judged),
        FilteredScoredDocs(subsets['trec-dl-2020'].scoreddocs_handler(), dl20_judged),
        subsets['trec-dl-2020'],
    )

    # split200 -- 200 queries held out from the training data for validation
    split200 = Lazy(lambda: SPLIT200_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), split200, mode='exclude'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), split200, mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(), split200, mode='exclude'),
        FilteredDocPairs(subsets['train'].docpairs_handler(), split200, mode='exclude'),
        subsets['train'],
    )
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), split200, mode='include'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), split200, mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(), split200, mode='include'),
        FilteredDocPairs(subsets['train'].docpairs_handler(), split200, mode='include'),
        subsets['train'],
    )

    # Medical subset
    def train_med():
        with dlc['medmarco_ids'].stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}
    train_med = Lazy(train_med)
    subsets['train/medical'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_med),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_med),
        FilteredDocPairs(subsets['train'].docpairs_handler(), train_med),
        FilteredQrels(subsets['train'].qrels_handler(), train_med),
        subsets['train'],
    )

    # DL-Hard
    dl_hard_qrels_migrator = Migrator(
        base_path / 'trec-dl-hard' / 'irds_version.txt', 'v3',
        affected_files=[base_path / 'trec-dl-hard' / 'qrels'],
        message='Updating trec-dl-hard qrels')
    hard_qids = Lazy(lambda: DL_HARD_QIDS)
    # DL-Hard queries are drawn from both TREC DL 2019 and 2020.
    dl_hard_base_queries = TsvQueries([
        Cache(GzipExtract(dlc['trec-dl-2019/queries']),
              base_path / 'trec-dl-2019/queries.tsv'),
        Cache(GzipExtract(dlc['trec-dl-2020/queries']),
              base_path / 'trec-dl-2020/queries.tsv')
    ], namespace='msmarco', lang='en')
    subsets['trec-dl-hard'] = Dataset(
        collection,
        FilteredQueries(dl_hard_base_queries, hard_qids),
        dl_hard_qrels_migrator(
            TrecQrels(dlc['trec-dl-hard/qrels'], TREC_DL_QRELS_DEFS)),
        documentation('trec-dl-hard'))
    # The five folds differ only in their query-id filter.
    for fold in ['1', '2', '3', '4', '5']:
        fold_qids = Lazy(lambda fold=fold: DL_HARD_QIDS_BYFOLD[fold])
        subsets[f'trec-dl-hard/fold{fold}'] = Dataset(
            collection,
            FilteredQueries(dl_hard_base_queries, fold_qids),
            FilteredQrels(subsets['trec-dl-hard'], fold_qids),
            documentation(f'trec-dl-hard/fold{fold}'))

    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))

    return collection, subsets
# What do the relevance levels in qrels mean?
QREL_DEFS = {
    1: 'relevant',
    0: 'not relevant',
}

# Where to find the content. Here it's just from the repository, but it
# could be anywhere.
DL_DOCS = ir_datasets.util.RequestsDownload(
    'https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/docs.tsv'
)
DL_QUERIES = ir_datasets.util.RequestsDownload(
    'https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/queries.tsv'
)
DL_QRELS = ir_datasets.util.RequestsDownload(
    'https://raw.githubusercontent.com/seanmacavaney/dummy-irds-ext/master/data/qrels'
)

# Where the downloaded content is cached locally.
base_path = ir_datasets.util.home_path() / NAME

# Dataset definition: it provides docs, queries, and qrels.
dataset = ir_datasets.Dataset(
    TsvDocs(ir_datasets.util.Cache(DL_DOCS, base_path / 'docs.tsv')),
    TsvQueries(ir_datasets.util.Cache(DL_QUERIES, base_path / 'queries.tsv')),
    TrecQrels(ir_datasets.util.Cache(DL_QRELS, base_path / 'qrels'), QREL_DEFS),
)

# Register the dataset with ir_datasets
ir_datasets.registry.register(NAME, dataset)
def _init():
    """Register NFCorpus: one shared collection with train/dev/test splits,
    each offering the full topic queries plus 'nontopic' and 'video'
    query subsets.

    Returns:
        (collection, subsets) for use by the module.
    """
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    main_dlc = dlc['main']

    collection = TsvDocs(
        Cache(TarExtract(main_dlc, 'nfcorpus/raw/doc_dump.txt'),
              base_path / 'collection.tsv'),
        doc_cls=NfCorpusDoc,
        namespace=NAME)
    subsets = {}

    def read_lines(file):
        # Read a newline-delimited ID file out of the main archive.
        file = Cache(TarExtract(main_dlc, f'nfcorpus/raw/{file}'), base_path / file)
        with file.stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}
    nontopic_qid_filter = Lazy(lambda: read_lines('nontopics.ids'))
    video_qid_filter = Lazy(lambda: read_lines('all_videos.ids'))

    # The train/dev/test splits are laid out identically in the archive,
    # so all three (and their nontopic/video variants) are built in one loop.
    for split in ['train', 'dev', 'test']:
        subsets[split] = Dataset(
            collection,
            ZipQueries([
                TsvQueries(Cache(
                    TarExtract(main_dlc, f'nfcorpus/{split}.titles.queries'),
                    base_path / f'{split}/queries.titles.tsv'), namespace=NAME),
                TsvQueries(Cache(
                    TarExtract(main_dlc, f'nfcorpus/{split}.all.queries'),
                    base_path / f'{split}/queries.all.tsv'), namespace=NAME),
            ], [(0, 0), (0, 1), (1, 1)], NfCorpusQuery),
            TrecQrels(
                Cache(TarExtract(main_dlc, f'nfcorpus/{split}.3-2-1.qrel'),
                      base_path / f'{split}/qrels'),
                QRELS_DEFS),
            documentation(split),
        )
        subsets[f'{split}/nontopic'] = Dataset(
            collection,
            TsvQueries(Cache(
                TarExtract(main_dlc, f'nfcorpus/{split}.nontopic-titles.queries'),
                base_path / f'{split}/nontopic/queries.tsv'), namespace=NAME),
            FilteredQrels(subsets[split].qrels_handler(), nontopic_qid_filter, mode='include'),
            documentation(f'{split}/nontopic'),
        )
        subsets[f'{split}/video'] = Dataset(
            collection,
            ZipQueries([
                TsvQueries(Cache(
                    TarExtract(main_dlc, f'nfcorpus/{split}.vid-titles.queries'),
                    base_path / f'{split}/video/queries.titles.tsv'), namespace=NAME),
                TsvQueries(Cache(
                    TarExtract(main_dlc, f'nfcorpus/{split}.vid-desc.queries'),
                    base_path / f'{split}/video/queries.desc.tsv'), namespace=NAME),
            ], [(0, 0), (0, 1), (1, 1)], NfCorpusVideoQuery),
            # NOTE(review): the video subset supplies BOTH a ZipQueries and a
            # TsvQueries handler, exactly as in the original; presumably the
            # later handler wins -- confirm this is intentional.
            TsvQueries(Cache(
                TarExtract(main_dlc, f'nfcorpus/{split}.nontopic-titles.queries'),
                base_path / f'{split}/video/queries.tsv'),
                NfCorpusVideoQuery, namespace=NAME),
            FilteredQrels(subsets[split].qrels_handler(), video_qid_filter, mode='include'),
            documentation(f'{split}/video'),
        )

    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return collection, subsets
def _init():
    """Build and register the TripClick health-search dataset.

    Registers the document collection, the raw click logs, and
    head/torso/tail query-frequency strata for the train/val/test splits
    (plus DCTR click-model qrels for the 'head' stratum).

    Fixes a copy-paste bug from the previous version: 'test/head' was
    registered with documentation('val/head').

    Returns:
        (base, subsets) for use by the module.
    """
    subsets = {}
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = TrecDocs(
        dlc['benchmark'],
        parser='tut',
        path_globs=['**/docs_grp_*.txt'],
        namespace=NAME,
        lang='en',
        count_hint=ir_datasets.util.count_hint(NAME))
    topics_and_qrels = TarExtractAll(
        dlc['benchmark'],
        base_path / "topics_and_qrels",
        path_globs=['**/topics.*.txt', '**/qrels.*.txt'])
    val_runs = TarExtractAll(
        dlc['dlfiles'],
        base_path / "val_runs",
        path_globs=['**/run.trip.BM25.*.val.txt'])
    test_runs = TarExtractAll(
        dlc['dlfiles_runs_test'],
        base_path / "test_runs",
        path_globs=['**/run.trip.BM25.*.test.txt'])
    base = Dataset(collection, documentation('_'))

    subsets['logs'] = Dataset(
        TsvDocs(
            Cache(
                FixAllarticles(TarExtract(dlc['logs'], 'logs/allarticles.txt')),
                base_path / 'allarticles-fixed.tsv'),
            doc_cls=TripClickPartialDoc,
            lang='en',
            count_hint=ir_datasets.util.count_hint(f'{NAME}/logs')),
        TripClickQlogs(
            TarExtractAll(dlc['logs'], base_path / 'logs', path_globs=['**/*.json'])),
        documentation('logs'))

    def queries_for(stratum, split):
        # Topic files follow one naming scheme across strata and splits.
        return TrecQueries(
            RelativePath(topics_and_qrels, f'benchmark/topics/topics.{stratum}.{split}.txt'),
            qtype=GenericQuery, qtype_map=QTYPE_MAP, namespace=NAME, lang='en')

    def raw_qrels_for(stratum, split):
        # Raw (click-derived) qrels; DCTR qrels are handled separately below.
        return TrecQrels(
            RelativePath(topics_and_qrels, f'benchmark/qrels/qrels.raw.{stratum}.{split}.txt'),
            QREL_DEFS)

    ### Train
    for stratum in ['head', 'torso', 'tail']:
        subsets[f'train/{stratum}'] = Dataset(
            collection,
            queries_for(stratum, 'train'),
            raw_qrels_for(stratum, 'train'),
            documentation(f'train/{stratum}'))
    # The 'head' stratum additionally has DCTR click-model qrels.
    subsets['train/head/dctr'] = Dataset(
        TrecQrels(
            RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.dctr.head.train.txt'),
            QREL_DCTR_DEFS),
        subsets['train/head'],
        documentation('train/head/dctr'))
    train_queries = ConcatQueries([
        subsets['train/head'].queries_handler(),
        subsets['train/torso'].queries_handler(),
        subsets['train/tail'].queries_handler(),
    ])
    train_docpairs = DocPairGenerator(
        TarExtract(dlc['dlfiles'], 'dlfiles/triples.train.tsv'),
        collection, train_queries, base_path / 'train.docpairs')
    subsets['train'] = Dataset(
        collection,
        train_queries,
        ConcatQrels([
            subsets['train/head'].qrels_handler(),
            subsets['train/torso'].qrels_handler(),
            subsets['train/tail'].qrels_handler(),
        ]),
        TsvDocPairs(train_docpairs),
        documentation('train'))
    subsets['train/hofstaetter-triples'] = Dataset(
        collection,
        train_queries,
        subsets['train'].qrels_handler(),
        TsvDocPairs(dlc['hofstaetter-triples']),
        documentation('train/hofstaetter-triples'))

    ### Val
    for stratum in ['head', 'torso', 'tail']:
        subsets[f'val/{stratum}'] = Dataset(
            collection,
            queries_for(stratum, 'val'),
            raw_qrels_for(stratum, 'val'),
            TrecScoredDocs(
                RelativePath(val_runs, f'dlfiles/run.trip.BM25.{stratum}.val.txt')),
            documentation(f'val/{stratum}'))
    subsets['val/head/dctr'] = Dataset(
        TrecQrels(
            RelativePath(topics_and_qrels, 'benchmark/qrels/qrels.dctr.head.val.txt'),
            QREL_DCTR_DEFS),
        subsets['val/head'],
        documentation('val/head/dctr'))
    subsets['val'] = Dataset(
        collection,
        ConcatQueries([
            subsets['val/head'].queries_handler(),
            subsets['val/torso'].queries_handler(),
            subsets['val/tail'].queries_handler(),
        ]),
        ConcatQrels([
            subsets['val/head'].qrels_handler(),
            subsets['val/torso'].qrels_handler(),
            subsets['val/tail'].qrels_handler(),
        ]),
        ConcatScoreddocs([
            subsets['val/head'].scoreddocs_handler(),
            subsets['val/torso'].scoreddocs_handler(),
            subsets['val/tail'].scoreddocs_handler(),
        ]),
        documentation('val'))

    ### Test (no qrels for the test split; only queries and BM25 runs)
    for stratum in ['head', 'torso', 'tail']:
        # Bug fix: 'test/head' previously used documentation('val/head').
        subsets[f'test/{stratum}'] = Dataset(
            collection,
            queries_for(stratum, 'test'),
            TrecScoredDocs(
                RelativePath(test_runs, f'runs_test/run.trip.BM25.{stratum}.test.txt')),
            documentation(f'test/{stratum}'))
    subsets['test'] = Dataset(
        collection,
        ConcatQueries([
            subsets['test/head'].queries_handler(),
            subsets['test/torso'].queries_handler(),
            subsets['test/tail'].queries_handler(),
        ]),
        ConcatScoreddocs([
            subsets['test/head'].scoreddocs_handler(),
            subsets['test/torso'].scoreddocs_handler(),
            subsets['test/tail'].scoreddocs_handler(),
        ]),
        documentation('test'))

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
def _init():
    """Register mMARCO: machine-translated MS MARCO passage collections and
    queries across many languages (v1 and v2 releases).

    Qrels and training triples are shared with the English
    'msmarco-passage' dataset already in the registry.

    Fixes the misspelled local name 'train_docparis' -> 'train_docpairs'.

    Returns:
        (collection, subsets) for use by the module.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    dlc = DownloadConfig.context(NAME, base_path)
    subsets = {}

    train_qrels = ir_datasets.registry['msmarco-passage/train'].qrels_handler()
    train_docpairs = TsvDocPairs(dlc['train/triples'])
    dev_qrels = TrecQrels(dlc['dev/qrels'], QRELS_DEFS)
    dev_small_qrels = TrecQrels(dlc['dev/qrels-small'], QRELS_DEFS)
    small_dev_qids = Lazy(
        lambda: {q.query_id for q in dev_small_qrels.qrels_iter()})

    # --- v1 release ---
    for lang in ['es', 'fr', 'pt', 'it', 'id', 'de', 'ru', 'zh']:
        collection = TsvDocs(
            dlc[f'{lang}/docs'],
            namespace=f'mmarco/{lang}',
            lang=lang,
            count_hint=ir_datasets.util.count_hint(f'{NAME}/{lang}'))
        subsets[f'{lang}'] = Dataset(collection, documentation(f'{lang}'))
        subsets[f'{lang}/train'] = Dataset(
            collection,
            TsvQueries(dlc[f'{lang}/queries/train'], namespace=f'mmarco/{lang}', lang=lang),
            train_qrels,
            train_docpairs,
            documentation(f'{lang}/train'))
        subsets[f'{lang}/dev'] = Dataset(
            collection,
            TsvQueries(dlc[f'{lang}/queries/dev'], namespace=f'mmarco/{lang}', lang=lang),
            dev_qrels,
            documentation(f'{lang}/dev'))
        subsets[f'{lang}/dev/small'] = Dataset(
            collection,
            FilteredQueries(subsets[f'{lang}/dev'].queries_handler(), small_dev_qids, mode='include'),
            dev_small_qrels,
            # zh/pt have no v1 dev scoreddocs (a v1.1 variant is provided below)
            TrecScoredDocs(dlc[f'{lang}/scoreddocs/dev']) if lang not in ('zh', 'pt') else None,
            documentation(f'{lang}/dev/small'))
        if lang in ('zh', 'pt'):
            subsets[f'{lang}/dev/v1.1'] = Dataset(
                collection,
                TsvQueries(dlc[f'{lang}/queries/dev/v1.1'], namespace=f'mmarco/{lang}', lang=lang),
                dev_qrels,
                documentation(f'{lang}/dev/v1.1'))
            subsets[f'{lang}/dev/small/v1.1'] = Dataset(
                collection,
                FilteredQueries(subsets[f'{lang}/dev/v1.1'].queries_handler(), small_dev_qids, mode='include'),
                dev_small_qrels,
                TrecScoredDocs(dlc[f'{lang}/scoreddocs/dev/v1.1']),
                # NOTE(review): kept documentation key '{lang}/dev/v1.1' as in
                # the original -- confirm it shouldn't be '{lang}/dev/small/v1.1'.
                documentation(f'{lang}/dev/v1.1'))
        if lang in ('pt', ):
            subsets[f'{lang}/train/v1.1'] = Dataset(
                collection,
                TsvQueries(dlc[f'{lang}/queries/train/v1.1'], namespace=f'mmarco/{lang}', lang=lang),
                train_qrels,
                train_docpairs,
                documentation(f'{lang}/train/v1.1'))

    # --- v2 release ---
    # NOTE(review): 'dt' is the code used by the v2 download keys
    # (presumably Dutch) -- confirm against the upstream release.
    for lang in [
            'ar', 'zh', 'dt', 'fr', 'de', 'hi', 'id', 'it', 'ja', 'pt', 'ru', 'es', 'vi'
    ]:
        collection = TsvDocs(
            dlc[f'v2/{lang}/docs'],
            namespace=f'mmarco/{lang}',
            lang=lang,
            count_hint=ir_datasets.util.count_hint(f'{NAME}/v2/{lang}'))
        subsets[f'v2/{lang}'] = Dataset(collection, documentation(f'v2/{lang}'))
        subsets[f'v2/{lang}/train'] = Dataset(
            collection,
            TsvQueries(dlc[f'v2/{lang}/queries/train'], namespace=f'mmarco/v2/{lang}', lang=lang),
            train_qrels,
            train_docpairs,
            documentation(f'v2/{lang}/train'))
        subsets[f'v2/{lang}/dev'] = Dataset(
            collection,
            # NOTE(review): namespace 'v2/mmarco/{lang}' here vs
            # 'mmarco/v2/{lang}' for train, as in the original -- confirm.
            TsvQueries(dlc[f'v2/{lang}/queries/dev'], namespace=f'v2/mmarco/{lang}', lang=lang),
            dev_qrels,
            documentation(f'v2/{lang}/dev'))
        subsets[f'v2/{lang}/dev/small'] = Dataset(
            collection,
            FilteredQueries(subsets[f'v2/{lang}/dev'].queries_handler(), small_dev_qids, mode='include'),
            dev_small_qrels,
            TrecScoredDocs(dlc[f'v2/{lang}/scoreddocs/dev'], negate_score=True),
            documentation(f'v2/{lang}/dev/small'))

    ir_datasets.registry.register(NAME, Dataset(documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return collection, subsets
def _init():
    """Build and register the MS MARCO passage-ranking datasets.

    Wires up the document collection plus train/dev/eval/TREC-DL subsets and
    several derived views (judged-only, split200 train/validation, medical),
    then registers everything under the 'msmarco-passage' prefix.

    Returns:
        (collection, subsets): the shared ``TsvDocs`` collection and the
        mapping of subset name -> ``Dataset``.
    """
    documentation = YamlDocumentation('docs/msmarco-passage.yaml')
    base_path = ir_datasets.util.home_path() / 'msmarco-passage'
    dlc = DownloadConfig.context('msmarco-passage', base_path, dua=DUA)
    # Single shared passage collection; FixEncoding repairs the archive's
    # text encoding and Cache stores the extracted TSV locally.
    collection = TsvDocs(Cache(
        FixEncoding(TarExtract(dlc['collectionandqueries'], 'collection.tsv')),
        base_path / 'collection.tsv'), namespace='msmarco')
    subsets = {}

    # Train: queries + qrels + training triples (doc pairs) + top-1000 run.
    subsets['train'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.train.tsv'),
                         base_path / 'train/queries.tsv'),
                   namespace='msmarco'),
        TrecQrels(dlc['train/qrels'], QRELS_DEFS),
        TsvDocPairs(GzipExtract(dlc['train/docpairs'])),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['train/scoreddocs'], 'top1000.train.txt')),
                base_path / 'train/ms.run')),
    )
    subsets['dev'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.dev.tsv'),
                         base_path / 'dev/queries.tsv'),
                   namespace='msmarco'),
        TrecQrels(dlc['dev/qrels'], QRELS_DEFS),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(TarExtract(dlc['dev/scoreddocs'],
                                         'top1000.dev')),
                base_path / 'dev/ms.run')),
    )
    # dev/small: queries and qrels ship inside the collectionandqueries tar.
    subsets['dev/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.dev.small.tsv'),
            base_path / 'dev/small/queries.tsv'),
                   namespace='msmarco'),
        TrecQrels(
            Cache(
                TarExtract(dlc['collectionandqueries'],
                           'qrels.dev.small.tsv'),
                base_path / 'dev/small/qrels'), QRELS_DEFS),
    )
    # Eval: held-out queries; no public qrels.
    subsets['eval'] = Dataset(
        collection,
        TsvQueries(Cache(TarExtract(dlc['queries'], 'queries.eval.tsv'),
                         base_path / 'eval/queries.tsv'),
                   namespace='msmarco'),
        TrecScoredDocs(
            Cache(
                ExtractQidPid(
                    TarExtract(dlc['eval/scoreddocs'], 'top1000.eval')),
                base_path / 'eval/ms.run')),
    )
    subsets['eval/small'] = Dataset(
        collection,
        TsvQueries(Cache(
            TarExtract(dlc['collectionandqueries'], 'queries.eval.small.tsv'),
            base_path / 'eval/small/queries.tsv'),
                   namespace='msmarco'),
    )
    # TREC Deep Learning tracks use graded (multi-level) judgments.
    subsets['trec-dl-2019'] = Dataset(
        collection,
        TrecQrels(dlc['trec-dl-2019/qrels'], TREC_DL_QRELS_DEFS),
        TsvQueries(Cache(GzipExtract(dlc['trec-dl-2019/queries']),
                         base_path / 'trec-dl-2019/queries.tsv'),
                   namespace='msmarco'),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2019/scoreddocs'])),
                  base_path / 'trec-dl-2019/ms.run')),
    )
    # 2020 has no qrels wired up here (only queries + scoreddocs).
    subsets['trec-dl-2020'] = Dataset(
        collection,
        TsvQueries(GzipExtract(dlc['trec-dl-2020/queries']),
                   namespace='msmarco'),
        TrecScoredDocs(
            Cache(ExtractQidPid(GzipExtract(dlc['trec-dl-2020/scoreddocs'])),
                  base_path / 'trec-dl-2020/ms.run')),
    )

    # A few subsets that are constrained to just the queries/qrels/docpairs
    # that have at least 1 relevance assessment. Query-id sets are Lazy so
    # the qrels are only iterated if the judged subset is actually used.
    train_judged = Lazy(
        lambda: {q.query_id for q in subsets['train'].qrels_iter()})
    subsets['train/judged'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_judged),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(),
                           train_judged),
        subsets['train'],
    )
    dev_judged = Lazy(
        lambda: {q.query_id for q in subsets['dev'].qrels_iter()})
    subsets['dev/judged'] = Dataset(
        FilteredQueries(subsets['dev'].queries_handler(), dev_judged),
        FilteredScoredDocs(subsets['dev'].scoreddocs_handler(), dev_judged),
        subsets['dev'],
    )
    dl19_judged = Lazy(
        lambda: {q.query_id for q in subsets['trec-dl-2019'].qrels_iter()})
    subsets['trec-dl-2019/judged'] = Dataset(
        FilteredQueries(subsets['trec-dl-2019'].queries_handler(),
                        dl19_judged),
        FilteredScoredDocs(subsets['trec-dl-2019'].scoreddocs_handler(),
                           dl19_judged),
        subsets['trec-dl-2019'],
    )

    # split200 -- 200 queries held out from the training data for validation.
    # mode='exclude' removes the held-out ids (train side); mode='include'
    # keeps only them (validation side).
    split200 = Lazy(lambda: SPLIT200_QIDS)
    subsets['train/split200-train'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), split200,
                        mode='exclude'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), split200,
                           mode='exclude'),
        FilteredQrels(subsets['train'].qrels_handler(), split200,
                      mode='exclude'),
        FilteredDocPairs(subsets['train'].docpairs_handler(), split200,
                         mode='exclude'),
        subsets['train'],
    )
    subsets['train/split200-valid'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), split200,
                        mode='include'),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), split200,
                           mode='include'),
        FilteredQrels(subsets['train'].qrels_handler(), split200,
                      mode='include'),
        FilteredDocPairs(subsets['train'].docpairs_handler(), split200,
                         mode='include'),
        subsets['train'],
    )

    # Medical subset: training data restricted to the id list in the
    # 'medmarco_ids' download (one id per line).
    def train_med():
        with dlc['medmarco_ids'].stream() as stream:
            stream = codecs.getreader('utf8')(stream)
            return {l.rstrip() for l in stream}
    train_med = Lazy(train_med)
    subsets['train/medical'] = Dataset(
        FilteredQueries(subsets['train'].queries_handler(), train_med),
        FilteredScoredDocs(subsets['train'].scoreddocs_handler(), train_med),
        FilteredDocPairs(subsets['train'].docpairs_handler(), train_med),
        FilteredQrels(subsets['train'].qrels_handler(), train_med),
        subsets['train'],
    )

    # Register the base dataset and every subset with its documentation.
    ir_datasets.registry.register('msmarco-passage',
                                  Dataset(collection, documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'msmarco-passage/{s}',
                                      Dataset(subsets[s], documentation(s)))
    return collection, subsets