def init(self, force=False):
    base_dir = os.path.join(util.path_dataset(self), self.subset)
    if self.subset == 'dummy':
        # The dummy subset ships with the repo; just link the sample files into place.
        datafile = os.path.join(base_dir, 'datafile.tsv')
        qrels = os.path.join(base_dir, 'qrels.txt')
        if not os.path.exists(datafile):
            os.symlink(os.path.abspath('etc/dummy_datafile.tsv'), datafile)
        if not os.path.exists(qrels):
            os.symlink(os.path.abspath('etc/dummy_qrels.txt'), qrels)

    # Collect the build steps that still need a pass over the datafiles.
    needs_datafile = []
    if force or not self.index.built():
        needs_datafile.append(lambda it: self.index.build(
            indices.RawDoc(did, txt) for t, did, txt in it if t == 'doc'))
    if force or not self.index_stem.built():
        needs_datafile.append(lambda it: self.index_stem.build(
            indices.RawDoc(did, txt) for t, did, txt in it if t == 'doc'))
    if force or not self.doc_store.built():
        needs_datafile.append(lambda it: self.doc_store.build(
            indices.RawDoc(did, txt) for t, did, txt in it if t == 'doc'))
    query_file = os.path.join(base_dir, 'queries.tsv')
    if force or not os.path.exists(query_file):
        needs_datafile.append(lambda it: plaintext.write_tsv(query_file, (
            (qid, txt) for t, qid, txt in it if t == 'query')))

    if needs_datafile:
        df_glob = os.path.join(base_dir, 'datafile*.tsv')
        datafiles = glob(df_glob)
        while not datafiles:
            c = util.confirm(
                f'No data files found. Please move/link data files to {df_glob}.\n'
                'Data files should contain both queries and documents in the '
                'following format (one per line):\n'
                '[query|doc] [TAB] [qid/did] [TAB] [text]')
            if not c:
                sys.exit(1)
            datafiles = glob(df_glob)
        # Fan a single pass over the datafiles out to every pending build step.
        main_iter = itertools.chain(*(plaintext.read_tsv(df) for df in datafiles))
        main_iter = tqdm(main_iter, desc='reading datafiles')
        iters = util.blocking_tee(main_iter, len(needs_datafile))
        with contextlib.ExitStack() as stack:
            for fn, it in zip(needs_datafile, iters):
                stack.enter_context(util.CtxtThread(functools.partial(fn, it)))

    qrels_file = os.path.join(base_dir, 'qrels.txt')
    while not os.path.exists(qrels_file):
        c = util.confirm(
            f'No qrels file found. Please move/link qrels file to {qrels_file}.\n'
            'Qrels file should be in the TREC format:\n'
            '[qid] [SPACE] Q0 [SPACE] [did] [SPACE] [score]')
        if not c:
            sys.exit(1)
def _init_iter_collection(self, zipf, collection):
    with zipf.open(f'wikIR{collection}/documents.csv') as f:
        f = io.TextIOWrapper(f)
        f.readline()  # skip the header line
        for did, text in self.logger.pbar(plaintext.read_sv(f, ','), desc='documents'):
            yield indices.RawDoc(did, text)
def _init_iter_collection(self):
    docs_cls = self.docs_ds.docs_cls()
    fields = self.config['docs_index_fields'].split(',')
    assert all(f in docs_cls._fields for f in fields)
    field_idxs = [docs_cls._fields.index(f) for f in fields]
    for doc in self.docs_ds.docs_iter():
        yield indices.RawDoc(doc.doc_id, '\n'.join(str(doc[i]) for i in field_idxs))
def _init_doctttttquery_iter(self):
    with util.download_tmp(_SOURCES['doctttttquery-predictions'], expected_md5=_HASHES['doctttttquery-predictions']) as f1, \
         util.download_tmp(_SOURCES['collection'], expected_md5=_HASHES['collection']) as f2:
        with zipfile.ZipFile(f1) as zipf, tarfile.open(fileobj=f2) as tarf:
            collection_stream = io.TextIOWrapper(tarf.extractfile('collection.tsv'))
            d5_iter = self._init_doctttttquery_zipf_iter(zipf)
            for (did, text), d5text in self.logger.pbar(zip(plaintext.read_tsv(collection_stream), d5_iter), desc='documents'):
                yield indices.RawDoc(did, f'{text} {d5text}')
def _init_iter_collection(self):
    with util.download_tmp(_SOURCES['collection']) as f:
        with tarfile.open(fileobj=f) as tarf:
            collection_stream = io.TextIOWrapper(tarf.extractfile('collection.tsv'))
            for did, text in self.logger.pbar(plaintext.read_tsv(collection_stream), desc='documents'):
                yield indices.RawDoc(did, text)
def _init_iter_collection(self):
    # Using the trick from capreolus: pull document content out of a public index.
    # <https://github.com/capreolus-ir/capreolus/blob/d6ae210b24c32ff817f615370a9af37b06d2da89/capreolus/collection/robust04.yaml#L15>
    index = indices.AnseriniIndex('../Tweets2013')
    for did in self.logger.pbar(index.docids(), desc='documents'):
        raw_doc = index.get_raw(did)
        # Pull the tweet text straight out of the raw JSON string with a regex.
        pattern = '"text":"(.*?)","source":'
        raw_txt = re.search(pattern, raw_doc).group(1)
        yield indices.RawDoc(did, raw_txt)
def _init_iter_collection(self):
    # Using the trick from capreolus: pull document content out of a public index.
    # <https://github.com/capreolus-ir/capreolus/blob/d6ae210b24c32ff817f615370a9af37b06d2da89/capreolus/collection/robust04.yaml#L15>
    with util.download_tmp(**_FILES['index']) as f:
        fd = f'{f.name}.d'
        util.extract_tarball(f.name, fd, self.logger, reset_permissions=True)
        index = indices.AnseriniIndex(f'{fd}/index-robust04-20191213')
        for did in self.logger.pbar(index.docids(), desc='documents'):
            raw_doc = index.get_raw(did)
            yield indices.RawDoc(did, raw_doc)
def test_build(self):
    df = plaintext.read_tsv('etc/dummy_datafile.tsv')
    docs = [indices.RawDoc(did, dtext) for t, did, dtext in df if t == 'doc']
    with tempfile.TemporaryDirectory() as tmpdir:
        idxs = [
            (indices.AnseriniIndex(os.path.join(tmpdir, 'anserini')), False),
            (indices.AnseriniIndex(os.path.join(tmpdir, 'anserini.rawdocs'), store_raw_docs=True), True),
            (indices.SqliteDocstore(os.path.join(tmpdir, 'sqlite')), True),
        ]
        for index, check_raw_docs in idxs:
            with self.subTest(index=index):
                self.assertFalse(index.built())
                index.build(iter(docs))
                self.assertTrue(index.built())
                self.assertEqual(index.num_docs(), len(docs))
                if check_raw_docs:
                    for doc in docs:
                        self.assertEqual(index.get_raw(doc.did), doc.data['text'])
def _parse_doc_file(args):
    path, encoding = args
    docs = []
    if path.endswith('.gz'):
        open_fn = gzip.open
    else:
        open_fn = open
    with open_fn(path, 'rt', encoding=encoding, errors='replace') as file:
        docid = None
        doc_text = ''
        tag_no = None
        while True:
            line = next(file, StopIteration)
            if line is StopIteration:
                break
            if line.startswith('<DOC ') or line.startswith('<DOC>'):
                # Document start; the ID may be given as an attribute.
                match = re.match(r".*id=\"([^\"]+)\".*", line)
                if match:
                    docid = match.group(1)
            elif line.startswith('<DOCNO>'):
                # Document ID; the tag may span multiple lines.
                while '</DOCNO>' not in line:
                    l = next(file, StopIteration)
                    if l is StopIteration:
                        break
                    line += l
                docid = line.replace('<DOCNO>', '').replace('</DOCNO>', '').strip()
            elif line.startswith('</DOC>'):
                # Document end; emit the accumulated text.
                assert docid is not None
                docs.append(indices.RawDoc(docid, _strip_html(doc_text)))
                docid = None
                doc_text = ''
                tag_no = None
            elif tag_no is not None:
                # Inside a text tag; accumulate until its end tag.
                doc_text += line
                if line.startswith(DOC_TEXT_END_TAGS[tag_no]):
                    tag_no = None
            else:
                # Check whether this line opens one of the text tags.
                for i, tag in enumerate(DOC_TEXT_TAGS):
                    if line.startswith(tag):
                        tag_no = i
                        doc_text += line
                        break
    return docs
def test_batch_query(self):
    df = list(plaintext.read_tsv('etc/dummy_datafile.tsv'))
    docs = [indices.RawDoc(did, dtext) for t, did, dtext in df if t == 'doc']
    queries = [(qid, qtext) for t, qid, qtext in df if t == 'query']
    with tempfile.TemporaryDirectory() as tmpdir:
        idxs = [
            indices.AnseriniIndex(os.path.join(tmpdir, 'anserini')),
        ]
        models = [
            'bm25',
            'bm25_k1-1.5',
            'bm25_b-0.2',
            'bm25_k1-1.6_b-0.8',
            'bm25_rm3',
            'bm25_rm3_k1-1.5',
            'bm25_rm3_b-0.2',
            'bm25_rm3_k1-1.6_b-0.8',
            'bm25_rm3_rm3.fbTerms-2_rm3.fbDocs-2',
            'bm25_rm3_rm3.fbTerms-2_rm3.fbDocs-2_k1-1.5',
            'bm25_rm3_rm3.fbTerms-2_rm3.fbDocs-2_b-0.2',
            'bm25_rm3_rm3.fbTerms-2_rm3.fbDocs-2_k1-1.6_b-0.8',
            'ql',
            'ql_mu-0.4',
            'sdm',
            'sdm_uw-0.3_ow-0.2_tw-0.5',
        ]
        for index in idxs:
            index.build(docs)
            for model in models:
                with self.subTest(index=index, model=model):
                    index.batch_query(queries, model, topk=10)
                    index.batch_query(queries, model, topk=10, quiet=True)
def _init_iter_collection(self):
    files = {
        '2020-04-10': {
            'comm_use_subset': ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/comm_use_subset.tar.gz', "253cecb4fee2582a611fb77a4d537dc5"),
            'noncomm_use_subset': ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/noncomm_use_subset.tar.gz', "734b462133b3c00da578a909f945f4ae"),
            'custom_license': ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/custom_license.tar.gz', "2f1c9864348025987523b86d6236c40b"),
            'biorxiv_medrxiv': ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/biorxiv_medrxiv.tar.gz', "c12acdec8b3ad31918d752ba3db36121"),
        },
        '2020-05-01': {
            'comm_use_subset': ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-05-01/comm_use_subset.tar.gz', "af4202340182209881d3d8cba2d58a24"),
            'noncomm_use_subset': ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-05-01/noncomm_use_subset.tar.gz', "9cc25b9e8674197446e7cbd4381f643b"),
            'custom_license': ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-05-01/custom_license.tar.gz', "1cb6936a7300a31344cd8a5ecc9ca778"),
            'biorxiv_medrxiv': ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-05-01/biorxiv_medrxiv.tar.gz', "9d6c6dc5d64b01e528086f6652b3ccb7"),
            'arxiv': ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-05-01/arxiv.tar.gz', "f10890174d6f864f306800d4b02233bc"),
        }
    }
    metadata = {
        '2020-04-10': ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-04-10/metadata.csv', "42a21f386be86c24647a41bedde34046"),
        '2020-05-01': ('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-05-01/metadata.csv', "b1d2e409026494e0c8034278bacd1248"),
    }
    meta_url, meta_md5 = metadata[self.config['date']]
    fulltexts = {}
    with contextlib.ExitStack() as stack:
        # Download each full-text tarball and keep it open for the whole pass.
        for fid, (file, md5) in files[self.config['date']].items():
            fulltexts[fid] = stack.enter_context(util.download_tmp(file, tarf=True, expected_md5=md5))
        meta = pd.read_csv(util.download_stream(meta_url, expected_md5=meta_md5))
        for _, row in meta.iterrows():
            did = str(row['cord_uid'])
            title = str(row['title'])
            doi = str(row['doi'])
            abstract = str(row['abstract'])
            date = str(row['publish_time'])
            body = ''
            heads = ''
            # Prefer the PMC XML parse; fall back to the PDF parse when it is missing.
            if row['has_pmc_xml_parse']:
                path = os.path.join(row['full_text_file'], 'pmc_json', row['pmcid'] + '.xml.json')
                data = json.load(fulltexts[row['full_text_file']].extractfile(path))
                if 'body_text' in data:
                    body = '\n'.join(b['text'] for b in data['body_text'])
                    heads = '\n'.join(set(b['section'] for b in data['body_text']))
            elif row['has_pdf_parse']:
                path = os.path.join(row['full_text_file'], 'pdf_json', row['sha'].split(';')[0].strip() + '.json')
                data = json.load(fulltexts[row['full_text_file']].extractfile(path))
                if 'body_text' in data:
                    body = '\n'.join(b['text'] for b in data['body_text'])
                    heads = '\n'.join(set(b['section'] for b in data['body_text']))
            contents = f'{title}\n\n{abstract}\n\n{body}\n\n{heads}'
            doc = indices.RawDoc(did, text=contents, title=title, abstract=abstract,
                                 title_abs=f'{title}\n\n{abstract}', body=body, doi=doi, date=date)
            yield doc
def _init_doc_iter(self):
    with util.download_tmp(_SOURCES['corpus'], tarf=True) as f:
        cbor_file = f.extract('paragraphcorpus/paragraphcorpus.cbor')
        for did, text in self.logger.pbar(car.iter_paras(cbor_file), desc='documents'):
            yield indices.RawDoc(did, text)
def _iter_collection(path):
    logger = log.easy()
    with path.open("rt") as collection_stream:
        for did, text in logger.pbar(plaintext.read_tsv(collection_stream), desc='documents'):
            yield indices.RawDoc(did, text)
def _init_iter_collection(self):
    strm = util.download_stream(
        'https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt', 'utf8')
    for did, text in plaintext.read_tsv(strm):
        yield indices.RawDoc(did, text)