def __init__(self, config, logger, vocab): super().__init__(config, logger, vocab) base_path = util.path_dataset(self) global_base_path = "/".join(base_path.split("/")[:-1]) #setup msmarco _base_path = global_base_path + "/msmarco" self.ms_index_stem = indices.AnseriniIndex(os.path.join( _base_path, 'anserini.porter'), stemmer='porter') self.ms_index_doctttttquery_stem = indices.AnseriniIndex( os.path.join(_base_path, 'anserini.doctttttquery.porter'), stemmer='porter') self.ms_doc_store = indices.SqliteDocstore( os.path.join(_base_path, 'docs.sqllite')) #setup cord _base_path = global_base_path + "/covid/2020-07-16" self.cord_index_stem = indices.MultifieldAnseriniIndex( os.path.join(_base_path, 'anserini_multifield'), stemmer='porter', primary_field=config['bs_field']) self.cord_index_stem_2020 = indices.MultifieldAnseriniIndex( os.path.join(_base_path, 'anserini_multifield_2020'), stemmer='porter', primary_field=config['bs_field']) self.cord_doc_store = indices.MultifieldSqliteDocstore( os.path.join(_base_path, 'docs_multifield.sqlite'), primary_field=config['rr_field']) self.msds = msmarco.MsmarcoDataset( self.msmarco_config(self.config['subset'], config), logger, vocab) self.cordds = covid.CovidDataset( self.cord_config(self.config['subset'], config), logger, vocab)
def __init__(self, config, logger, vocab): super().__init__(config, logger, vocab) base_path = util.path_dataset(self) global_base_path = "/".join(base_path.split("/")[:-1]) #setup msmarco _base_path = global_base_path + "/msmarco" self.ms_index_stem = indices.AnseriniIndex(os.path.join( _base_path, 'anserini.porter'), stemmer='porter') self.ms_index_doctttttquery_stem = indices.AnseriniIndex( os.path.join(_base_path, 'anserini.doctttttquery.porter'), stemmer='porter') self.ms_doc_store = indices.SqliteDocstore( os.path.join(_base_path, 'docs.sqllite')) #setup microblog _base_path = global_base_path + "/microblog" self.mb_index_stem = indices.AnseriniIndex(os.path.join( _base_path, 'anserini.porter'), stemmer='porter') self.mb_index = indices.AnseriniIndex(os.path.join( _base_path, 'anserini'), stemmer='none') self.mb_doc_store = indices.SqliteDocstore( os.path.join(_base_path, 'docs.sqllite')) self.msds = msmarco.MsmarcoDataset( self.msmarco_config(self.config['subset'], config), logger, vocab) self.mbds = microblog.MicroblogDataset( self.microblog_config(self.config['subset'], config), logger, vocab)
def __init__(self):
    super().__init__()
    # Re-wrap the indices created by the parent class with explicit names.
    self.index = indices.AnseriniIndex(self.index.path, stemmer="none", name="fullindex")
    self.index_stem = indices.AnseriniIndex(self.index_stem.path, name="stemindex")
    self.doc_store = indices.SqliteDocstore(self.doc_store.path)
def __init__(self, config, logger, vocab):
    super().__init__(config, logger, vocab)
    base_path = util.path_dataset(self)
    self.index = indices.AnseriniIndex(
        os.path.join(base_path, 'anserini'), stemmer='none')
    self.index_stem = indices.AnseriniIndex(
        os.path.join(base_path, 'anserini.porter'), stemmer='porter')
    self.doc_store = indices.SqliteDocstore(
        os.path.join(base_path, 'docs.sqllite'))
def __init__(self, config, logger, vocab):
    super().__init__(config, logger, vocab)
    self.index_spanish = indices.AnseriniIndex(
        os.path.join(util.path_dataset(self), 'anserini.es'), lang=self._lang())
    self.doc_store = indices.SqliteDocstore(
        os.path.join(util.path_dataset(self), 'docs.sqlite'))
def test_build(self):
    df = plaintext.read_tsv('etc/dummy_datafile.tsv')
    docs = [indices.RawDoc(did, dtext) for t, did, dtext in df if t == 'doc']
    with tempfile.TemporaryDirectory() as tmpdir:
        idxs = [
            (indices.AnseriniIndex(os.path.join(tmpdir, 'anserini')), False),
            (indices.AnseriniIndex(os.path.join(tmpdir, 'anserini.rawdocs'), store_raw_docs=True), True),
            (indices.SqliteDocstore(os.path.join(tmpdir, 'sqlite')), True),
        ]
        for index, check_raw_docs in idxs:
            with self.subTest(index=index):
                self.assertFalse(index.built())
                index.build(iter(docs))
                self.assertTrue(index.built())
                self.assertEqual(index.num_docs(), len(docs))
                if check_raw_docs:
                    for doc in docs:
                        self.assertEqual(index.get_raw(doc.did), doc.data['text'])
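
# For reference, the index lifecycle exercised by this test distills to the
# following minimal sketch. It is illustrative only: it assumes the same
# `indices` module imported by this test module, and the doc IDs/texts are
# made up.
def _example_index_lifecycle(tmpdir):
    docs = [indices.RawDoc('d1', 'hello world'), indices.RawDoc('d2', 'anserini demo')]
    # store_raw_docs=True keeps the original text so get_raw() can return it later
    idx = indices.AnseriniIndex(os.path.join(tmpdir, 'anserini'), store_raw_docs=True)
    if not idx.built():          # build only once; built() guards re-indexing
        idx.build(iter(docs))
    assert idx.num_docs() == len(docs)
    return idx.get_raw('d1')     # -> 'hello world'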
def _init_iter_collection(self):
    # Using the trick from capreolus: pull document content out of a public index.
    # <https://github.com/capreolus-ir/capreolus/blob/d6ae210b24c32ff817f615370a9af37b06d2da89/capreolus/collection/robust04.yaml#L15>
    index = indices.AnseriniIndex('../Tweets2013')
    for did in self.logger.pbar(index.docids(), desc='documents'):
        raw_doc = index.get_raw(did)
        # Extract the tweet text from the raw JSON with a regex rather than
        # json.loads, to avoid parsing the full document.
        pattern = '"text":"(.*?)","source":'
        raw_txt = re.search(pattern, raw_doc).group(1)
        yield indices.RawDoc(did, raw_txt)
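
# A more robust (if slower) alternative to the regex above is the json.loads
# route it sidesteps. A minimal sketch, assuming each raw document is a valid
# JSON tweet object with a top-level "text" field; _extract_tweet_text is a
# hypothetical helper, not part of this codebase:
import json

def _extract_tweet_text(raw_doc):
    # Parse the whole document; immune to escaped quotes inside "text",
    # which would break the regex extraction.
    return json.loads(raw_doc)['text']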
def _init_iter_collection(self):
    # Using the trick from capreolus: pull document content out of a public index.
    # <https://github.com/capreolus-ir/capreolus/blob/d6ae210b24c32ff817f615370a9af37b06d2da89/capreolus/collection/robust04.yaml#L15>
    with util.download_tmp(**_FILES['index']) as f:
        fd = f'{f.name}.d'
        util.extract_tarball(f.name, fd, self.logger, reset_permissions=True)
        index = indices.AnseriniIndex(f'{fd}/index-robust04-20191213')
        for did in self.logger.pbar(index.docids(), desc='documents'):
            raw_doc = index.get_raw(did)
            yield indices.RawDoc(did, raw_doc)
def test_batch_query(self):
    df = list(plaintext.read_tsv('etc/dummy_datafile.tsv'))
    docs = [indices.RawDoc(did, dtext) for t, did, dtext in df if t == 'doc']
    queries = [(qid, qtext) for t, qid, qtext in df if t == 'query']
    with tempfile.TemporaryDirectory() as tmpdir:
        idxs = [
            indices.AnseriniIndex(os.path.join(tmpdir, 'anserini')),
        ]
        models = [
            'bm25',
            'bm25_k1-1.5',
            'bm25_b-0.2',
            'bm25_k1-1.6_b-0.8',
            'bm25_rm3',
            'bm25_rm3_k1-1.5',
            'bm25_rm3_b-0.2',
            'bm25_rm3_k1-1.6_b-0.8',
            'bm25_rm3_rm3.fbTerms-2_rm3.fbDocs-2',
            'bm25_rm3_rm3.fbTerms-2_rm3.fbDocs-2_k1-1.5',
            'bm25_rm3_rm3.fbTerms-2_rm3.fbDocs-2_b-0.2',
            'bm25_rm3_rm3.fbTerms-2_rm3.fbDocs-2_k1-1.6_b-0.8',
            'ql',
            'ql_mu-0.4',
            'sdm',
            'sdm_uw-0.3_ow-0.2_tw-0.5',
        ]
        for index in idxs:
            index.build(docs)
            for model in models:
                with self.subTest(index=index, model=model):
                    index.batch_query(queries, model, topk=10)
                    index.batch_query(queries, model, topk=10, quiet=True)
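
# The model strings above follow an underscore-delimited grammar: a base
# ranker ('bm25', 'ql', 'sdm'), an optional 'rm3' expansion flag, then
# name-value parameters joined by '-'. A minimal sketch of that grammar;
# parse_model_string is a hypothetical helper, not part of this codebase:
def parse_model_string(model):
    parts = model.split('_')
    ranker = parts[0]                           # 'bm25', 'ql', or 'sdm'
    rm3 = len(parts) > 1 and parts[1] == 'rm3'  # optional RM3 query expansion
    params = {}
    for part in parts[2 if rm3 else 1:]:
        name, _, value = part.rpartition('-')   # 'k1-1.6' -> ('k1', '1.6')
        params[name] = float(value)
    return ranker, rm3, params

# e.g. parse_model_string('bm25_rm3_k1-1.6_b-0.8')
#   -> ('bm25', True, {'k1': 1.6, 'b': 0.8})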
def __init__(self, config, logger, vocab):
    super().__init__(config, logger, vocab)
    if config['ds']:
        ds = ir_datasets.load(config['ds'])
        if not config['docs_ds']:
            # HACK: find the "parent" dataset that contains the same docs handler,
            # so we don't re-build the index for the same collection
            segments = config['ds'].split('/')
            docs_handler = ds.docs_handler()
            parent_docs_ds = config['ds']
            while len(segments) > 1:
                segments = segments[:-1]
                parent_ds = ir_datasets.load('/'.join(segments))
                if parent_ds.has_docs() and parent_ds.docs_handler() == docs_handler:
                    parent_docs_ds = '/'.join(segments)
            config['docs_ds'] = parent_docs_ds
        if not config['queries_ds']:
            config['queries_ds'] = config['ds']
    if config['doc_fields']:
        if not config['docs_index_fields']:
            config['docs_index_fields'] = config['doc_fields']
        if not config['docs_rerank_fields']:
            config['docs_rerank_fields'] = config['doc_fields']
    if config['query_fields']:
        if not config['queries_index_fields']:
            config['queries_index_fields'] = config['query_fields']
        if not config['queries_rerank_fields']:
            config['queries_rerank_fields'] = config['query_fields']
    self.docs_ds = ir_datasets.load(config['docs_ds'])
    self.queries_ds = ir_datasets.load(config['queries_ds'])
    assert self.docs_ds.has_docs()
    assert self.queries_ds.has_queries()
    # Default any unset field lists to all fields of the doc/query types
    # (skipping the first field, the ID).
    if not config['docs_index_fields']:
        config['docs_index_fields'] = ','.join(self.docs_ds.docs_cls()._fields[1:])
        self.logger.info('auto-filled docs_index_fields as {docs_index_fields}'.format(**config))
    if not config['docs_rerank_fields']:
        config['docs_rerank_fields'] = ','.join(self.docs_ds.docs_cls()._fields[1:])
        self.logger.info('auto-filled docs_rerank_fields as {docs_rerank_fields}'.format(**config))
    if not config['queries_index_fields']:
        config['queries_index_fields'] = ','.join(self.queries_ds.queries_cls()._fields[1:])
        self.logger.info('auto-filled queries_index_fields as {queries_index_fields}'.format(**config))
    if not config['queries_rerank_fields']:
        config['queries_rerank_fields'] = ','.join(self.queries_ds.queries_cls()._fields[1:])
        self.logger.info('auto-filled queries_rerank_fields as {queries_rerank_fields}'.format(**config))
    base_path = os.path.join(util.path_dataset(self), sanitize_path(self.config['docs_ds']))
    os.makedirs(base_path, exist_ok=True)
    real_anserini_path = os.path.join(
        base_path, 'anserini.porter.{docs_index_fields}'.format(**self.config))
    os.makedirs(real_anserini_path, exist_ok=True)
    # Symlink a per-queries-dataset "virtual" index onto the real one so the
    # same built index is shared across query sets.
    virtual_anserini_path = '{}.{}'.format(real_anserini_path, sanitize_path(config['queries_ds']))
    if not os.path.exists(virtual_anserini_path):
        os.symlink(real_anserini_path, virtual_anserini_path, target_is_directory=True)
    self.index = indices.AnseriniIndex(virtual_anserini_path, stemmer='porter')
    self.doc_store = indices.IrdsDocstore(self.docs_ds.docs_store(), config['docs_rerank_fields'])
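
# The parent-dataset lookup above distills to this standalone sketch.
# find_parent_docs_ds is a hypothetical name; the ir_datasets calls mirror
# the ones used in __init__:
def find_parent_docs_ds(ds_id):
    import ir_datasets
    # Walk up the '/'-separated dataset ID, keeping the most general ancestor
    # that exposes the same docs handler (i.e., the same document collection).
    docs_handler = ir_datasets.load(ds_id).docs_handler()
    segments = ds_id.split('/')
    parent_docs_ds = ds_id
    while len(segments) > 1:
        segments = segments[:-1]
        parent = ir_datasets.load('/'.join(segments))
        if parent.has_docs() and parent.docs_handler() == docs_handler:
            parent_docs_ds = '/'.join(segments)
    return parent_docs_ds

# e.g. find_parent_docs_ds('msmarco-passage/trec-dl-2019') would resolve to
# 'msmarco-passage', so the index is built once for the shared collection.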
def execute(self):
    idxs = [indices.AnseriniIndex(self.path, stemmer=self.stemmer)]
    _init_indices_parallel(idxs, _iter_collection(self.collection.path), True)