def init(self, force=False):
    """Prepare this dataset's on-disk resources, building whatever is missing.

    Builds ``self.index``, ``self.index_stem`` and ``self.doc_store`` and
    writes ``queries.tsv`` when they are absent (or unconditionally when
    *force* is true), reading every ``datafile*.tsv`` found in the subset
    directory. Afterwards it waits until ``qrels.txt`` exists there.

    Args:
        force: When true, rebuild every resource even if it already exists.

    Raises:
        SystemExit: With status 1 when ``util.confirm`` returns a falsy value
            while the data files or the qrels file are still missing.
    """
    base_dir = os.path.join(util.path_dataset(self), self.subset)

    if self.subset == 'dummy':
        # The dummy subset is backed by sample files under ./etc; link them
        # into place when they are not there yet.
        datafile = os.path.join(base_dir, 'datafile.tsv')
        qrels = os.path.join(base_dir, 'qrels.txt')
        if not os.path.exists(datafile):
            os.symlink(os.path.abspath('etc/dummy_datafile.tsv'), datafile)
        if not os.path.exists(qrels):
            os.symlink(os.path.abspath('etc/dummy_qrels.txt'), qrels)

    def build_from_docs(target, records):
        # Feed only the 'doc' records to the target's build().
        return target.build(
            indices.RawDoc(did, txt) for t, did, txt in records if t == 'doc')

    # Every entry is a callable taking one iterator over (type, id, text)
    # records. functools.partial binds the target now, which avoids the
    # late-binding closure pitfall of a lambda created in a loop.
    needs_datafile = [
        functools.partial(build_from_docs, target)
        for target in (self.index, self.index_stem, self.doc_store)
        if force or not target.built()
    ]

    query_file = os.path.join(base_dir, 'queries.tsv')
    if force or not os.path.exists(query_file):
        def write_queries(records):
            # Only the 'query' records go to the queries file.
            return plaintext.write_tsv(query_file, (
                (qid, txt) for t, qid, txt in records if t == 'query'))
        needs_datafile.append(write_queries)

    if needs_datafile:
        df_glob = os.path.join(base_dir, 'datafile*.tsv')
        # glob() yields matches in arbitrary order; sort so the files are
        # always read in the same order.
        datafiles = sorted(glob(df_glob))
        while not datafiles:
            c = util.confirm(
                f'No data files found. Please move/link data files to {df_glob}.\n'
                'Data files should contain both queries and documents in the '
                'following format (one per line):\n'
                '[query|doc] [TAB] [qid/did] [TAB] [text]')
            if not c:
                sys.exit(1)
            datafiles = sorted(glob(df_glob))
        # from_iterable calls read_tsv for each file only when the chain
        # reaches it, instead of calling it for every file up front.
        main_iter = itertools.chain.from_iterable(
            plaintext.read_tsv(df) for df in datafiles)
        main_iter = tqdm(main_iter, desc='reading datafiles')
        # One iterator per consumer, all fed from the single pass over the
        # data files.
        iters = util.blocking_tee(main_iter, len(needs_datafile))
        # NOTE(review): presumably CtxtThread runs each consumer in its own
        # thread and joins it when the ExitStack closes - confirm in util.
        with contextlib.ExitStack() as stack:
            for fn, it in zip(needs_datafile, iters):
                stack.enter_context(
                    util.CtxtThread(functools.partial(fn, it)))

    qrels_file = os.path.join(base_dir, 'qrels.txt')
    while not os.path.exists(qrels_file):
        c = util.confirm(
            f'No qrels file found. Please move/link qrels file to {qrels_file}.\n'
            'Qrels file should be in the TREC format:\n'
            '[qid] [SPACE] Q0 [SPACE] [did] [SPACE] [score]')
        if not c:
            sys.exit(1)
def _confirm_dua(self):
    """Return the cached DUA confirmation, asking for it once if needed.

    ``self.DUA`` is presumably a data usage agreement text (TODO confirm).
    ``util.confirm`` is called with the formatted ``self.DUA`` only when no
    answer has been recorded yet (``self._has_confirmed_dua is None``) and
    ``self.DUA`` is not None. The answer is stored on the instance, so later
    calls return it without calling ``util.confirm`` again.

    Returns:
        The stored value of ``self._has_confirmed_dua``. It stays ``None``
        when ``self.DUA`` is None and nothing has been recorded.
    """
    if self._has_confirmed_dua is None and self.DUA is not None:
        # The DUA text may reference the dataset directory via {ds_path}.
        self._has_confirmed_dua = util.confirm(
            self.DUA.format(ds_path=util.path_dataset(self)))
    return self._has_confirmed_dua