def init(self, force=False):
    """Prepare the dataset: build indices, and download qrels/topics.

    Args:
        force: when True, rebuild/redownload artifacts even if present.

    Downloads are gated on the user accepting the data usage agreement
    (``self._confirm_dua()``).
    """
    base_path = util.path_dataset(self)
    # Build the document indices (full, stemmed, docstore) in one pass.
    idxs = [self.index, self.index_stem, self.doc_store]
    self._init_indices_parallel(idxs, self._init_iter_collection(), force)

    qrels_file = os.path.join(base_path, 'qrels.robust2004.txt')
    if (force or not os.path.exists(qrels_file)) and self._confirm_dua():
        util.download(**_FILES['qrels'], file_name=qrels_file)

    # Split the full qrels into per-fold files. The full qrels dict is
    # loaded lazily, at most once, rather than re-read for every fold.
    all_qrels = None
    for fold in FOLDS:
        fold_qrels_file = os.path.join(base_path, f'{fold}.qrels')
        if force or not os.path.exists(fold_qrels_file):
            if all_qrels is None:
                all_qrels = trec.read_qrels_dict(qrels_file)
            fold_qrels = {
                qid: dids
                for qid, dids in all_qrels.items()
                if qid in FOLDS[fold]
            }
            trec.write_qrels_dict(fold_qrels_file, fold_qrels)

    query_file = os.path.join(base_path, 'topics.txt')
    if (force or not os.path.exists(query_file)) and self._confirm_dua():
        query_file_stream = util.download_stream(**_FILES['queries'], encoding='utf8')
        with util.finialized_file(query_file, 'wt') as f:
            plaintext.write_tsv(f, trec.parse_query_format(query_file_stream))
def qrels_path(self, fold_qrels_file):
    """Return the path to this fold's qrels file, creating it on demand.

    If the file is missing, the full assessment set is read and filtered
    down to the topic ids in ``self.qids`` before being written out.
    """
    if fold_qrels_file.is_file():
        return fold_qrels_file
    with self.assessments.path.open("r") as handle:
        full_qrels = trec.read_qrels_dict(handle)
    selected = {}
    for topic_id, judgments in full_qrels.items():
        if topic_id in self.qids:
            selected[topic_id] = judgments
    trec.write_qrels_dict(fold_qrels_file, selected)
    return fold_qrels_file
def wrapped(it):
    # Consume the document iterator and, for each document that falls on
    # the requested side of the held-out split (closure vars `is_heldout`
    # and `_HELD_OUT_IDS`), write a trivial qrels entry mapping the doc id
    # to itself with relevance 1, into the closure-provided `file`.
    with util.finialized_file(file, 'wt') as out:
        for doc in it:
            in_heldout = doc.did in _HELD_OUT_IDS
            if in_heldout == is_heldout:
                trec.write_qrels_dict(out, {doc.did: {doc.did: 1}})
def init(self, force=False):
    """Prepare the TREC-COVID dataset: build indices, download topics and qrels.

    Args:
        force: when True, rebuild indices and regenerate qrels even if present.

    Downloads are gated on the user accepting the data usage agreement
    (``self._confirm_dua()``).
    """
    # Build any index that is missing (or all, when forced) from a single
    # pass over the collection, fanned out to one thread per index.
    needs_docs = [
        index
        for index in [self.index_stem, self.index_stem_2020, self.doc_store]
        if force or not index.built()
    ]
    if needs_docs and self._confirm_dua():
        with contextlib.ExitStack() as stack:
            doc_iter = self._init_iter_collection()
            doc_iter = self.logger.pbar(doc_iter, desc='articles')
            doc_iters = util.blocking_tee(doc_iter, len(needs_docs))
            for idx, it in zip(needs_docs, doc_iters):
                if idx is self.index_stem_2020:
                    # this index only covers articles dated in 2020
                    it = (d for d in it if '2020' in d.data['date'])
                stack.enter_context(
                    util.CtxtThread(functools.partial(idx.build, it)))

    # Topic files for rounds 1, 2 and 5: NIST topics plus the UDel
    # reformulated queries appended afterwards (same steps per round).
    self._init_topics(
        'rnd1.tsv',
        'https://ir.nist.gov/covidSubmit/data/topics-rnd1.xml',
        "cf1b605222f45f7dbc90ca8e4d9b2c31",
        'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/topics.covid-round1-udel.xml',
        "2915cf59ae222f0aa20b2a671f67fd7a")
    self._init_topics(
        'rnd2.tsv',
        'https://ir.nist.gov/covidSubmit/data/topics-rnd2.xml',
        "550129e71c83de3fb4d6d29a172c5842",
        'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/topics.covid-round2-udel.xml',
        "a8988734e6f812921d5125249c197985")
    self._init_topics(
        'rnd5.tsv',
        'https://ir.nist.gov/covidSubmit/data/topics-rnd5.xml',
        "0307a37b6b9f1a5f233340a769d538ea",
        'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/topics.covid-round5-udel.xml',
        "966a49487348dc853634bcdd0829fd26")

    # NOTE(review): rnd5.qrels is populated from the round-4 *cumulative*
    # judgments file (the judgments available when round 5 was run) —
    # preserved as-is; confirm this is still the intended source.
    qrels_file = os.path.join(util.path_dataset(self), 'rnd5.qrels')
    if (force or not os.path.exists(qrels_file)) and self._confirm_dua():
        util.download(
            'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round4-cumulative.txt',
            qrels_file)
    # Split into per-fold qrels; the full qrels dict is loaded lazily, at
    # most once, rather than re-read from disk for every fold.
    all_qrels = None
    for fold in FOLDS:
        fold_qrels_file = os.path.join(util.path_dataset(self), f'{fold}-rnd5.qrels')
        if force or not os.path.exists(fold_qrels_file):
            if all_qrels is None:
                all_qrels = trec.read_qrels_dict(qrels_file)
            fold_qrels = {
                qid: dids
                for qid, dids in all_qrels.items()
                if str(qid) in FOLDS[fold]
            }
            trec.write_qrels_dict(fold_qrels_file, fold_qrels)

def _init_topics(self, fname, topics_url, topics_md5, udel_url, udel_md5):
    """Download one round's topic XML as a TSV, then append UDel queries.

    Writes ``fname`` with (qid, 'query'|'quest'|'narr', text) rows from the
    NIST topics file, then appends (qid, 'udel', text) rows from the UDel
    reformulations; a ``.includes_udel`` flag file marks the append as done.
    The topics download is DUA-gated; the udel append (matching the
    original behavior) is not.
    """
    path = os.path.join(util.path_dataset(self), fname)
    if not os.path.exists(path) and self._confirm_dua():
        with util.download_tmp(topics_url, expected_md5=topics_md5) as f, \
             util.finialized_file(path, 'wt') as fout:
            soup = BeautifulSoup(f.read(), 'lxml-xml')
            for topic in soup.find_all('topic'):
                qid = topic['number']
                plaintext.write_tsv(fout, [
                    (qid, 'query', topic.find('query').get_text()),
                    (qid, 'quest', topic.find('question').get_text()),
                    (qid, 'narr', topic.find('narrative').get_text()),
                ])
    udel_flag = path + '.includes_udel'
    if not os.path.exists(udel_flag):
        with open(path, 'at') as fout, util.finialized_file(udel_flag, 'wt'):
            with util.download_tmp(udel_url, expected_md5=udel_md5) as f:
                soup = BeautifulSoup(f.read(), 'lxml-xml')
                for topic in soup.find_all('topic'):
                    qid = topic['number']
                    plaintext.write_tsv(fout, [
                        (qid, 'udel', topic.find('query').get_text()),
                    ])