def _create_token_set_of_db(db):
    """Collect every distinct lower-cased token occurring in the DB's non-empty docs.

    :param db: an open ``FeverDocDB`` instance, or a path string to the DB file
    :return: set of lower-cased token strings
    """
    logger.debug("start creating token set for DB...")
    if isinstance(db, str):  # isinstance, not type(...) == str
        db = FeverDocDB(db)
    token_set = set()
    for doc_id in tqdm(db.get_non_empty_doc_ids()):
        for line in db.get_doc_lines(doc_id):
            # set.update dedupes for us — no membership pre-check needed,
            # and .lower() is computed exactly once per token.
            token_set.update(token.lower() for token in tokenize(clean_text(line)))
    return token_set
def _create_db_vocab_idx(db, _global_dict):
    """Return the sorted global-vocabulary indices of all DB tokens found in `_global_dict`.

    :param db: an open ``FeverDocDB`` instance, or a path string to the DB file
    :param _global_dict: mapping of lower-cased token -> global vocabulary index
    :return: sorted list of indices (may contain duplicates if two tokens
             share an index, matching the original token-keyed behavior)
    """
    logger.debug("start creating vocab indices for DB...")
    if isinstance(db, str):  # isinstance, not type(...) == str
        db = FeverDocDB(db)
    vocab_idx = {}
    for doc_id in tqdm(db.get_non_empty_doc_ids()):
        for line in db.get_doc_lines(doc_id):
            for token in tokenize(clean_text(line)):
                key = token.lower()  # compute once, reuse for all lookups
                if key not in vocab_idx and key in _global_dict:
                    vocab_idx[key] = _global_dict[key]
    # sorted() accepts any iterable directly — no intermediate list() needed.
    return sorted(vocab_idx.values())
# Script: build a variant of the FEVER test set in which every piece of
# "NOT ENOUGH INFO" evidence is pointed at a random informative page id.
from common.dataset.reader import JSONLineReader
from common.util.random import SimpleRandom
from retrieval.fever_doc_db import FeverDocDB
from retrieval.filter_uninformative import uninformative

# NOTE(review): `argparse`, `json` and `tqdm` are used below but not imported
# in this chunk — presumably imported earlier in the file; verify.
parser = argparse.ArgumentParser()
parser.add_argument('db_path', type=str, help='/path/to/fever.db')
args = parser.parse_args()

jlr = JSONLineReader()
docdb = FeverDocDB(args.db_path)

# Candidate replacement pages: every non-empty doc id that is not flagged
# as uninformative (e.g. disambiguation/list pages — see filter_uninformative).
idx = docdb.get_non_empty_doc_ids()
idx = list(filter(lambda item: not uninformative(item), tqdm(idx)))

r = SimpleRandom.get_instance()
with open("data/fever/test.ns.rand.jsonl", "w+") as f:
    # NOTE(review): indentation reconstructed from a flattened source; the
    # write is placed at line level (every example written out), which matches
    # producing a complete test file — confirm against the original layout.
    for line in jlr.read("data/fever-data/test.jsonl"):
        if line["label"] == "NOT ENOUGH INFO":
            for evidence_group in line['evidence']:
                for evidence in evidence_group:
                    # evidence[2] = page id, evidence[3] = sentence number.
                    # NOTE(review): if next_rand(a, b) is inclusive of b (like
                    # random.randint), idx[len(idx)] raises IndexError —
                    # confirm SimpleRandom's bounds convention.
                    evidence[2] = idx[r.next_rand(0, len(idx))]
                    # -1 marks "no specific sentence" for the random page.
                    evidence[3] = -1
        f.write(json.dumps(line) + "\n")