def _create_token_set_of_db(db):
    """Collect the lower-cased tokens found in every non-empty document of a DB.

    Every line yielded by ``db.get_doc_lines(doc_id)`` is passed through
    ``clean_text`` and then ``tokenize``; each resulting token is lower-cased
    and added to the result set.

    Args:
        db: Either a database object exposing ``get_non_empty_doc_ids()`` and
            ``get_doc_lines(doc_id)``, or a string, in which case it is
            handed to ``FeverDocDB(...)`` to obtain such an object.

    Returns:
        set: The distinct lower-cased tokens.
    """
    logger.debug("start creating token set for DB...")
    # isinstance (instead of an exact type comparison) also accepts str
    # subclasses as the database location.
    if isinstance(db, str):
        db = FeverDocDB(db)
    token_set = set()
    for doc_id in tqdm(db.get_non_empty_doc_ids()):
        for line in db.get_doc_lines(doc_id):
            # set.update de-duplicates by itself, so no membership test is
            # needed and each token is lower-cased exactly once.
            token_set.update(
                token.lower() for token in tokenize(clean_text(line))
            )
    return token_set
def _create_db_vocab_idx(db, _global_dict):
    """Return the sorted ``_global_dict`` values of all tokens present in a DB.

    Every line yielded by ``db.get_doc_lines(doc_id)`` is passed through
    ``clean_text`` and then ``tokenize``. Each token is lower-cased; if the
    lower-cased token is a key of ``_global_dict``, its mapped value is
    recorded once. Tokens missing from ``_global_dict`` are ignored.

    Args:
        db: Either a database object exposing ``get_non_empty_doc_ids()`` and
            ``get_doc_lines(doc_id)``, or a string, in which case it is
            handed to ``FeverDocDB(...)`` to obtain such an object.
        _global_dict: Mapping from lower-cased token to a sortable value
            (presumably an integer vocabulary index -- confirm with the
            caller).

    Returns:
        list: The recorded values in ascending order, one per distinct
        lower-cased token that is a key of ``_global_dict``.
    """
    logger.debug("start creating vocab indices for DB...")
    # isinstance (instead of an exact type comparison) also accepts str
    # subclasses as the database location.
    if isinstance(db, str):
        db = FeverDocDB(db)
    vocab_idx = {}
    for doc_id in tqdm(db.get_non_empty_doc_ids()):
        for line in db.get_doc_lines(doc_id):
            for token in tokenize(clean_text(line)):
                # Lower-case once and reuse the key for every lookup.
                key = token.lower()
                if key not in vocab_idx and key in _global_dict:
                    vocab_idx[key] = _global_dict[key]
    # sorted() accepts any iterable; no intermediate list is required.
    return sorted(vocab_idx.values())
예제 #3
0
from common.dataset.reader import JSONLineReader
from common.util.random import SimpleRandom
from retrieval.fever_doc_db import FeverDocDB
from retrieval.filter_uninformative import uninformative

# Script: rewrite the evidence of "NOT ENOUGH INFO" examples with randomly
# chosen document ids and dump every example as one JSON object per line.

# Command line: a single positional argument, the location of the database.
parser = argparse.ArgumentParser()
parser.add_argument('db_path', type=str, help='/path/to/fever.db')
args = parser.parse_args()

jlr = JSONLineReader()
docdb = FeverDocDB(args.db_path)

# Candidate ids: every non-empty document id that `uninformative` does not
# reject (tqdm only reports progress over the scan).
idx = [
    doc_id
    for doc_id in tqdm(docdb.get_non_empty_doc_ids())
    if not uninformative(doc_id)
]

r = SimpleRandom.get_instance()

with open("data/fever/test.ns.rand.jsonl", "w+") as f:
    for line in jlr.read("data/fever-data/test.jsonl"):
        if line["label"] == "NOT ENOUGH INFO":
            # Walk all evidence entries of all groups in their original order.
            for evidence in (ev for group in line['evidence'] for ev in group):
                # NOTE(review): if SimpleRandom.next_rand treats its upper
                # bound as inclusive, len(idx) can select one past the end of
                # idx -- confirm against SimpleRandom.
                evidence[2] = idx[r.next_rand(0, len(idx))]
                evidence[3] = -1

        # Every example is written out, whether it was modified or not.
        f.write(json.dumps(line))
        f.write("\n")