Example #1
def number_feature(data_set_path: str, db_path: str, max_sent_num: int):
    # Assumes numpy (as np), FeverDocDB, tokenize, is_token_numeric,
    # evidence_num_to_text and _interprete_num_result are provided by the
    # surrounding project.
    from common.dataset.reader import JSONLineReader
    db = FeverDocDB(db_path)
    jlr = JSONLineReader()
    lines = jlr.read(data_set_path)
    num_feat = np.zeros([len(lines), max_sent_num, 3], dtype=np.int32)
    for i, line in enumerate(lines):
        claim_text = line['claim']
        claim_tokens = tokenize(claim_text)
        # Collect every numeric token that appears in the claim.
        all_nums = set()
        for token in claim_tokens:
            if is_token_numeric(token):
                all_nums.add(float(token))
        for j, evidence in enumerate(line['predicted_evidence']):
            if j >= max_sent_num:
                break
            page, line_num = evidence[-2], evidence[-1]
            all_evidence_nums = []
            evidence_text = evidence_num_to_text(db, page, line_num)
            evidence_tokens = tokenize(evidence_text)
            for token in evidence_tokens:
                if is_token_numeric(token):
                    all_evidence_nums.append(float(token))
            # Three binary features per evidence sentence: it contains a
            # number at all, a number also present in the claim, and a
            # number absent from the claim.
            has_num = len(all_evidence_nums) > 0
            has_identical_num = any(n in all_nums for n in all_evidence_nums)
            has_different_num = any(n not in all_nums
                                    for n in all_evidence_nums)
            num_feat[i, j] = _interprete_num_result(
                has_num, has_identical_num, has_different_num)
    return num_feat
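The helpers tokenize, is_token_numeric, evidence_num_to_text and _interprete_num_result come from the surrounding project and are not shown. As a hedged illustration only, is_token_numeric could be as simple as the following sketch (hypothetical, not necessarily the project's implementation):

def is_token_numeric(token: str) -> bool:
    # A token counts as numeric if it parses as a float ('3', '4.5', '1e3').
    try:
        float(token)
        return True
    except ValueError:
        return False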
Example #2
def single_sentence_set_2_ids_given_vocab(texts, vocab_dict):
    logger = LogHelper.get_logger("single_sentence_set_2_ids_given_vocab")
    doc_ids = []
    out_of_vocab_counts = 0
    for sent in texts:
        tokens = tokenize(sent)
        word_ids = []
        for token in tokens:
            # Lower-cased lookup; unknown tokens map to the reserved 'UNK' id.
            if token.lower() in vocab_dict:
                word_ids.append(vocab_dict[token.lower()])
            else:
                out_of_vocab_counts += 1
                word_ids.append(vocab_dict['UNK'])
        doc_ids.append(word_ids)
    logger.debug("{} tokens out of vocab".format(out_of_vocab_counts))
    return doc_ids
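A minimal usage sketch, assuming a toy vocabulary that already contains the reserved 'UNK' id (tokenize comes from the surrounding project):

vocab = {'UNK': 1, 'the': 2, 'cat': 3, 'sat': 4}
doc_ids = single_sentence_set_2_ids_given_vocab(["The cat sat"], vocab)
# -> [[2, 3, 4]]; any token missing from vocab maps to 1 ('UNK')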
Example #3
def single_sentence_set_2_fasttext_embedded(sents: List[str],
                                            fasttext_model: Union[str,
                                                                  FastText]):
    logger = LogHelper.get_logger("single_sentence_set_2_fasttext_embedded")
    if isinstance(fasttext_model, str):
        fasttext_model = FastText.load_fasttext_format(fasttext_model)
    fasttext_embeddings = []
    for sent in sents:
        tokens = tokenize(sent)
        sent_embeddings = []
        for token in tokens:
            try:
                sent_embeddings.append(fasttext_model[token.lower()])
            except KeyError:
                # Fall back to an all-ones vector when even the token's
                # subword n-grams are unseen; the original read the embedding
                # size from an undefined module-level dim_fasttext constant.
                sent_embeddings.append(
                    np.ones([fasttext_model.vector_size], np.float32))
        fasttext_embeddings.append(sent_embeddings)
    return fasttext_embeddings, fasttext_model
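A minimal usage sketch; the path is a hypothetical stand-in, and the returned model is passed back in so later calls skip reloading the binary:

embeddings, model = single_sentence_set_2_fasttext_embedded(
    ["The cat sat on the mat."], "/path/to/fasttext.bin")
more_embeddings, model = single_sentence_set_2_fasttext_embedded(
    ["Another sentence."], model)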


def vocab_map(vocab):
    # Reconstructed header: the snippet above is cut off and only the last
    # two lines of this function survive. Ids are assumed to start at 2,
    # keeping 1 for 'UNK' and 0 free for padding.
    voc_dict = {token: i for i, token in enumerate(vocab, start=2)}
    voc_dict['UNK'] = 1
    return voc_dict


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('db', help='/path/to/db/file')
    parser.add_argument('output', help='/path/to/output/pickle/file')
    args = parser.parse_args()
    LogHelper.setup()
    logger = LogHelper.get_logger("generate_vocab_all_wiki")
    db = FeverDocDB(args.db)
    vocab = set()
    for doc in tqdm(db.get_doc_ids()):
        lines = db.get_doc_lines(doc)
        lines = lines.split("\n")
        for line in lines:
            # Each wiki line is tab-separated: sentence id, sentence text,
            # then any linked entities; keep only the text field.
            segments = line.split("\t")
            if len(segments) < 2:
                continue
            line = segments[1]
            if line.strip() == "":
                continue
            tokens = set(token.lower() for token in tokenize(clean_text(line)))
            vocab.update(tokens)
    logger.info("total size of vocab: " + str(len(vocab)))
    vocab_dict = vocab_map(vocab)
    del vocab
    with open(args.output, 'wb') as f:
        pickle.dump(vocab_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
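The pickled dictionary is the same vocab_dict consumed by single_sentence_set_2_ids_given_vocab in Example #2. Loading it back is a one-liner (the path is a stand-in):

import pickle

with open('/path/to/output/pickle/file', 'rb') as f:
    vocab_dict = pickle.load(f)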