Example #1
trQ_ids = token2id(trQ_tokens, tr_vocab, unk_id=1)
trQ_tokens = [["<S>"] + doc + ["</S>"] for doc in trQ_tokens]
trQ_ids = [[2] + qsent + [3] for qsent in trQ_ids]
# print(trQ_ids[:10])
# tags
vocab_tag = [''] + list(nlp.tagger.labels)
trC_tag_ids = token2id(trC_tags, vocab_tag)
# entities
vocab_ent = list(set([ent for sent in trC_ents for ent in sent]))
trC_ent_ids = token2id(trC_ents, vocab_ent, unk_id=0)

log.info('Found {} POS tags.'.format(len(vocab_tag)))
log.info('Found {} entity tags: {}'.format(len(vocab_ent), vocab_ent))
log.info('vocabulary for training is built.')

tr_embedding = build_embedding(wv_file, tr_vocab, wv_dim)
log.info('got embedding matrix for training.')

meta = {
    'vocab': tr_vocab,
    'embedding': tr_embedding.tolist()
}
with open('CoQA/train_meta.msgpack', 'wb') as f:
    msgpack.dump(meta, f)

prev_CID, first_question = -1, []
for i, CID in enumerate(train.context_idx):
    if not (CID == prev_CID):
        first_question.append(i)
    prev_CID = CID
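
The helper token2id is not defined in any of these snippets. Below is a minimal sketch consistent with how it is called above (a list-based vocabulary, an optional unk_id fallback); the original project's implementation may differ in detail.

def token2id(docs, vocab, unk_id=None):
    """Map each token in a list of token lists to its index in vocab."""
    w2id = {w: i for i, w in enumerate(vocab)}
    if unk_id is None:
        # every token is expected to be in vocab (e.g. tag vocabularies built from the data itself)
        return [[w2id[w] for w in doc] for doc in docs]
    return [[w2id.get(w, unk_id) for w in doc] for doc in docs]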
Example #2
# tokens
testC_tokens = [[normalize_text(w.text) for w in doc] for doc in testC_docs]
testQ_tokens = [[normalize_text(w.text) for w in doc] for doc in testQ_docs]
test_vocab = build_test_vocab(
    testQ_tokens, testC_tokens)  # tr_vocab is a subset of test_vocab
testC_ids = token2id(testC_tokens, test_vocab, unk_id=1)
testQ_ids = token2id(testQ_tokens, test_vocab, unk_id=1)
# tags
vocab_tag = list(nlp.tagger.tag_names)
testC_tag_ids = token2id(testC_tags, vocab_tag)  # vocab_tag same as training
# entities
vocab_ent = [''] + nlp.entity.cfg[u'actions']['1']
testC_ent_ids = token2id(testC_ents, vocab_ent)  # vocab_ent same as training
log.info('vocabulary for test is built.')

test_embedding = build_embedding(wv_file, test_vocab, wv_dim)
# tr_embedding is a submatrix of test_embedding
log.info('got embedding matrix for test.')

# don't store row name in csv
test.to_csv('SQuAD/test.csv', index=False, encoding='utf8')

meta = {'vocab': test_vocab, 'embedding': test_embedding.tolist()}
with open('SQuAD/test_meta.msgpack', 'wb') as f:
    msgpack.dump(meta, f)

result = {
    'question_ids': testQ_ids,
    'context_ids': testC_ids,
    'context_features': testC_features,  # exact match, tf
    'context_tags': testC_tag_ids,  # POS tagging
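
build_embedding is likewise not shown in these snippets. A plausible minimal version, assuming wv_file is a GloVe-style text file with one word followed by wv_dim floats per line; words without a pretrained vector keep all-zero rows.

import numpy as np

def build_embedding(embed_file, targ_vocab, wv_dim):
    emb = np.zeros((len(targ_vocab), wv_dim))  # unknown words stay all-zero
    w2id = {w: i for i, w in enumerate(targ_vocab)}
    with open(embed_file, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word = ' '.join(parts[:-wv_dim])  # some GloVe tokens contain spaces
            if word in w2id:
                emb[w2id[word]] = [float(v) for v in parts[-wv_dim:]]
    return emb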
Example #3
    trC_tags, vocab_tag)  # after each word in the context is mapped to an id, its POS tag is likewise mapped to a tag_id
# entities
vocab_ent = list(set([ent for sent in trC_ents
                      for ent in sent]))  # collect every named-entity type that appears in the context into a list
# print("vocab_ent = ",vocab_ent)
# exit(789)

trC_ent_ids = token2id(trC_ents, vocab_ent, unk_id=0)  # why is unk_id still needed here?

log.info('Found {} POS tags.'.format(len(vocab_tag)))
log.info('Found {} entity tags: {}'.format(len(vocab_ent), vocab_ent))
log.info('vocabulary for training is built.')
# exit(89)
# next, look at how the embedding vectors are obtained

tr_embedding = build_embedding(wv_file, tr_vocab,
                               wv_dim)  # tr_vocab is actually a list; the return value is a numpy array
log.info('got embedding matrix for training.')

meta = {'vocab': tr_vocab, 'embedding': tr_embedding.tolist()}
with open('CoQA/train_meta.msgpack', 'wb') as f:
    msgpack.dump(meta, f)

prev_CID, first_question = -1, []
for i, CID in enumerate(train.context_idx):
    if not (CID == prev_CID):
        first_question.append(i)
    prev_CID = CID

result = {
    'question_ids': trQ_ids,
    'context_ids': trC_ids,
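
The hard-coded ids in these snippets (unk_id=1, and the [2] ... [3] wrapped around the question ids in Examples #1 and #5) only make sense if the vocabulary begins with a fixed block of special symbols. Below is a sketch of a build_vocab that would satisfy that assumption, with "<PAD>", "<UNK>", "<S>", "</S>" reserved at indices 0-3; this is hypothetical, and the real build_vocab/build_test_vocab may order the remaining words differently.

import collections

def build_vocab(questions, contexts, wv_vocab):
    # count words that also have a pretrained vector, most frequent first
    counter = collections.Counter(w for doc in questions + contexts for w in doc)
    words = sorted((w for w in counter if w in wv_vocab),
                   key=counter.get, reverse=True)
    # indices 0-3 are reserved so that unk_id=1 and the sentence markers 2/3 line up
    return ["<PAD>", "<UNK>", "<S>", "</S>"] + words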
Example #4
def preprocess_eval_data(filename, output_msgpack):
    EvalData = process_jsonlines(filename)

    filename = os.path.basename(filename)
    log.info(filename + ' flattened.')

    EvalDataP_iter = (pre_proc(p) for p in EvalData.P)
    EvalDataH_iter = (pre_proc(h) for h in EvalData.H)
    EvalDataP_docs = [
        doc for doc in nlp.pipe(
            EvalDataP_iter, batch_size=64, n_threads=args.threads)
    ]
    EvalDataH_docs = [
        doc for doc in nlp.pipe(
            EvalDataH_iter, batch_size=64, n_threads=args.threads)
    ]

    # tokens
    EvalDataP_tokens = [[normalize_text(w.text) for w in doc]
                        for doc in EvalDataP_docs]
    EvalDataH_tokens = [[normalize_text(w.text) for w in doc]
                        for doc in EvalDataH_docs]
    log.info('All tokens for ' + filename + ' are obtained.')

    # features
    EvalDataP_tags, EvalDataP_ents, EvalDataP_features = feature_gen(
        EvalDataP_docs, EvalDataH_docs)
    EvalDataH_tags, EvalDataH_ents, EvalDataH_features = feature_gen(
        EvalDataH_docs, EvalDataP_docs)
    log.info('features for ' + filename + ' is generated.')

    def build_EvalData_vocab(A, B):  # most vocabulary comes from tr_vocab
        existing_vocab = set(tr_vocab)
        new_vocab = list(
            set([
                w for doc in A + B for w in doc
                if w not in existing_vocab and w in glove_vocab
            ]))
        vocab = tr_vocab + new_vocab
        log.info('train vocab {0}, total vocab {1}'.format(
            len(tr_vocab), len(vocab)))
        return vocab

    # vocab
    EvalData_vocab = build_EvalData_vocab(
        EvalDataP_tokens,
        EvalDataH_tokens)  # tr_vocab is a subset of EvalData_vocab
    EvalDataP_ids = token2id(EvalDataP_tokens, EvalData_vocab, unk_id=1)
    EvalDataH_ids = token2id(EvalDataH_tokens, EvalData_vocab, unk_id=1)

    # tags
    EvalDataP_tag_ids = token2id(EvalDataP_tags, vocab_tag)
    EvalDataH_tag_ids = token2id(EvalDataH_tags,
                                 vocab_tag)  # vocab_tag same as training

    # entities
    EvalDataP_ent_ids = token2id(EvalDataP_ents,
                                 vocab_ent)  # vocab_ent same as training
    EvalDataH_ent_ids = token2id(EvalDataH_ents,
                                 vocab_ent)  # vocab_ent same as training
    log.info('vocabulary for ' + filename + ' is built.')

    EvalData_embedding = build_embedding(
        wv_file, EvalData_vocab,
        wv_dim)  # tr_embedding is a submatrix of EvalData_embedding
    log.info('got embedding matrix for ' + filename)

    result = {
        'premise_ids': EvalDataP_ids,
        'premise_features': EvalDataP_features,  # exact match, tf
        'premise_tags': EvalDataP_tag_ids,  # POS tagging
        'premise_ents': EvalDataP_ent_ids,  # Entity recognition
        'hypothesis_ids': EvalDataH_ids,
        'hypothesis_features': EvalDataH_features,  # exact match, tf
        'hypothesis_tags': EvalDataH_tag_ids,  # POS tagging
        'hypothesis_ents': EvalDataH_ent_ids,  # Entity recognition
        'vocab': EvalData_vocab,
        'embedding': EvalData_embedding.tolist(),
        'answers': EvalData.label
    }
    with open(output_msgpack, 'wb') as f:
        msgpack.dump(result, f)

    log.info('saved ' + output_msgpack + ' to disk.')
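
feature_gen itself is not included here; the comments only say the per-token features are "exact match, tf". A rough sketch of just those two features, assuming spaCy docs as input (the real function also returns the POS tags and entity labels used above and typically adds lowercase/lemma match variants):

from collections import Counter

def exact_match_tf_features(context_docs, question_docs):
    features = []
    for c_doc, q_doc in zip(context_docs, question_docs):
        q_words = {w.text.lower() for w in q_doc}
        counts = Counter(w.text.lower() for w in c_doc)
        features.append([[float(w.text.lower() in q_words),            # exact match
                          counts[w.text.lower()] / max(len(c_doc), 1)]  # term frequency
                         for w in c_doc])
    return features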
Example #5
File: test_e2e.py  Project: Yash-5/FlowQA
def preprocess_data(dev_file):
    dev, dev_context = flatten_json(dev_file, proc_dev)

    dev = pd.DataFrame(dev,
                       columns=[
                           'context_idx', 'question', 'answer', 'answer_start',
                           'answer_end', 'answer_choice', 'all_answer', 'qid'
                       ])
    print('dev json data flattened.')

    devC_iter = (pre_proc(c) for c in dev_context)
    devQ_iter = (pre_proc(q) for q in dev.question)
    nlp = spacy.load('en', disable=['parser'])
    devC_docs = [
        doc for doc in nlp.pipe(
            devC_iter, batch_size=64, n_threads=multiprocessing.cpu_count())
    ]
    devQ_docs = [
        doc for doc in nlp.pipe(
            devQ_iter, batch_size=64, n_threads=multiprocessing.cpu_count())
    ]
    del nlp

    devC_tokens = [[normalize_text(w.text) for w in doc] for doc in devC_docs]
    devQ_tokens = [[normalize_text(w.text) for w in doc] for doc in devQ_docs]
    devC_unnorm_tokens = [[w.text for w in doc] for doc in devC_docs]
    print('All tokens for dev are obtained.')

    dev_context_span = [
        get_context_span(a, b) for a, b in zip(dev_context, devC_unnorm_tokens)
    ]
    print('context span for dev is generated.')

    ans_st_token_ls, ans_end_token_ls = [], []
    for ans_st, ans_end, idx in zip(dev.answer_start, dev.answer_end,
                                    dev.context_idx):
        ans_st_token, ans_end_token = find_answer_span(dev_context_span[idx],
                                                       ans_st, ans_end)
        ans_st_token_ls.append(ans_st_token)
        ans_end_token_ls.append(ans_end_token)

    dev['answer_start_token'], dev[
        'answer_end_token'] = ans_st_token_ls, ans_end_token_ls
    initial_len = len(dev)
    dev.dropna(inplace=True)  # modify self DataFrame
    print('drop {0}/{1} inconsistent samples.'.format(initial_len - len(dev),
                                                      initial_len))
    print('answer span for dev is generated.')

    devC_tags, devC_ents, devC_features = feature_gen(devC_docs,
                                                      dev.context_idx,
                                                      devQ_docs, False)
    print('features for dev is generated: {}, {}, {}'.format(
        len(devC_tags), len(devC_ents), len(devC_features)))

    dev_vocab = build_dev_vocab(
        devQ_tokens, devC_tokens)  # tr_vocab is a subset of dev_vocab
    devC_ids = token2id(devC_tokens, dev_vocab, unk_id=1)
    devQ_ids = token2id(devQ_tokens, dev_vocab, unk_id=1)
    devQ_tokens = [["<S>"] + doc + ["</S>"] for doc in devQ_tokens]
    devQ_ids = [[2] + qsent + [3] for qsent in devQ_ids]

    # BERT stuff
    devC_bert_tokens = tokenize(devC_tokens)
    devC_bert_ids = [bert_tokens_to_ids(x) for x in devC_bert_tokens]
    devQ_bert_tokens = tokenize(devQ_tokens)
    devQ_bert_ids = [bert_tokens_to_ids(x) for x in devQ_bert_tokens]

    devC_bert_spans = [
        calc_bert_spans(b, t) for b, t in zip(devC_bert_tokens, devC_tokens)
    ]
    devQ_bert_spans = [
        calc_bert_spans(b, t) for b, t in zip(devQ_bert_tokens, devQ_tokens)
    ]

    vocab_tag = pickle.load(open('./vocab_tag.pkl', 'rb'))
    vocab_ent = pickle.load(open('./vocab_ent.pkl', 'rb'))

    devC_tag_ids = token2id(devC_tags, vocab_tag)  # vocab_tag same as training
    # entities
    devC_ent_ids = token2id(devC_ents, vocab_ent,
                            unk_id=0)  # vocab_ent same as training
    print('vocabulary for dev is built.')

    dev_embedding = build_embedding('glove/glove.840B.300d.txt', dev_vocab,
                                    300)

    meta = {'vocab': dev_vocab, 'embedding': dev_embedding.tolist()}

    prev_CID, first_question = -1, []
    for i, CID in enumerate(dev.context_idx):
        if not (CID == prev_CID):
            first_question.append(i)
        prev_CID = CID

    result = {
        'qids': dev.qid.tolist(),
        'question_ids': devQ_ids,
        'context_ids': devC_ids,
        'context_features': devC_features,  # exact match, tf
        'context_tags': devC_tag_ids,  # POS tagging
        'context_ents': devC_ent_ids,  # Entity recognition
        'context': dev_context,
        'context_span': dev_context_span,
        '1st_question': first_question,
        'question_CID': dev.context_idx.tolist(),
        'question': dev.question.tolist(),
        'answer': dev.answer.tolist(),
        'answer_start': dev.answer_start_token.tolist(),
        'answer_end': dev.answer_end_token.tolist(),
        'answer_choice': dev.answer_choice.tolist(),
        'all_answer': dev.all_answer.tolist(),
        'context_tokenized': devC_tokens,
        'question_tokenized': devQ_tokens,
        'context_bertidx': devC_bert_ids,
        'context_bert_spans': devC_bert_spans,
        'question_bertidx': devQ_bert_ids,
        'question_bert_spans': devQ_bert_spans
    }

    return meta, result
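
The BERT helpers used in this example (tokenize, bert_tokens_to_ids, calc_bert_spans) are defined elsewhere in the project. A sketch of what they plausibly do, written against the Hugging Face transformers tokenizer as an assumption (the repository may use a different BERT library and model name):

from transformers import BertTokenizer

bert_tok = BertTokenizer.from_pretrained('bert-base-uncased')  # assumed model name

def tokenize(docs):
    # wordpiece-tokenize each already-tokenized document
    return [[p for w in doc for p in bert_tok.tokenize(w)] for doc in docs]

def bert_tokens_to_ids(pieces):
    return bert_tok.convert_tokens_to_ids(pieces)

def calc_bert_spans(bert_pieces, orig_tokens):
    # for each original token, the [start, end) range of its wordpieces;
    # bert_pieces is accepted only to match the call signature used above
    spans, start = [], 0
    for w in orig_tokens:
        n = len(bert_tok.tokenize(w))
        spans.append((start, start + n))
        start += n
    return spans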