trQ_ids = token2id(trQ_tokens, tr_vocab, unk_id=1)
trQ_tokens = [["<S>"] + doc + ["</S>"] for doc in trQ_tokens]
trQ_ids = [[2] + qsent + [3] for qsent in trQ_ids]
# print(trQ_ids[:10])

# tags
vocab_tag = [''] + list(nlp.tagger.labels)
trC_tag_ids = token2id(trC_tags, vocab_tag)
# entities
vocab_ent = list(set([ent for sent in trC_ents for ent in sent]))
trC_ent_ids = token2id(trC_ents, vocab_ent, unk_id=0)
log.info('Found {} POS tags.'.format(len(vocab_tag)))
log.info('Found {} entity tags: {}'.format(len(vocab_ent), vocab_ent))
log.info('vocabulary for training is built.')

tr_embedding = build_embedding(wv_file, tr_vocab, wv_dim)
log.info('got embedding matrix for training.')

meta = {'vocab': tr_vocab, 'embedding': tr_embedding.tolist()}
with open('CoQA/train_meta.msgpack', 'wb') as f:
    msgpack.dump(meta, f)

prev_CID, first_question = -1, []
for i, CID in enumerate(train.context_idx):
    if CID != prev_CID:
        first_question.append(i)
    prev_CID = CID
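# token2id is defined elsewhere in this repo and is not shown in this excerpt.
# The sketch below illustrates its assumed behaviour, inferred only from how it
# is called above (list of token lists in, list of id lists out, with an
# optional unk_id fallback); _token2id_sketch is a hypothetical name used purely
# for illustration.
def _token2id_sketch(docs, vocab, unk_id=None):
    w2id = {w: i for i, w in enumerate(vocab)}
    return [[w2id.get(w, unk_id) for w in doc] for doc in docs]
# Under this reading, id 1 is the unknown-word id passed above, and the literal
# 2 / 3 prepended and appended to each question correspond to the "<S>" / "</S>"
# sentence markers added to trQ_tokens.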
# tokens
testC_tokens = [[normalize_text(w.text) for w in doc] for doc in testC_docs]
testQ_tokens = [[normalize_text(w.text) for w in doc] for doc in testQ_docs]

test_vocab = build_test_vocab(testQ_tokens, testC_tokens)  # tr_vocab is a subset of test_vocab
testC_ids = token2id(testC_tokens, test_vocab, unk_id=1)
testQ_ids = token2id(testQ_tokens, test_vocab, unk_id=1)
# tags
vocab_tag = list(nlp.tagger.tag_names)
testC_tag_ids = token2id(testC_tags, vocab_tag)  # vocab_tag same as training
# entities
vocab_ent = [''] + nlp.entity.cfg[u'actions']['1']
testC_ent_ids = token2id(testC_ents, vocab_ent)  # vocab_ent same as training
log.info('vocabulary for test is built.')

test_embedding = build_embedding(wv_file, test_vocab, wv_dim)  # tr_embedding is a submatrix of test_embedding
log.info('got embedding matrix for test.')

# don't store row name in csv
test.to_csv('SQuAD/test.csv', index=False, encoding='utf8')

meta = {'vocab': test_vocab, 'embedding': test_embedding.tolist()}
with open('SQuAD/test_meta.msgpack', 'wb') as f:
    msgpack.dump(meta, f)

result = {
    'question_ids': testQ_ids,
    'context_ids': testC_ids,
    'context_features': testC_features,  # exact match, tf
    'context_tags': testC_tag_ids,  # POS tagging
trC_tag_ids = token2id(trC_tags, vocab_tag)  # after each context word is mapped to an id, its POS tag is likewise mapped to a tag_id
# entities
vocab_ent = list(set([ent for sent in trC_ents for ent in sent]))  # collect every named-entity type appearing in the contexts into a list
# print("vocab_ent = ", vocab_ent)
# exit(789)
trC_ent_ids = token2id(trC_ents, vocab_ent, unk_id=0)  # why is unk_id still needed here?
log.info('Found {} POS tags.'.format(len(vocab_tag)))
log.info('Found {} entity tags: {}'.format(len(vocab_ent), vocab_ent))
log.info('vocabulary for training is built.')
# exit(89)
# next: how the embedding vectors are obtained
tr_embedding = build_embedding(wv_file, tr_vocab, wv_dim)  # tr_vocab is simply a list; the return value is a numpy array
log.info('got embedding matrix for training.')

meta = {'vocab': tr_vocab, 'embedding': tr_embedding.tolist()}
with open('CoQA/train_meta.msgpack', 'wb') as f:
    msgpack.dump(meta, f)

prev_CID, first_question = -1, []
for i, CID in enumerate(train.context_idx):
    if CID != prev_CID:
        first_question.append(i)
    prev_CID = CID

result = {
    'question_ids': trQ_ids,
    'context_ids': trC_ids,
def preprocess_eval_data(filename, output_msgpack):
    EvalData = process_jsonlines(filename)
    filename = os.path.basename(filename)
    log.info(filename + ' flattened.')

    EvalDataP_iter = (pre_proc(p) for p in EvalData.P)
    EvalDataH_iter = (pre_proc(h) for h in EvalData.H)
    EvalDataP_docs = [doc for doc in nlp.pipe(
        EvalDataP_iter, batch_size=64, n_threads=args.threads)]
    EvalDataH_docs = [doc for doc in nlp.pipe(
        EvalDataH_iter, batch_size=64, n_threads=args.threads)]

    # tokens
    EvalDataP_tokens = [[normalize_text(w.text) for w in doc]
                        for doc in EvalDataP_docs]
    EvalDataH_tokens = [[normalize_text(w.text) for w in doc]
                        for doc in EvalDataH_docs]
    log.info('All tokens for ' + filename + ' are obtained.')

    # features
    EvalDataP_tags, EvalDataP_ents, EvalDataP_features = feature_gen(
        EvalDataP_docs, EvalDataH_docs)
    EvalDataH_tags, EvalDataH_ents, EvalDataH_features = feature_gen(
        EvalDataH_docs, EvalDataP_docs)
    log.info('features for ' + filename + ' is generated.')

    def build_EvalData_vocab(A, B):  # most vocabulary comes from tr_vocab
        existing_vocab = set(tr_vocab)
        new_vocab = list(set([w for doc in A + B for w in doc
                              if w not in existing_vocab and w in glove_vocab]))
        vocab = tr_vocab + new_vocab
        log.info('train vocab {0}, total vocab {1}'.format(len(tr_vocab), len(vocab)))
        return vocab

    # vocab
    EvalData_vocab = build_EvalData_vocab(
        EvalDataP_tokens, EvalDataH_tokens)  # tr_vocab is a subset of EvalData_vocab
    EvalDataP_ids = token2id(EvalDataP_tokens, EvalData_vocab, unk_id=1)
    EvalDataH_ids = token2id(EvalDataH_tokens, EvalData_vocab, unk_id=1)

    # tags
    EvalDataP_tag_ids = token2id(EvalDataP_tags, vocab_tag)
    EvalDataH_tag_ids = token2id(EvalDataH_tags, vocab_tag)  # vocab_tag same as training
    # entities
    EvalDataP_ent_ids = token2id(EvalDataP_ents, vocab_ent)  # vocab_ent same as training
    EvalDataH_ent_ids = token2id(EvalDataH_ents, vocab_ent)  # vocab_ent same as training
    log.info('vocabulary for ' + filename + ' is built.')

    EvalData_embedding = build_embedding(
        wv_file, EvalData_vocab, wv_dim)  # tr_embedding is a submatrix of EvalData_embedding
    log.info('got embedding matrix for ' + filename)

    result = {
        'premise_ids': EvalDataP_ids,
        'premise_features': EvalDataP_features,  # exact match, tf
        'premise_tags': EvalDataP_tag_ids,  # POS tagging
        'premise_ents': EvalDataP_ent_ids,  # Entity recognition
        'hypothesis_ids': EvalDataH_ids,
        'hypothesis_features': EvalDataH_features,  # exact match, tf
        'hypothesis_tags': EvalDataH_tag_ids,  # POS tagging
        'hypothesis_ents': EvalDataH_ent_ids,  # Entity recognition
        'vocab': EvalData_vocab,
        'embedding': EvalData_embedding.tolist(),
        'answers': EvalData.label
    }
    with open(output_msgpack, 'wb') as f:
        msgpack.dump(result, f)
    log.info('saved ' + output_msgpack + ' to disk.')
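# build_embedding is also defined elsewhere in the repo; the sketch below shows
# the behaviour assumed by the calls above: read a GloVe-style text file and fill
# a len(vocab) x wv_dim matrix, row i holding the pretrained vector of vocab[i]
# and rows for words absent from the file staying at zero. Because
# build_EvalData_vocab returns tr_vocab + new_vocab (same order, only appended),
# the first len(tr_vocab) rows coincide with tr_embedding, which is what the
# "submatrix" comment refers to. _build_embedding_sketch is a hypothetical name
# used only for illustration.
import numpy as np

def _build_embedding_sketch(embed_file, targ_vocab, wv_dim):
    w2id = {w: i for i, w in enumerate(targ_vocab)}
    emb = np.zeros((len(targ_vocab), wv_dim))
    with open(embed_file, encoding='utf-8') as f:
        for line in f:
            elems = line.rstrip().split(' ')
            token = ' '.join(elems[0:-wv_dim])  # a handful of GloVe entries contain spaces
            if token in w2id:
                emb[w2id[token]] = [float(v) for v in elems[-wv_dim:]]
    return emb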
def preprocess_data(dev_file):
    dev, dev_context = flatten_json(dev_file, proc_dev)
    dev = pd.DataFrame(dev, columns=[
        'context_idx', 'question', 'answer', 'answer_start', 'answer_end',
        'answer_choice', 'all_answer', 'qid'
    ])
    print('dev json data flattened.')

    devC_iter = (pre_proc(c) for c in dev_context)
    devQ_iter = (pre_proc(q) for q in dev.question)
    nlp = spacy.load('en', disable=['parser'])
    devC_docs = [doc for doc in nlp.pipe(
        devC_iter, batch_size=64, n_threads=multiprocessing.cpu_count())]
    devQ_docs = [doc for doc in nlp.pipe(
        devQ_iter, batch_size=64, n_threads=multiprocessing.cpu_count())]
    del nlp

    devC_tokens = [[normalize_text(w.text) for w in doc] for doc in devC_docs]
    devQ_tokens = [[normalize_text(w.text) for w in doc] for doc in devQ_docs]
    devC_unnorm_tokens = [[w.text for w in doc] for doc in devC_docs]
    print('All tokens for dev are obtained.')

    dev_context_span = [get_context_span(a, b)
                        for a, b in zip(dev_context, devC_unnorm_tokens)]
    print('context span for dev is generated.')

    ans_st_token_ls, ans_end_token_ls = [], []
    for ans_st, ans_end, idx in zip(dev.answer_start, dev.answer_end, dev.context_idx):
        ans_st_token, ans_end_token = find_answer_span(dev_context_span[idx], ans_st, ans_end)
        ans_st_token_ls.append(ans_st_token)
        ans_end_token_ls.append(ans_end_token)
    dev['answer_start_token'], dev['answer_end_token'] = ans_st_token_ls, ans_end_token_ls

    initial_len = len(dev)
    dev.dropna(inplace=True)  # modify self DataFrame
    print('drop {0}/{1} inconsistent samples.'.format(initial_len - len(dev), initial_len))
    print('answer span for dev is generated.')

    devC_tags, devC_ents, devC_features = feature_gen(devC_docs, dev.context_idx, devQ_docs, False)
    print('features for dev is generated: {}, {}, {}'.format(
        len(devC_tags), len(devC_ents), len(devC_features)))

    dev_vocab = build_dev_vocab(devQ_tokens, devC_tokens)  # tr_vocab is a subset of dev_vocab
    devC_ids = token2id(devC_tokens, dev_vocab, unk_id=1)
    devQ_ids = token2id(devQ_tokens, dev_vocab, unk_id=1)
    devQ_tokens = [["<S>"] + doc + ["</S>"] for doc in devQ_tokens]
    devQ_ids = [[2] + qsent + [3] for qsent in devQ_ids]

    # BERT stuff
    devC_bert_tokens = tokenize(devC_tokens)
    devC_bert_ids = [bert_tokens_to_ids(x) for x in devC_bert_tokens]
    devQ_bert_tokens = tokenize(devQ_tokens)
    devQ_bert_ids = [bert_tokens_to_ids(x) for x in devQ_bert_tokens]
    devC_bert_spans = [calc_bert_spans(b, t)
                       for b, t in zip(devC_bert_tokens, devC_tokens)]
    devQ_bert_spans = [calc_bert_spans(b, t)
                       for b, t in zip(devQ_bert_tokens, devQ_tokens)]

    vocab_tag = pickle.load(open('./vocab_tag.pkl', 'rb'))
    vocab_ent = pickle.load(open('./vocab_ent.pkl', 'rb'))
    devC_tag_ids = token2id(devC_tags, vocab_tag)  # vocab_tag same as training
    # entities
    devC_ent_ids = token2id(devC_ents, vocab_ent, unk_id=0)  # vocab_ent same as training
    print('vocabulary for dev is built.')

    dev_embedding = build_embedding('glove/glove.840B.300d.txt', dev_vocab, 300)
    meta = {'vocab': dev_vocab, 'embedding': dev_embedding.tolist()}

    prev_CID, first_question = -1, []
    for i, CID in enumerate(dev.context_idx):
        if CID != prev_CID:
            first_question.append(i)
        prev_CID = CID

    result = {
        'qids': dev.qid.tolist(),
        'question_ids': devQ_ids,
        'context_ids': devC_ids,
        'context_features': devC_features,  # exact match, tf
        'context_tags': devC_tag_ids,  # POS tagging
        'context_ents': devC_ent_ids,  # Entity recognition
        'context': dev_context,
        'context_span': dev_context_span,
        '1st_question': first_question,
        'question_CID': dev.context_idx.tolist(),
        'question': dev.question.tolist(),
        'answer': dev.answer.tolist(),
        'answer_start': dev.answer_start_token.tolist(),
        'answer_end': dev.answer_end_token.tolist(),
        'answer_choice': dev.answer_choice.tolist(),
        'all_answer': dev.all_answer.tolist(),
        'context_tokenized': devC_tokens,
        'question_tokenized': devQ_tokens,
        'context_bertidx': devC_bert_ids,
        'context_bert_spans': devC_bert_spans,
        'question_bertidx': devQ_bert_ids,
        'question_bert_spans': devQ_bert_spans
    }
    return meta, result
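# The span helpers used above (get_context_span, find_answer_span,
# calc_bert_spans) are defined elsewhere in the repo; the sketches below only
# illustrate the assumed behaviour, under hypothetical _*_sketch names.
#
# get_context_span is assumed to return, for every un-normalized token, its
# (char_start, char_end) offsets in the raw context; find_answer_span is then
# assumed to map a character-level answer span onto token indices. Answers whose
# character offsets do not line up with any token would yield None, which turns
# into NaN in the DataFrame and is what the dropna() call above removes.
def _get_context_span_sketch(context, tokens):
    spans, p = [], 0
    for token in tokens:
        p = context.find(token, p)
        spans.append((p, p + len(token)))
        p += len(token)
    return spans

def _find_answer_span_sketch(context_span, ans_start, ans_end):
    start_token = end_token = None
    for i, (st, ed) in enumerate(context_span):
        if st <= ans_start < ed:
            start_token = i
        if st < ans_end <= ed:
            end_token = i
    return start_token, end_token

# calc_bert_spans is assumed to record, for each original spaCy token, the
# [start, end) range of its WordPieces inside the flattened BERT sequence, so
# token-level annotations can later be projected onto sub-token representations.
# This sketch assumes bert_tokens holds one WordPiece list per original token,
# in which case orig_tokens is kept only to mirror the real call signature.
def _calc_bert_spans_sketch(bert_tokens, orig_tokens):
    spans, offset = [], 0
    for pieces in bert_tokens:
        spans.append((offset, offset + len(pieces)))
        offset += len(pieces)
    return spans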