trC_iter = (pre_proc(c) for c in train_context)
trQ_iter = (pre_proc(q) for q in train.question)
trC_docs = [doc for doc in nlp.pipe(trC_iter, batch_size=64, n_threads=args.threads)]
trQ_docs = [doc for doc in nlp.pipe(trQ_iter, batch_size=64, n_threads=args.threads)]

# tokens
trC_tokens = [[re.sub(r'_', ' ', normalize_text(w.text)) for w in doc] for doc in trC_docs]
trQ_tokens = [[re.sub(r'_', ' ', normalize_text(w.text)) for w in doc] for doc in trQ_docs]
trC_unnorm_tokens = [[re.sub(r'_', ' ', w.text) for w in doc] for doc in trC_docs]
log.info('All tokens for training are obtained.')

train_context_span = [get_context_span(a, b) for a, b in zip(train_context, trC_unnorm_tokens)]

ans_st_token_ls, ans_end_token_ls = [], []
for ans_st, ans_end, idx in zip(train.answer_start, train.answer_end, train.context_idx):
    ans_st_token, ans_end_token = find_answer_span(train_context_span[idx], ans_st, ans_end)
    ans_st_token_ls.append(ans_st_token)
    ans_end_token_ls.append(ans_end_token)

ration_st_token_ls, ration_end_token_ls = [], []
for ration_st, ration_end, idx in zip(train.rationale_start, train.rationale_end, train.context_idx):
    ration_st_token, ration_end_token = find_answer_span(train_context_span[idx], ration_st, ration_end)
    ration_st_token_ls.append(ration_st_token)
    ration_end_token_ls.append(ration_end_token)

train['answer_start_token'], train['answer_end_token'] = ans_st_token_ls, ans_end_token_ls
train['rationale_start_token'], train['rationale_end_token'] = ration_st_token_ls, ration_end_token_ls

initial_len = len(train)
train.dropna(inplace=True)  # modify self DataFrame
log.info('drop {0}/{1} inconsistent samples.'.format(initial_len - len(train), initial_len))
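# The loops above depend on find_answer_span() to turn character-level answer
# offsets into token indices. A minimal sketch of the idea, assuming per-token
# (start, end) character spans as produced by get_context_span() (an
# illustration; the repo's helper may use a different sentinel for unmappable
# spans and handle more edge cases):
def find_answer_span_sketch(context_span, ans_start, ans_end):
    start_token, end_token = None, None
    for i, (tok_start, tok_end) in enumerate(context_span):
        if start_token is None and tok_end >= ans_start:
            start_token = i   # first token overlapping the answer
        if tok_start <= ans_end:
            end_token = i     # last token overlapping the answer
    return start_token, end_token

# find_answer_span_sketch([(0, 4), (6, 10)], 6, 10) -> (1, 1)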
def preprocess_data(dev_file):
    dev, dev_context = flatten_json(dev_file, proc_dev)
    dev = pd.DataFrame(dev, columns=[
        'context_idx', 'question', 'answer', 'answer_start', 'answer_end',
        'answer_choice', 'all_answer', 'qid'
    ])
    print('dev json data flattened.')

    devC_iter = (pre_proc(c) for c in dev_context)
    devQ_iter = (pre_proc(q) for q in dev.question)
    nlp = spacy.load('en', disable=['parser'])
    devC_docs = [doc for doc in nlp.pipe(
        devC_iter, batch_size=64, n_threads=multiprocessing.cpu_count())]
    devQ_docs = [doc for doc in nlp.pipe(
        devQ_iter, batch_size=64, n_threads=multiprocessing.cpu_count())]
    del nlp

    devC_tokens = [[normalize_text(w.text) for w in doc] for doc in devC_docs]
    devQ_tokens = [[normalize_text(w.text) for w in doc] for doc in devQ_docs]
    devC_unnorm_tokens = [[w.text for w in doc] for doc in devC_docs]
    print('All tokens for dev are obtained.')

    dev_context_span = [get_context_span(a, b)
                        for a, b in zip(dev_context, devC_unnorm_tokens)]
    print('context span for dev is generated.')

    ans_st_token_ls, ans_end_token_ls = [], []
    for ans_st, ans_end, idx in zip(dev.answer_start, dev.answer_end, dev.context_idx):
        ans_st_token, ans_end_token = find_answer_span(dev_context_span[idx], ans_st, ans_end)
        ans_st_token_ls.append(ans_st_token)
        ans_end_token_ls.append(ans_end_token)
    dev['answer_start_token'], dev['answer_end_token'] = ans_st_token_ls, ans_end_token_ls

    initial_len = len(dev)
    dev.dropna(inplace=True)  # modify self DataFrame
    print('drop {0}/{1} inconsistent samples.'.format(initial_len - len(dev), initial_len))
    print('answer span for dev is generated.')

    devC_tags, devC_ents, devC_features = feature_gen(devC_docs, dev.context_idx, devQ_docs, False)
    print('features for dev is generated: {}, {}, {}'.format(
        len(devC_tags), len(devC_ents), len(devC_features)))

    dev_vocab = build_dev_vocab(devQ_tokens, devC_tokens)  # tr_vocab is a subset of dev_vocab
    devC_ids = token2id(devC_tokens, dev_vocab, unk_id=1)
    devQ_ids = token2id(devQ_tokens, dev_vocab, unk_id=1)
    devQ_tokens = [["<S>"] + doc + ["</S>"] for doc in devQ_tokens]
    devQ_ids = [[2] + qsent + [3] for qsent in devQ_ids]

    # BERT stuff
    devC_bert_tokens = tokenize(devC_tokens)
    devC_bert_ids = [bert_tokens_to_ids(x) for x in devC_bert_tokens]
    devQ_bert_tokens = tokenize(devQ_tokens)
    devQ_bert_ids = [bert_tokens_to_ids(x) for x in devQ_bert_tokens]
    devC_bert_spans = [calc_bert_spans(b, t)
                       for b, t in zip(devC_bert_tokens, devC_tokens)]
    devQ_bert_spans = [calc_bert_spans(b, t)
                       for b, t in zip(devQ_bert_tokens, devQ_tokens)]

    vocab_tag = pickle.load(open('./vocab_tag.pkl', 'rb'))
    vocab_ent = pickle.load(open('./vocab_ent.pkl', 'rb'))
    devC_tag_ids = token2id(devC_tags, vocab_tag)  # vocab_tag same as training
    # entities
    devC_ent_ids = token2id(devC_ents, vocab_ent, unk_id=0)  # vocab_ent same as training
    print('vocabulary for dev is built.')

    dev_embedding = build_embedding('glove/glove.840B.300d.txt', dev_vocab, 300)
    meta = {'vocab': dev_vocab, 'embedding': dev_embedding.tolist()}

    prev_CID, first_question = -1, []
    for i, CID in enumerate(dev.context_idx):
        if not (CID == prev_CID):
            first_question.append(i)
        prev_CID = CID

    result = {
        'qids': dev.qid.tolist(),
        'question_ids': devQ_ids,
        'context_ids': devC_ids,
        'context_features': devC_features,  # exact match, tf
        'context_tags': devC_tag_ids,       # POS tagging
        'context_ents': devC_ent_ids,       # Entity recognition
        'context': dev_context,
        'context_span': dev_context_span,
        '1st_question': first_question,
        'question_CID': dev.context_idx.tolist(),
        'question': dev.question.tolist(),
        'answer': dev.answer.tolist(),
        'answer_start': dev.answer_start_token.tolist(),
        'answer_end': dev.answer_end_token.tolist(),
        'answer_choice': dev.answer_choice.tolist(),
        'all_answer': dev.all_answer.tolist(),
        'context_tokenized': devC_tokens,
        'question_tokenized': devQ_tokens,
        'context_bertidx': devC_bert_ids,
        'context_bert_spans': devC_bert_spans,
        'question_bertidx': devQ_bert_ids,
        'question_bert_spans': devQ_bert_spans
    }
    return meta, result
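# calc_bert_spans() above aligns BERT sub-tokens with the original spaCy tokens.
# A minimal sketch of the offset bookkeeping, assuming the sub-tokenization is
# available per original token (the name and signature here are assumptions,
# not the repo's actual API):
def calc_bert_spans_sketch(bert_pieces_per_token):
    # bert_pieces_per_token: e.g. [['the'], ['puppet', '##eer'], ['left']]
    spans, offset = [], 0
    for pieces in bert_pieces_per_token:
        spans.append((offset, offset + len(pieces)))  # [start, end) into the flat sub-token list
        offset += len(pieces)
    return spans

# calc_bert_spans_sketch([['the'], ['puppet', '##eer'], ['left']]) -> [(0, 1), (1, 3), (3, 4)]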
trQ_tokens = [[normalize_text(w.text) for w in doc] for doc in trQ_docs]
trC_unnorm_tokens = [[w.text for w in doc] for doc in trC_docs]
log.info('All tokens for training are obtained.')

train_context_span = [
    get_context_span(a, b) for a, b in zip(train_context, trC_unnorm_tokens)
]
# get_context_span(): for every token in the context-token list, the start/end
# character indices of that word inside the context string.
# Each element of train_context_span is one passage, i.e. a list of two-element
# [start, end] lists, one per word in that passage.

ans_st_token_ls, ans_end_token_ls = [], []
for ans_st, ans_end, idx in zip(
        train.answer_start, train.answer_end,
        train.context_idx):  # ans_st and ans_end are character-based as well
    ans_st_token, ans_end_token = find_answer_span(
        train_context_span[idx], ans_st, ans_end)
    # This is the key step: converting character-based indices into token-based
    # ones; note it may return -1, -1.
    # The logic here is simple; the real work already happened earlier, inside
    # free_text_to_span(), where two helpers cooperate:
    # 1) len_preserved_normalize_answer() replaces every punctuation mark or
    #    unwanted character with whitespace, so the returned string keeps its
    #    length and every word keeps its position.
    # 2) split_with_span() uses re.finditer(), which yields the start and end of
    #    every match:
    #    zip(*[(m.group(0), (m.start(), m.end()-1)) for m in re.finditer(r'\S+', s)])
    #    (\S matches non-whitespace characters.) Wrapping the result in zip(*...)
    #    gives a list of tokens and a list of (start, end) spans.
    ans_st_token_ls.append(ans_st_token)
    ans_end_token_ls.append(ans_end_token)

ration_st_token_ls, ration_end_token_ls = [], []
for ration_st, ration_end, idx in zip(
        train.rationale_start, train.rationale_end,
        train.context_idx):  # same method as for the answer: get the token-based start/end span of the rationale
    ration_st_token, ration_end_token = find_answer_span(
        train_context_span[idx], ration_st, ration_end)
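# The comment above quotes the re.finditer() expression used by split_with_span();
# a minimal runnable sketch of that idea (split_with_span_sketch is a hypothetical
# name, not necessarily the repo's exact implementation):
import re

def split_with_span_sketch(s):
    # One match per whitespace-delimited token; zip(*...) separates the tokens
    # from their (start, end) character spans, with end inclusive (m.end() - 1).
    pairs = [(m.group(0), (m.start(), m.end() - 1)) for m in re.finditer(r'\S+', s)]
    if not pairs:
        return [], []
    tokens, spans = zip(*pairs)
    return list(tokens), list(spans)

# split_with_span_sketch('Hello  world') -> (['Hello', 'world'], [(0, 4), (7, 11)])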
def build_test_data(opt, dev_file, vocab):
    # random.seed(args.seed)
    # np.random.seed(args.seed)
    # logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG,
    #                     datefmt='%m/%d/%Y %I:%M:%S')
    log = logging.getLogger(__name__)

    # tags
    vocab_tag = [''] + list(nlp.tagger.labels)
    # entities
    # log.info('start data preparing... (using {} threads)'.format(args.threads))
    # glove_vocab = load_glove_vocab(wv_file, wv_dim)  # return a "set" of vocabulary
    # log.info('glove loaded.')

    def proc_dev(ith, article):
        rows = []
        context = article['story']
        for j, (question, answers) in enumerate(
                zip(article['questions'], article['answers'])):
            gold_answer = answers['input_text']
            span_answer = answers['span_text']
            answer, char_i, char_j = free_text_to_span(gold_answer, span_answer)
            answer_choice = 0 if answer == '__NA__' else \
                1 if answer == '__YES__' else \
                2 if answer == '__NO__' else \
                3  # Not a yes/no question
            if answer_choice == 3:
                answer_start = answers['span_start'] + char_i
                answer_end = answers['span_start'] + char_j
            else:
                answer_start, answer_end = -1, -1
            rationale = answers['span_text']
            rationale_start = answers['span_start']
            rationale_end = answers['span_end']
            q_text = question['input_text']
            if j > 0:
                q_text = article['answers'][j - 1]['input_text'] + " // " + q_text
            rows.append((ith, q_text, answer, answer_start, answer_end,
                         rationale, rationale_start, rationale_end, answer_choice))
        return rows, context

    dev, dev_context = flatten_json(dev_file, proc_dev)
    dev = pd.DataFrame(dev, columns=[
        'context_idx', 'question', 'answer', 'answer_start', 'answer_end',
        'rationale', 'rationale_start', 'rationale_end', 'answer_choice'
    ])
    # log.info('dev json data flattened.')
    # print(dev)

    devC_iter = (pre_proc(c) for c in dev_context)
    devQ_iter = (pre_proc(q) for q in dev.question)
    devC_docs = [doc for doc in nlp.pipe(devC_iter, batch_size=64, n_threads=args.threads)]
    devQ_docs = [doc for doc in nlp.pipe(devQ_iter, batch_size=64, n_threads=args.threads)]

    # tokens
    devC_tokens = [[re.sub(r'_', ' ', normalize_text(w.text)) for w in doc] for doc in devC_docs]
    devQ_tokens = [[re.sub(r'_', ' ', normalize_text(w.text)) for w in doc] for doc in devQ_docs]
    devC_unnorm_tokens = [[re.sub(r'_', ' ', w.text) for w in doc] for doc in devC_docs]
    # log.info('All tokens for dev are obtained.')

    dev_context_span = [get_context_span(a, b)
                        for a, b in zip(dev_context, devC_unnorm_tokens)]
    # log.info('context span for dev is generated.')

    ans_st_token_ls, ans_end_token_ls = [], []
    for ans_st, ans_end, idx in zip(dev.answer_start, dev.answer_end, dev.context_idx):
        ans_st_token, ans_end_token = find_answer_span(dev_context_span[idx], ans_st, ans_end)
        ans_st_token_ls.append(ans_st_token)
        ans_end_token_ls.append(ans_end_token)

    ration_st_token_ls, ration_end_token_ls = [], []
    for ration_st, ration_end, idx in zip(dev.rationale_start, dev.rationale_end, dev.context_idx):
        ration_st_token, ration_end_token = find_answer_span(dev_context_span[idx], ration_st, ration_end)
        ration_st_token_ls.append(ration_st_token)
        ration_end_token_ls.append(ration_end_token)

    dev['answer_start_token'], dev['answer_end_token'] = ans_st_token_ls, ans_end_token_ls
    dev['rationale_start_token'], dev['rationale_end_token'] = ration_st_token_ls, ration_end_token_ls

    initial_len = len(dev)
    dev.dropna(inplace=True)  # modify self DataFrame
    # log.info('drop {0}/{1} inconsistent samples.'.format(initial_len - len(dev), initial_len))
    # log.info('answer span for dev is generated.')

    # features
    devC_tags, devC_ents, devC_features = feature_gen(devC_docs, dev.context_idx, devQ_docs, args.no_match)
    # log.info('features for dev is generated: {}, {}, {}'.format(len(devC_tags), len(devC_ents), len(devC_features)))
    vocab_ent = list(set([ent for sent in devC_ents for ent in sent]))

    # vocab
    dev_vocab = vocab  # tr_vocab is a subset of dev_vocab
    devC_ids = token2id(devC_tokens, dev_vocab, unk_id=1)
    devQ_ids = token2id(devQ_tokens, dev_vocab, unk_id=1)
    devQ_tokens = [["<S>"] + doc + ["</S>"] for doc in devQ_tokens]
    devQ_ids = [[2] + qsent + [3] for qsent in devQ_ids]
    # print(devQ_ids[:10])

    # tags
    devC_tag_ids = token2id(devC_tags, vocab_tag)  # vocab_tag same as training
    # entities
    devC_ent_ids = token2id(devC_ents, vocab_ent, unk_id=0)  # vocab_ent same as training
    # log.info('vocabulary for dev is built.')

    prev_CID, first_question = -1, []
    for i, CID in enumerate(dev.context_idx):
        if not (CID == prev_CID):
            first_question.append(i)
        prev_CID = CID

    data = {
        'question_ids': devQ_ids,
        'context_ids': devC_ids,
        'context_features': devC_features,  # exact match, tf
        'context_tags': devC_tag_ids,       # POS tagging
        'context_ents': devC_ent_ids,       # Entity recognition
        'context': dev_context,
        'context_span': dev_context_span,
        '1st_question': first_question,
        'question_CID': dev.context_idx.tolist(),
        'question': dev.question.tolist(),
        'answer': dev.answer.tolist(),
        'answer_start': dev.answer_start_token.tolist(),
        'answer_end': dev.answer_end_token.tolist(),
        'rationale_start': dev.rationale_start_token.tolist(),
        'rationale_end': dev.rationale_end_token.tolist(),
        'answer_choice': dev.answer_choice.tolist(),
        'context_tokenized': devC_tokens,
        'question_tokenized': devQ_tokens
    }
    # with open('CoQA/test_data.msgpack', 'wb') as f:
    #     msgpack.dump(result, f)
    # log.info('saved test to disk.')

    dev = {
        'context': list(zip(
            data['context_ids'], data['context_tags'], data['context_ents'],
            data['context'], data['context_span'], data['1st_question'],
            data['context_tokenized'])),
        'qa': list(zip(
            data['question_CID'], data['question_ids'], data['context_features'],
            data['answer_start'], data['answer_end'], data['rationale_start'],
            data['rationale_end'], data['answer_choice'], data['question'],
            data['answer'], data['question_tokenized']))
    }
    print("test_data built")
    # embedding = torch.Tensor(meta['embedding'])
    return dev
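# A sketch of how the structure returned by build_test_data() might be consumed:
# each 'qa' tuple starts with question_CID (see the zip order above), which
# indexes into the per-passage 'context' list. This consumer function is an
# illustration, not part of the repo.
def iter_qa_with_context(test_data):
    for qa in test_data['qa']:
        cid = qa[0]  # question_CID
        yield qa, test_data['context'][cid]

# for qa_row, context_row in iter_qa_with_context(build_test_data(opt, dev_file, vocab)):
#     context_ids, context_tags, context_ents = context_row[0], context_row[1], context_row[2]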