def preprocess(self, dataset_label):
    file_name = self.train_file if dataset_label == 'train' else (
        self.dev_file if dataset_label == 'dev' else self.test_file)
    output_file_name = os.path.join(
        self.spacyDir, self.data_prefix + dataset_label + '-preprocessed.json')

    print('Preprocessing', dataset_label, 'file:', file_name)
    print('Loading json...')
    with open(file_name, 'r') as f:
        dataset = json.load(f)

    print('Processing json...')
    dict1 = ['where', 'when', 'who']
    data = []
    tot = len(dataset['data'])
    type1 = type2 = 0
    for data_idx in tqdm(range(tot)):
        datum = dataset['data'][data_idx]
        context_str = datum['story']
        _datum = {
            'context': context_str,
            'source': datum['source'],
            'id': datum['id']
        }
        nlp_context = nlp(pre_proc(context_str))
        _datum['annotated_context'] = self.process(nlp_context)
        _datum['raw_context_offsets'] = self.get_raw_context_offsets(
            _datum['annotated_context']['word'], context_str)
        _datum['qas'] = []
        assert len(datum['questions']) == len(datum['answers'])

        for i in range(len(datum['questions'])):
            question, answer = datum['questions'][i], datum['answers'][i]
            assert question['turn_id'] == answer['turn_id']
            idx = question['turn_id']
            _qas = {
                'turn_id': idx,
                'question': question['input_text'],
                'answer': answer['input_text']
            }
            _qas['annotated_question'] = self.process(
                nlp(pre_proc(question['input_text'])))
            _qas['annotated_answer'] = self.process(
                nlp(pre_proc(answer['input_text'])))
            _qas['raw_answer'] = answer['input_text']
            _qas['span_text'] = answer['span_text']

            # Classify the answer as extractive (a context substring or yes/no/unknown)
            # or generative (free-form text not found in the passage).
            tmp = self.removePunctuation(_qas['raw_answer'])
            if _qas['raw_answer'] in context_str or tmp.lower() in ["yes", "no", "unknown"]:
                type1 += 1
                _qas['answer_type'] = "extractive"
            else:
                type2 += 1
                _qas['answer_type'] = "generative"

            _qas['answer_span_start'] = answer['span_start']
            _qas['answer_span_end'] = answer['span_end']

            # Heuristic question-type tag: wh-words, was/were/is questions,
            # short answers, and yes/no/unknown answers are treated as factual.
            sign = ""
            ques = question['input_text'].lower()
            real_ans = answer['input_text'].lower()
            real = self.remove_punctual(real_ans).split()
            for word in dict1:
                if word in ques or ques[:3] == "was" or ques[:4] == 'were' or ques[:2] == 'is':
                    sign = "factual"
                    break
            if len(real) <= 4:
                sign = "factual"
            if not sign or real_ans == "no" or real_ans == "yes" or real_ans == 'unknown':
                sign = "factual"
            _qas['question_type'] = sign

            start = answer['span_start']  # rationale span
            end = answer['span_end']
            chosen_text = _datum['context'][start:end].lower()
            while len(chosen_text) > 0 and chosen_text[0] in string.whitespace:
                # strip leading whitespace (space, \t, \n, etc.)
                chosen_text = chosen_text[1:]
                start += 1
            while len(chosen_text) > 0 and chosen_text[-1] in string.whitespace:
                # strip trailing whitespace
                chosen_text = chosen_text[:-1]
                end -= 1
            input_text = _qas['answer'].strip().lower()
            if input_text in chosen_text:
                p = chosen_text.find(input_text)  # p: start of input_text inside the rationale
                _qas['answer_span'] = self.find_span(
                    _datum['raw_context_offsets'],
                    start + p, start + p + len(input_text))
            else:
                _qas['answer_span'] = self.find_span_with_gt(
                    _datum['context'], _datum['raw_context_offsets'], input_text)
            _datum['qas'].append(_qas)
        data.append(_datum)

    # build vocabulary
    if dataset_label == 'train':
        print('Build vocabulary from training data...')
        contexts = [_datum['annotated_context']['word'] for _datum in data]
        # note: loop order fixed so every datum's QAs are included
        qas = [
            qa['annotated_question']['word'] + qa['annotated_answer']['word']
            for _datum in data for qa in _datum['qas']
        ]
        self.train_vocab = self.build_vocab(contexts, qas)

    print('Getting word ids...')
    w2id = {w: i for i, w in enumerate(self.train_vocab)}
    for _datum in data:
        _datum['annotated_context']['wordid'] = token2id_sent(
            _datum['annotated_context']['word'], w2id, unk_id=1, to_lower=False)  # new modify, get wordid
        for qa in _datum['qas']:
            qa['annotated_question']['wordid'] = token2id_sent(
                qa['annotated_question']['word'], w2id, unk_id=1, to_lower=False)
            qa['annotated_answer']['wordid'] = token2id_sent(
                qa['annotated_answer']['word'], w2id, unk_id=1, to_lower=False)

    if dataset_label == 'train':
        # get the condensed dictionary embedding
        print('Getting embedding matrix for ' + dataset_label)
        embedding = build_embedding(self.glove_file, self.train_vocab, self.glove_dim)
        meta = {'vocab': self.train_vocab, 'embedding': embedding.tolist()}
        meta_file_name = os.path.join(self.spacyDir, dataset_label + '_meta.msgpack')
        print('Saving meta information to', meta_file_name)
        with open(meta_file_name, 'wb') as f:
            # msgpack.dump(meta, f, encoding='utf8')
            msgpack.dump(meta, f)

    dataset['data'] = data
    if dataset_label == 'test':
        return dataset

    with open(output_file_name, 'w') as output_file:
        json.dump(dataset, output_file, sort_keys=True, indent=4)
    print("Number of extractive QAs:", type1)
    print("Number of generative QAs:", type2)
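
# For reference, a minimal sketch of what the token2id_sent helper used above is
# assumed to do: map each token to its vocabulary id, falling back to unk_id for
# out-of-vocabulary words. This is an assumption inferred from the call sites,
# not the project's actual implementation.
def token2id_sent(words, w2id, unk_id=1, to_lower=False):
    if to_lower:
        words = [w.lower() for w in words]
    return [w2id.get(w, unk_id) for w in words]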
def preprocess(self, dataset_label):
    file_name = self.train_file if dataset_label == 'train' else (
        self.dev_file if dataset_label == 'dev' else self.test_file)
    output_file_name = os.path.join(
        self.spacyDir, self.data_prefix + dataset_label + '-preprocessed.json')

    print('Preprocessing', dataset_label, 'file:', file_name)
    print('Loading json...')
    with open(file_name, 'r') as f:
        dataset = json.load(f)

    print('Processing json...')
    data = []
    tot = len(dataset['data'])
    for data_idx in tqdm(range(tot)):
        datum = dataset['data'][data_idx]
        context_str = datum['story']
        _datum = {
            'context': context_str,
            'source': datum['source'],
            'id': datum['id'],
            'filename': datum['filename']
        }
        nlp_context = nlp(pre_proc(context_str))
        _datum['annotated_context'] = self.process(nlp_context)
        _datum['raw_context_offsets'] = self.get_raw_context_offsets(
            _datum['annotated_context']['word'], context_str)
        _datum['qas'] = []
        assert len(datum['questions']) == len(datum['answers'])

        additional_answers = {}
        if 'additional_answers' in datum:
            for k, answer in datum['additional_answers'].items():
                if len(answer) == len(datum['answers']):
                    for ex in answer:
                        idx = ex['turn_id']
                        if idx not in additional_answers:
                            additional_answers[idx] = []
                        # additional_answers are only used for evaluation, so raw text is fine
                        additional_answers[idx].append(ex['input_text'])

        for i in range(len(datum['questions'])):
            question, answer = datum['questions'][i], datum['answers'][i]
            assert question['turn_id'] == answer['turn_id']
            idx = question['turn_id']
            _qas = {
                'turn_id': idx,
                'question': question['input_text'],
                'answer': answer['input_text']
            }
            if idx in additional_answers:
                _qas['additional_answers'] = additional_answers[idx]
            _qas['annotated_question'] = self.process(
                nlp(pre_proc(question['input_text'])))
            _qas['annotated_answer'] = self.process(
                nlp(pre_proc(answer['input_text'])))
            _qas['raw_answer'] = answer['input_text']
            _qas['answer_span_start'] = answer['span_start']
            _qas['answer_span_end'] = answer['span_end']

            start = answer['span_start']
            end = answer['span_end']
            chosen_text = _datum['context'][start:end].lower()
            while len(chosen_text) > 0 and chosen_text[0] in string.whitespace:
                chosen_text = chosen_text[1:]
                start += 1
            while len(chosen_text) > 0 and chosen_text[-1] in string.whitespace:
                chosen_text = chosen_text[:-1]
                end -= 1
            input_text = _qas['answer'].strip().lower()
            if input_text in chosen_text:
                p = chosen_text.find(input_text)
                _qas['answer_span'] = self.find_span(
                    _datum['raw_context_offsets'],
                    start + p, start + p + len(input_text))
            else:
                _qas['answer_span'] = self.find_span_with_gt(
                    _datum['context'], _datum['raw_context_offsets'], input_text)

            # Build the "long question": the current question prefixed with the
            # previous two turns of question/answer history.
            long_question = ''
            for j in range(i - 2, i + 1):
                if j < 0:
                    continue
                long_question += ' ' + datum['questions'][j]['input_text']
                if j < i:
                    long_question += ' ' + datum['answers'][j]['input_text']
            long_question = long_question.strip()
            nlp_long_question = nlp(long_question)
            _qas['context_features'] = feature_gen(nlp_context, nlp_long_question)

            _datum['qas'].append(_qas)
        data.append(_datum)

    # build vocabulary
    if dataset_label == 'train':
        print('Build vocabulary from training data...')
        contexts = [_datum['annotated_context']['word'] for _datum in data]
        # note: loop order fixed so every datum's QAs are included
        qas = [
            qa['annotated_question']['word'] + qa['annotated_answer']['word']
            for _datum in data for qa in _datum['qas']
        ]
        self.train_vocab = self.build_vocab(contexts, qas)
        self.train_char_vocab = self.build_char_vocab(self.train_vocab)

    print('Getting word ids...')
    w2id = {w: i for i, w in enumerate(self.train_vocab)}
    c2id = {c: i for i, c in enumerate(self.train_char_vocab)}
    for _datum in data:
        _datum['annotated_context']['wordid'] = token2id_sent(
            _datum['annotated_context']['word'], w2id, unk_id=1, to_lower=False)
        _datum['annotated_context']['charid'] = char2id_sent(
            _datum['annotated_context']['word'], c2id, unk_id=1, to_lower=False)
        for qa in _datum['qas']:
            qa['annotated_question']['wordid'] = token2id_sent(
                qa['annotated_question']['word'], w2id, unk_id=1, to_lower=False)
            qa['annotated_question']['charid'] = char2id_sent(
                qa['annotated_question']['word'], c2id, unk_id=1, to_lower=False)
            qa['annotated_answer']['wordid'] = token2id_sent(
                qa['annotated_answer']['word'], w2id, unk_id=1, to_lower=False)
            qa['annotated_answer']['charid'] = char2id_sent(
                qa['annotated_answer']['word'], c2id, unk_id=1, to_lower=False)

    if dataset_label == 'train':
        # get the condensed dictionary embedding
        print('Getting embedding matrix for ' + dataset_label)
        embedding = build_embedding(self.glove_file, self.train_vocab, self.glove_dim)
        meta = {
            'vocab': self.train_vocab,
            'char_vocab': self.train_char_vocab,
            'embedding': embedding.tolist()
        }
        meta_file_name = os.path.join(self.spacyDir, dataset_label + '_meta.msgpack')
        print('Saving meta information to', meta_file_name)
        with open(meta_file_name, 'wb') as f:
            # note: the encoding argument requires msgpack < 1.0; drop it for newer versions
            msgpack.dump(meta, f, encoding='utf8')

    dataset['data'] = data
    if dataset_label == 'test':
        return dataset

    with open(output_file_name, 'w') as output_file:
        json.dump(dataset, output_file, sort_keys=True, indent=4)
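
# A minimal sketch of the character-level helpers this variant relies on, under the
# assumption that build_char_vocab collects the characters appearing in the word
# vocabulary and char2id_sent maps every word to a list of character ids (unk_id for
# unseen characters). These are assumptions from the call sites, not the project's
# actual implementations.
def build_char_vocab(word_vocab):
    chars = sorted({c for w in word_vocab for c in w})
    # assumed layout: index 0 for padding, index 1 for unknown, matching unk_id=1 above
    return ['<PAD>', '<UNK>'] + chars

def char2id_sent(words, c2id, unk_id=1, to_lower=False):
    if to_lower:
        words = [w.lower() for w in words]
    return [[c2id.get(c, unk_id) for c in w] for w in words]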
def preprocess(self, dataset_label):
    file_name = self.train_file if dataset_label == 'train' else (
        self.dev_file if dataset_label == 'dev' else self.test_file)
    output_file_name = os.path.join(
        self.spacyDir, self.data_prefix + dataset_label + '-preprocessed.json')

    print('Preprocessing', dataset_label, 'file:', file_name)
    print('Loading json...')
    with open(file_name, 'r') as f:
        dataset = json.load(f)

    print('Processing json...')
    count = 0
    data = []
    tot = len(dataset['data'])
    type1 = type2 = 0
    for data_idx in tqdm(range(tot)):
        datum = dataset['data'][data_idx]['paragraphs'][0]
        context_str = datum['context']
        _datum = {
            'context': context_str,
            'title': dataset['data'][data_idx]['title'],
            'id': data_idx
        }
        nlp_context = nlp(pre_proc(context_str))
        _datum['annotated_context'] = self.process(nlp_context)
        _datum['raw_context_offsets'] = self.get_raw_context_offsets(
            _datum['annotated_context']['word'], context_str)
        _datum['qas'] = []
        # assert len(datum['qas']['questions']) == len(datum['answers'])

        for i in range(len(datum['qas'])):
            question = datum['qas'][i]['question']
            answer = datum['qas'][i]['answers'][0]['text']
            # assert question['turn_id'] == answer['turn_id']
            count += 1
            idx = datum['qas'][i]['id']
            _qas = {'turn_id': idx, 'question': question, 'answer': answer}
            _qas['annotated_question'] = self.process(nlp(pre_proc(question)))
            _qas['annotated_answer'] = self.process(nlp(pre_proc(answer)))
            _qas['raw_answer'] = answer
            _qas['answer_type'] = "extractive"
            _qas['answer_span_start'] = datum['qas'][i]['answers'][0]['answer_start']
            _qas['answer_span_end'] = _qas['answer_span_start'] + len(answer) + 1
            _qas['followup'] = datum['qas'][i]['followup']
            _qas['yesno'] = datum['qas'][i]['yesno']

            # Classify the answer as extractive (a context substring or yes/no/unknown)
            # or generative (free-form text not found in the passage).
            tmp = self.removePunctuation(_qas['raw_answer'])
            if _qas['raw_answer'] in context_str or tmp.lower() in ["yes", "no", "unknown"]:
                type1 += 1
                _qas['answer_type'] = "extractive"
            else:
                type2 += 1
                _qas['answer_type'] = "generative"

            start = _qas['answer_span_start']  # rationale span
            end = _qas['answer_span_end']
            chosen_text = _datum['context'][start:end].lower()
            while len(chosen_text) > 0 and chosen_text[0] in string.whitespace:
                # strip leading whitespace (space, \t, \n, etc.)
                chosen_text = chosen_text[1:]
                start += 1
            while len(chosen_text) > 0 and chosen_text[-1] in string.whitespace:
                # strip trailing whitespace
                chosen_text = chosen_text[:-1]
                end -= 1
            input_text = _qas['answer'].strip().lower()
            if input_text in chosen_text:
                p = chosen_text.find(input_text)  # p: start of input_text inside the rationale
                _qas['answer_span'] = self.find_span(
                    _datum['raw_context_offsets'],
                    start + p, start + p + len(input_text))
            else:
                _qas['answer_span'] = self.find_span_with_gt(
                    _datum['context'], _datum['raw_context_offsets'], input_text)
            _datum['qas'].append(_qas)
        data.append(_datum)

    # build vocabulary
    if dataset_label == 'train':
        print('Build vocabulary from training data...')
        contexts = [_datum['annotated_context']['word'] for _datum in data]
        qas = [
            qa['annotated_question']['word'] + qa['annotated_answer']['word']
            for _datum in data for qa in _datum['qas']
        ]
        # self.train_vocab = self.build_vocab(contexts, qas)

    # print('Getting word ids...')
    # w2id = {w: i for i, w in enumerate(self.train_vocab)}
    # for _datum in data:
    #     _datum['annotated_context']['wordid'] = token2id_sent(
    #         _datum['annotated_context']['word'], w2id, unk_id=1, to_lower=False)  # new modify, get wordid
    #     for qa in _datum['qas']:
    #         qa['annotated_question']['wordid'] = token2id_sent(
    #             qa['annotated_question']['word'], w2id, unk_id=1, to_lower=False)
    #         qa['annotated_answer']['wordid'] = token2id_sent(
    #             qa['annotated_answer']['word'], w2id, unk_id=1, to_lower=False)

    # if dataset_label == 'train':
    #     # get the condensed dictionary embedding
    #     print('Getting embedding matrix for ' + dataset_label)
    #     embedding = build_embedding(self.glove_file, self.train_vocab, self.glove_dim)
    #     meta = {'vocab': self.train_vocab, 'embedding': embedding.tolist()}
    #     meta_file_name = os.path.join(self.spacyDir, dataset_label + '_meta.msgpack')
    #     print('Saving meta information to', meta_file_name)
    #     with open(meta_file_name, 'wb') as f:
    #         msgpack.dump(meta, f, encoding='utf8')

    dataset['data'] = data
    if dataset_label == 'test':
        return dataset

    with open(output_file_name, 'w') as output_file:
        json.dump(dataset, output_file, sort_keys=True, indent=4)
    print("Number of extractive QAs:", type1)
    print("Number of generative QAs:", type2)
    print("Total number of QAs:", count)
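
# For reference, minimal sketches of the offset and span helpers invoked as
# self.get_raw_context_offsets and self.find_span above. They are written as
# standalone functions here and are assumptions inferred from the call sites,
# not the project's actual implementations: one recovers per-token character
# offsets against the raw context, the other converts a character range into
# a (start_token, end_token) pair over those offsets.
def get_raw_context_offsets(words, raw_text):
    offsets = []
    p = 0
    for w in words:
        start = raw_text.find(w, p)
        if start < 0:
            start = p  # fall back if tokenization altered the surface form
        offsets.append((start, start + len(w)))
        p = start + len(w)
    return offsets

def find_span(offsets, start_char, end_char):
    start_token = end_token = -1
    for i, (tok_start, tok_end) in enumerate(offsets):
        if start_token == -1 and tok_end > start_char:
            start_token = i  # first token overlapping the start of the range
        if tok_start < end_char:
            end_token = i    # last token starting before the end of the range
    return (start_token, end_token)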