def split_data(self, filename):
    self.load_data(filename)
    sub_dir = filename.split('-')[0]

    # create a subdirectory for Train and Dev data
    if not os.path.exists(os.path.join(self.data_dir, sub_dir)):
        os.makedirs(os.path.join(self.data_dir, sub_dir))

    with open(os.path.join(self.data_dir, sub_dir, sub_dir + '.context'), 'w', encoding="utf-8") as context_file,\
         open(os.path.join(self.data_dir, sub_dir, sub_dir + '.question'), 'w', encoding="utf-8") as question_file,\
         open(os.path.join(self.data_dir, sub_dir, sub_dir + '.answer'), 'w', encoding="utf-8") as answer_file,\
         open(os.path.join(self.data_dir, sub_dir, sub_dir + '.labels'), 'w', encoding="utf-8") as labels_file:

        # loop over the data
        for article_id in tqdm.tqdm(range(len(self.data['data']))):
            list_paragraphs = self.data['data'][article_id]['paragraphs']
            # loop over the paragraphs
            for paragraph in list_paragraphs:
                context = paragraph['context']
                context = clean_text(context)
                context_tokens = [w for w in word_tokenize(context) if w]
                spans = convert_idx(context, context_tokens)
                qas = paragraph['qas']
                # loop over Q/A
                for qa in qas:
                    question = qa['question']
                    question = clean_text(question)
                    question_tokens = [w for w in word_tokenize(question) if w]
                    if sub_dir == "train":
                        # select only one ground truth, the top answer, if any answer
                        answer_ids = 1 if qa['answers'] else 0
                    else:
                        answer_ids = len(qa['answers'])
                    labels = []
                    if answer_ids:
                        for answer_id in range(answer_ids):
                            answer = qa['answers'][answer_id]['text']
                            answer = clean_text(answer)
                            answer_tokens = [w for w in word_tokenize(answer) if w]
                            answer_start = qa['answers'][answer_id]['answer_start']
                            answer_stop = answer_start + len(answer)
                            # map the character-level answer span to token indices
                            answer_span = []
                            for idx, span in enumerate(spans):
                                if not (answer_stop <= span[0] or answer_start >= span[1]):
                                    answer_span.append(idx)
                            if not answer_span:
                                continue
                            labels.append(str(answer_span[0]) + ' ' + str(answer_span[-1]))

                        # write to file, one line per answered question
                        # (kept inside the branch so answer_tokens is always defined)
                        context_file.write(' '.join(context_tokens) + '\n')
                        question_file.write(' '.join(question_tokens) + '\n')
                        answer_file.write(' '.join(answer_tokens) + '\n')
                        labels_file.write("|".join(labels) + "\n")
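# Every snippet in this file relies on a convert_idx(text, tokens) helper to map each
# token back to its (start, end) character offsets, so that the character-level
# answer_start / answer_stop can be intersected with token spans. The helper itself is
# defined elsewhere; the sketch below is an assumption based on the common QANet-style
# preprocessing code, not the verbatim original.
def convert_idx(text, tokens):
    current = 0
    spans = []
    for token in tokens:
        current = text.find(token, current)
        if current < 0:
            raise Exception("Token {} cannot be found".format(token))
        spans.append((current, current + len(token)))
        current += len(token)
    return spans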
def split_data(self, filename):
    self.load_data(filename)
    sub_dir = filename.split('-')[0]

    # create a subdirectory for Train and Dev data
    if not os.path.exists(os.path.join(self.data_dir, sub_dir)):
        os.makedirs(os.path.join(self.data_dir, sub_dir))

    with open(os.path.join(self.data_dir, sub_dir, sub_dir + '.context'), 'w', encoding="utf-8") as context_file,\
         open(os.path.join(self.data_dir, sub_dir, sub_dir + '.sentence'), 'w', encoding="utf-8") as sentence_file,\
         open(os.path.join(self.data_dir, sub_dir, sub_dir + '.question'), 'w', encoding="utf-8") as question_file,\
         open(os.path.join(self.data_dir, sub_dir, sub_dir + '.answer'), 'w', encoding="utf-8") as answer_file:

        # loop over the data
        for article_id in tqdm.tqdm(range(len(self.data['data']))):
            list_paragraphs = self.data['data'][article_id]['paragraphs']
            # loop over the paragraphs
            for paragraph in list_paragraphs:
                context = paragraph['context']
                context = clean_text(context)
                context_tokens = word_tokenize(context)
                # skip paragraphs outside the configured length range
                if config.paragraph and (len(context_tokens) < config.min_len_context or
                                         len(context_tokens) > config.max_len_context):
                    continue
                context_sentences = sent_tokenize(context)
                spans = convert_idx(context, context_tokens)

                # record the index of the first token of each sentence
                num_tokens = 0
                first_token_sentence = []
                for sentence in context_sentences:
                    first_token_sentence.append(num_tokens)
                    num_tokens += len(sentence)

                qas = paragraph['qas']
                # loop over Q/A
                for qa in qas:
                    question = qa['question']
                    question = clean_text(question)
                    question_tokens = word_tokenize(question)
                    # keep only well-formed questions within the length limits
                    if question_tokens[-1] != "?" or \
                            len(question_tokens) < config.min_len_question or \
                            len(question_tokens) > config.max_len_question:
                        continue

                    if sub_dir == "train":
                        # select only one ground truth, the top answer, if any answer
                        answer_ids = 1 if qa['answers'] else 0
                    else:
                        answer_ids = len(qa['answers'])

                    if answer_ids:
                        for answer_id in range(answer_ids):
                            answer = qa['answers'][answer_id]['text']
                            answer = clean_text(answer)
                            answer_tokens = word_tokenize(answer)
                            answer_start = qa['answers'][answer_id]['answer_start']
                            answer_stop = answer_start + len(answer)

                            # Getting spans of the answer in the context
                            answer_span = []
                            for idx, span in enumerate(spans):
                                if not (answer_stop <= span[0] or answer_start >= span[1]):
                                    answer_span.append(idx)
                            if not answer_span:
                                continue

                            # Getting the sentence where we have the answer
                            sentence_tokens = []
                            for idx, start in enumerate(first_token_sentence):
                                if answer_span[0] >= start:
                                    sentence_tokens = context_sentences[idx]
                                    answer_sentence_span = [span - start for span in answer_span]
                                else:
                                    break
                            if not sentence_tokens:
                                print("Sentence cannot be found")
                                raise Exception()

                            # write to file, tagging answer tokens with 1 and the rest with 0
                            context_file.write(" ".join(
                                [token + u"│" + "1" if idx in answer_span else token + u"│" + "0"
                                 for idx, token in enumerate(context_tokens)]) + "\n")
                            sentence_file.write(" ".join(
                                [token + u"│" + "1" if idx in answer_sentence_span else token + u"│" + "0"
                                 for idx, token in enumerate(sentence_tokens)]) + "\n")
                            question_file.write(" ".join(question_tokens) + "\n")
                            answer_file.write(" ".join(answer_tokens) + "\n")
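# The .context and .sentence files written above contain space-separated "token│tag"
# pairs, where the tag is 1 for tokens inside the answer span and 0 otherwise. The
# small parser below is only an illustration of that output format (it is not part of
# the original pipeline); it assumes "│" never occurs inside a token.
def read_tagged_line(line):
    tokens, tags = [], []
    for pair in line.strip().split(" "):
        token, _, tag = pair.rpartition(u"│")
        tokens.append(token)
        tags.append(int(tag))
    return tokens, tags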
def split_data(self, filename):
    self.load_data(filename)
    envs = ["train", "dev"]
    for sub_dir in envs:
        # create a subdirectory for Train and Dev data
        if not os.path.exists(os.path.join(self.data_dir, sub_dir)):
            os.makedirs(os.path.join(self.data_dir, sub_dir))

        with open(os.path.join(self.data_dir, sub_dir, sub_dir + ".context"), "w", encoding="utf-8") as context_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + ".sentence"), "w", encoding="utf-8") as sentence_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + ".question"), "w", encoding="utf-8") as question_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + ".answer"), "w", encoding="utf-8") as answer_file:

            # loop over the data
            for article in tqdm.tqdm(self.data["data"]):
                context = article["text"]
                context_tokens = word_tokenize(context)
                context_sentences = sent_tokenize(context)
                # skip articles outside the configured length range
                if config.paragraph and (len(context_tokens) < config.min_len_context or
                                         len(context_tokens) > config.max_len_context):
                    continue
                spans = convert_idx(context, context_tokens)

                # record the index of the first token of each sentence
                num_tokens = 0
                first_token_sentence = []
                for sentence in context_sentences:
                    first_token_sentence.append(num_tokens)
                    num_tokens += len(sentence)

                # keep only the articles belonging to the current split
                if not article["type"] == sub_dir:
                    continue

                for question in article["questions"]:
                    if question.get("isQuestionBad") == 0 and question["consensus"].get("s"):
                        q = question["q"].strip()
                        # keep only well-formed questions within the length limits
                        if q[-1] != "?" or \
                                len(q.split()) < config.min_len_question or \
                                len(q.split()) > config.max_len_question:
                            continue

                        answer_start = question["consensus"]["s"]
                        answer = context[question["consensus"]["s"]:question["consensus"]["e"]].strip(".| ").strip("\n")
                        answer_stop = answer_start + len(answer)

                        # Getting spans of the answer in the context
                        answer_span = []
                        for idx, span in enumerate(spans):
                            if not (answer_stop <= span[0] or answer_start >= span[1]):
                                answer_span.append(idx)
                        if not answer_span:
                            continue

                        # Getting the sentence where we have the answer
                        sentence_tokens = []
                        for idx, start in enumerate(first_token_sentence):
                            if answer_span[0] >= start:
                                sentence_tokens = context_sentences[idx]
                                answer_sentence_span = [span - start for span in answer_span]
                            else:
                                break

                        # tag sentence tokens (1 = part of the answer, 0 = not)
                        sent = []
                        for idx, token in enumerate(sentence_tokens):
                            if token.strip("\n").strip():
                                if idx in answer_sentence_span:
                                    sent.append(token + u"│" + "1")
                                else:
                                    sent.append(token + u"│" + "0")
                        sent = " ".join(sent).strip()
                        # drop the "(CNN) --" boilerplate prefix if present
                        index = sent.find("(│0 CNN│0 )│0 --│0 ")
                        if index > -1:
                            sent = sent[index + len("(│0 CNN│0 )│0 --│0 "):]

                        # tag context tokens (1 = part of the answer, 0 = not)
                        ctxt = []
                        for idx, token in enumerate(context_tokens):
                            if token.strip("\n").strip():
                                if idx in answer_span:
                                    ctxt.append(token + u"│" + "1")
                                else:
                                    ctxt.append(token + u"│" + "0")
                        ctxt = " ".join(ctxt).strip()
                        index = ctxt.find("(│0 CNN│0 )│0 --│0 ")
                        if index > -1:
                            ctxt = ctxt[index + len("(│0 CNN│0 )│0 --│0 "):]

                        # write to file
                        context_file.write(ctxt + "\n")
                        sentence_file.write(sent + "\n")
                        question_file.write(q + "\n")
                        answer_file.write(answer + "\n")
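# The variant above reads NewsQA-style records: article["type"] holds the split name,
# article["text"] the CNN story, and each question carries an "isQuestionBad" flag and
# a "consensus" answer with character offsets "s"/"e". The literal below only sketches
# the fields the code accesses; the values are made up for illustration.
example_article = {
    "type": "train",
    "text": "(CNN) -- Story text goes here ...",
    "questions": [
        {
            "q": "What does the story report?",
            "isQuestionBad": 0,
            "consensus": {"s": 9, "e": 19},  # character offsets of the consensus answer
        }
    ],
}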
def process_file(filename, data_type, word_counter, char_counter, ques_limit):
    """
    Read the contents of the text file and run the initial preprocessing.
    If the dataset is the train split, the content also needs to be filtered.
    :param filename: path of the JSON file to process
    :param data_type: name of the split, e.g. "train"
    :param word_counter: word-frequency counter, updated in place
    :param char_counter: character-frequency counter, updated in place
    :param ques_limit: maximum number of question tokens to keep
    :return: (examples, eval_examples)
    """
    print("Processing {} examples...".format(data_type))
    examples = []
    eval_examples = {}
    total = 0
    with open(filename, "r") as fh:
        source = json.load(fh)
        # TODO: filtering is done during preprocessing, but then there is no way to
        # compute the spans afterwards
        for article in tqdm(source):
            content = article['article_title'] + '。' + article['article_content']
            content_tokens = word_tokenize(content)
            content_chars = [list(token) for token in content_tokens]
            spans = convert_idx(content, content_tokens)
            for token in content_tokens:
                word_counter[token] += len(article['questions'])
                for char in token:
                    char_counter[char] += len(article["questions"])
            for q in article['questions']:
                question_text = q["question"]
                answer_text = q['answer']
                question_tokens = word_tokenize(question_text)
                question_tokens = shrink_question_tokens(question_tokens, ques_limit)
                question_chars = [list(token) for token in question_tokens]
                result = list(substring_indexes(answer_text, content))
                for token in question_tokens:
                    word_counter[token] += 1
                    for char in token:
                        char_counter[char] += 1
                if len(result) == 1:
                    # Convert the character offset in `result` to token positions:
                    # y1 and y2 are the start and end token indices of the answer.
                    current_pos, start_token, end_token = 0, -1, -1
                    for token_cnt, token in enumerate(content_tokens):
                        if current_pos > result[0] and start_token == -1:
                            start_token = token_cnt - 1
                        if current_pos > result[0] + len(q["answer"]):
                            end_token = token_cnt - 2
                            break
                        current_pos += len(token)
                    total += 1
                    example = {
                        "context_tokens": content_tokens,
                        "context_chars": content_chars,
                        "ques_tokens": question_tokens,
                        "ques_chars": question_chars,
                        "y1s": [start_token],
                        "y2s": [end_token],
                        "id": total
                    }
                    eval_examples[str(total)] = {
                        "context": content,
                        "spans": spans,  # mapping from each token of the full text to its character span
                        "answers": [answer_text],  # TODO: make this independent of the paragraph split
                        "uuid": q["questions_id"]
                    }
                    # `example` does not keep the raw question text, so it is stored here;
                    # it can be used later when presenting results.
                    examples.append(example)
                    # Questions that span multiple paragraphs are not considered.
    random.shuffle(examples)
    print("{} questions in total".format(len(examples)))
    return examples, eval_examples
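# process_file() locates the answer with substring_indexes(answer_text, content) and
# keeps only questions whose answer occurs exactly once in the content. The generator
# is defined elsewhere; this sketch assumes the straightforward implementation that
# yields the start index of every occurrence of the substring.
def substring_indexes(substring, string):
    last_found = -1
    while True:
        last_found = string.find(substring, last_found + 1)
        if last_found == -1:
            break
        yield last_found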
def split_sentence_question(self, filename, data_type):
    data = self.load_data(filename)
    with open(os.path.join(self.save_dir + data_type + '.sentence'), 'w', encoding="utf-8") as sentence_file,\
         open(os.path.join(self.save_dir + data_type + '.question'), 'w', encoding="utf-8") as question_file:
        articles = data
        for article in tqdm(articles):
            paragraphs = article['paragraphs']
            for paragraph in paragraphs:
                context = paragraph['context']
                context = clean_text(context)
                context_tokens = word_tokenize(context)
                context_sentences = sent_tokenize(context)
                spans = convert_idx(context, context_tokens)

                # record the index of the first token of each sentence
                num_tokens = 0
                first_token_sentence = []
                for sentence in context_sentences:
                    first_token_sentence.append(num_tokens)
                    num_tokens += len(sentence)

                question_and_answer_list = paragraph['qas']
                for question_and_answer in question_and_answer_list:
                    question = question_and_answer['question']
                    question = clean_text(question)
                    question_tokens = word_tokenize(question)
                    # keep only questions within the length limits
                    if len(question_tokens) > MAX_QUESTION_LENGTH or \
                            len(question_tokens) < MIN_QUESTION_LENGHT:
                        continue
                    if not question_and_answer['answers']:
                        continue

                    # use only the first ground-truth answer
                    answer = question_and_answer['answers'][0]
                    answer_text = answer['text']
                    answer_text = clean_text(answer_text)
                    answer_tokens = word_tokenize(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)

                    # Getting spans of the answer in the context
                    answer_span = []
                    for idx, span in enumerate(spans):
                        if not (answer_stop <= span[0] or answer_start >= span[1]):
                            answer_span.append(idx)
                    if not answer_span:
                        continue

                    # Getting the sentence where we have the answer
                    sentence_tokens = []
                    for idx, start in enumerate(first_token_sentence):
                        if answer_span[0] >= start:
                            sentence_tokens = context_sentences[idx]
                            answer_sentence_span = [span - start for span in answer_span]
                        else:
                            break
                    if not sentence_tokens:
                        print("Sentence cannot be found")
                        raise Exception()
                    if len(sentence_tokens) > MAX_SENTENCE_LENGTH or \
                            len(sentence_tokens) < MIN_SENTENCE_LENGTH:
                        continue

                    # write to file, tagging answer tokens with 1 and the rest with 0
                    sentence_file.write(" ".join(
                        [token + u"│" + "1" if idx in answer_sentence_span else token + u"│" + "0"
                         for idx, token in enumerate(sentence_tokens)]) + "\n")
                    question_file.write(" ".join(question_tokens) + "\n")
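# Several of the functions above share the same character-to-token span conversion:
# a token at index idx belongs to the answer iff its character span overlaps
# [answer_start, answer_stop). Factored out here for clarity only; the helper name is
# illustrative and does not appear in the original code.
def char_span_to_token_span(spans, answer_start, answer_stop):
    return [idx for idx, (start, end) in enumerate(spans)
            if not (answer_stop <= start or answer_start >= end)]

# e.g. for the context "Paris is nice" with tokens ["Paris", "is", "nice"] and
# spans [(0, 5), (6, 8), (9, 13)], the answer "Paris" (answer_start=0, answer_stop=5)
# maps to the token span [0].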