def read_emr_entry(entry, is_training): """ reads input json to return list of examples where each example has context paragraph, question and answer span(if is_training is true)""" def is_whitespace(c): return c in " \t\r\n" or ord(c) == 0x202F examples = [] note_ids = entry['note_id'] # context_id for NQ dataset contexts = entry['context'] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in contexts: if(is_whitespace(c)): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens)-1) questions = [] for i, qas in enumerate(entry["qas"]): # qas is equivalent to questions in NQ dataset qas_id = qas['qas_id'] question_text = qas['question'] start_position = None end_position = None answer = None if is_training: answer= qas['answers'] orig_answer_text = answer["text"] answer_offset = answer["offset"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length-1] # if answer is None or answer.offset if(contexts[answer_offset:answer_offset+answer_length].lower() != " ".join(doc_tokens[start_position:(end_position + 1)]).lower() or " ".join(doc_tokens[start_position:(end_position + 1)]).lower() != answer["text"].lower()): continue actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join(tokenization.whitespace_tokenize(answer["text"])) questions.append(question_text) example = EmrExample( context_id=int(note_ids), qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, answer=answer, start_position=start_position, end_position=end_position) examples.append(example) return examples
def read_emr_entry(entry, is_training): """ creates multiple training examples from given paragraph and question answer pairs from that paragraph """ def is_whitespace(c): return c in " \t\r\n" or ord(c) == 0x202F examples = [] note_ids = entry['note_id'] contexts = entry['context'] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in contexts: if (is_whitespace(c)): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) questions = [] for i, qas in enumerate(entry["qas"]): qas_id = qas['qas_id'] question_text = qas['question'] start_position = None end_position = None answer = None if is_training: answer = qas['answers'] orig_answer_text = answer["text"] answer_offset = answer["offset"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length] # if answer is None or answer.offset if(contexts[answer_offset:answer_offset+answer_length].lower() != " ".join(doc_tokens[start_position:(end_position + 1)]).lower() or " ".join(doc_tokens[start_position:(end_position + 1)]).lower() != answer["text"].lower()): continue actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join(tokenization.whitespace_tokenize(answer["text"])) questions.append(question_text) example = EmrExample( context_id=int(note_ids), qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, answer=answer, start_position=start_position, end_position=end_position) examples.append(example) return examples
def tokenize(self, text, evidences):
    text = convert_to_unicode(text)
    text = self._clean_text(text)
    orig_tokens = whitespace_tokenize(text)
    split_tokens = []
    split_rations = []
    for token_idx, token in enumerate(orig_tokens):
        if self.do_lower_case:
            token = token.lower()
            token = self._run_strip_accents(token)
        sub_tokens = self._run_split_on_punc(token)
        sub_tokens = ' '.join(sub_tokens).strip().split()
        if len(sub_tokens) > 0:
            split_tokens.extend(sub_tokens)
            ration = self._is_token_rational(token_idx, evidences)
            split_rations.extend([ration] * len(sub_tokens))
    return zip(split_tokens, split_rations)

def tokenize(self, text):
    text = convert_to_unicode(text)
    text = self._clean_text(text)
    orig_tokens = whitespace_tokenize(text)
    orig_tokens, orig_rations = self._parse_rations(orig_tokens)
    split_tokens = []
    split_rations = []
    for token, ration in zip(orig_tokens, orig_rations):
        if self.do_lower_case:
            token = token.lower()
            token = self._run_strip_accents(token)
        sub_tokens = self._run_split_on_punc(token)
        sub_tokens = ' '.join(sub_tokens).strip().split()
        if len(sub_tokens) > 0:
            split_tokens.extend(sub_tokens)
            split_rations.extend([ration] * len(sub_tokens))
    return zip(split_tokens, split_rations)

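# Hedged sketch of the rationale propagation in the two tokenize() variants
# above: when punctuation splitting turns one whitespace token into several
# sub-tokens, each sub-token inherits its source token's rationale flag
# (split_rations.extend([ration] * len(sub_tokens))). split_on_punc below is a
# crude, hypothetical stand-in for _run_split_on_punc.
import re


def split_on_punc(token):
    # Split on each non-word, non-space character, keeping the separators.
    return [t for t in re.split(r"([^\w\s])", token) if t]


_tagged = [("pain,", True), ("today", False)]
_pairs = []
for _token, _ration in _tagged:
    _pairs.extend((sub, _ration) for sub in split_on_punc(_token))
assert _pairs == [("pain", True), (",", True), ("today", False)]
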
def read_squad_examples(input_file, is_training, max_examples=None,
                        writing_dev=False):
    """Read a SQuAD json file into a list of SquadExample."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        return c in " \t\r\n" or ord(c) == 0x202F

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1 and not is_impossible
                            and not writing_dev):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset +
                                                           answer_length - 1]
                        # Only add answers where the text can be exactly
                        # recovered from the document. If this CAN'T happen
                        # it's likely due to weird Unicode stuff so we will
                        # just skip the example.
                        #
                        # Note that this means for training mode, every
                        # example is NOT guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible)
                examples.append(example)
                if max_examples is not None and len(examples) == max_examples:
                    return examples
    return examples

def read_nq_entry(entry, is_training):
    """Converts a NQ entry into a list of NqExamples."""

    def is_whitespace(c):
        return c in " \t\r\n" or ord(c) == 0x202F

    examples = []
    contexts_id = entry["id"]
    contexts = entry["contexts"]
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in contexts:
        if is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

    questions = []
    for i, question in enumerate(entry["questions"]):
        qas_id = "{}".format(contexts_id)
        question_text = question["input_text"]
        start_position = None
        end_position = None
        answer = None
        if is_training:
            answer_dict = entry["answers"][i]
            answer = make_nq_answer(contexts, answer_dict)

            # For now, only handle extractive, yes, and no.
            if answer is None or answer.offset is None:
                continue
            start_position = char_to_word_offset[answer.offset]
            end_position = char_to_word_offset[answer.offset +
                                               len(answer.text) - 1]

            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(
                tokenization.whitespace_tokenize(answer.text))
            if actual_text.find(cleaned_answer_text) == -1:
                tf.logging.warning("Could not find answer: '%s' vs. '%s'",
                                   actual_text, cleaned_answer_text)
                continue

        questions.append(question_text)
        example = NqExample(
            example_id=int(contexts_id),
            qas_id=qas_id,
            questions=questions[:],
            doc_tokens=doc_tokens,
            doc_tokens_map=entry.get("contexts_map", None),
            answer=answer,
            start_position=start_position,
            end_position=end_position)
        examples.append(example)
    return examples

def read_contract_examples(input_file, is_training):
    """Read a Contract json file into a list of ContractExample."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            # Character-level "tokens": every character is its own token, so
            # char_to_word_offset is simply the identity mapping.
            doc_tokens = []
            char_to_word_offset = []
            for c in paragraph_text:
                doc_tokens.append(c.lower())
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if FLAGS.version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if len(qa["answers"]) != 1 and not is_impossible:
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"].lower()
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset +
                                                           answer_length - 1]
                        # Only add answers where the text can be exactly
                        # recovered from the document. If this CAN'T happen
                        # it's likely due to weird Unicode stuff so we will
                        # just skip the example.
                        #
                        # Note that this means for training mode, every
                        # example is NOT guaranteed to be preserved.
                        actual_text = "".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = "".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = ContractExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible)
                examples.append(example)
    return examples

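# Tiny runnable sketch (illustration only) of the character-level variant used
# by read_contract_examples: char_to_word_offset is the identity mapping, and
# answers are recovered with "".join rather than " ".join.
_text = "Fee: $5 due"
_doc_tokens = [c.lower() for c in _text]
_char_to_word_offset = list(range(len(_doc_tokens)))
_answer, _off = "$5", _text.index("$5")
_start = _char_to_word_offset[_off]
_end = _char_to_word_offset[_off + len(_answer) - 1]
assert "".join(_doc_tokens[_start:_end + 1]) == _answer.lower()
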
def read_examples(input_file):
    """Read a SQuAD-like json file into a list of Examples."""

    def is_whitespace(c):
        return c in " \t\r\n" or ord(c) == 0x202F

    # Gzipped inputs must be opened as a binary stream before wrapping.
    reader = tf.gfile.Open(input_file,
                           "rb" if input_file.endswith(".gz") else "r")
    if input_file.endswith(".gz"):
        reader = gzip.GzipFile(fileobj=reader)
    next(reader)  # the first line is a header record, not an example

    examples = []
    for line in tqdm(reader):
        item = json.loads(line.strip())
        paragraph_text = item["context"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        for qa in item["qas"]:
            qas_id = qa["qid"]
            question_text = qa["question"]
            is_impossible = False
            start_position = -1
            end_position = -1
            orig_answer_text = ""
            if FLAGS.version_2_with_negative:
                is_impossible = qa["is_impossible"]
            if not is_impossible:
                answer_offset = qa["detected_answers"][0]["char_spans"][0][0]
                answer_end = qa["detected_answers"][0]["char_spans"][0][1]
                answer_length = answer_end - answer_offset + 1
                orig_answer_text = item["context"][answer_offset:answer_end + 1]
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_offset +
                                                   answer_length - 1]
                # Only add answers where the text can be exactly recovered from
                # the document. If this CAN'T happen it's likely due to weird
                # Unicode stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is NOT
                # guaranteed to be preserved.
                actual_text = " ".join(
                    doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(
                    tokenization.whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                    tf.logging.warning("Example %d", len(examples))
                    tf.logging.warning(json.dumps(item, indent=2))
                    tf.logging.warning("Could not find answer: '%s' vs. '%s'",
                                       actual_text, cleaned_answer_text)
                    continue

            example = Example(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position,
                is_impossible=is_impossible)
            examples.append(example)
    reader.close()
    return examples

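# Hedged sketch (standard library only) of the reading pattern above:
# MRQA-style jsonlines files carry a leading header record, which is why
# read_examples calls next(reader) before iterating. iter_entries is a
# hypothetical helper, not part of the original code.
def iter_entries(path):
    opener = gzip.open if path.endswith(".gz") else open
    with opener(path, "rt") as reader:
        next(reader)  # skip the header record
        for line in reader:
            line = line.strip()
            if line:
                yield json.loads(line)
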
def _read_json(self, input_file: str, is_training: bool, do_lower_case: bool):
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    examples = []
    for entry in tqdm(input_data, desc=input_file):
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            raw_doc_tokens = customize_tokenizer(paragraph_text)
            doc_tokens = []
            char_to_word_offset = []
            k = 0
            temp_word = ""
            # Align raw characters to the pre-tokenized tokens: rebuild each
            # token character by character so that every character receives
            # the index of the token containing it.
            for c in paragraph_text:
                if tokenization._is_whitespace(c):
                    char_to_word_offset.append(k - 1)
                    continue
                else:
                    temp_word += c
                    char_to_word_offset.append(k)
                if do_lower_case:
                    temp_word = temp_word.lower()
                if temp_word == raw_doc_tokens[k]:
                    doc_tokens.append(temp_word)
                    temp_word = ""
                    k += 1
            assert k == len(raw_doc_tokens)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                if is_training:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    if orig_answer_text not in paragraph_text:
                        tf.logging.warning("Could not find answer")
                    else:
                        answer_offset = paragraph_text.index(orig_answer_text)
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset +
                                                           answer_length - 1]
                        # Skip examples whose answer cannot be recovered from
                        # the document tokens.
                        actual_text = "".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = "".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text, cleaned_answer_text)
                            continue

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position)
                examples.append(example)
    return examples

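# Hedged, self-contained sketch of the alignment idiom in _read_json: rebuild
# each pre-tokenized token character by character so that every raw character
# (whitespace included) maps to a token index, even when the tokenizer is not
# whitespace-based. align() is a hypothetical helper for illustration.
def align(text, raw_tokens):
    char_to_word_offset, k, temp = [], 0, ""
    for c in text:
        if c in " \t\r\n":
            char_to_word_offset.append(k - 1)  # whitespace maps to previous token
            continue
        temp += c
        char_to_word_offset.append(k)
        if temp == raw_tokens[k]:  # finished rebuilding token k
            temp, k = "", k + 1
    assert k == len(raw_tokens)
    return char_to_word_offset


assert align("北京 大学", ["北京", "大学"]) == [0, 0, 0, 1, 1]
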
def read_squad_examples(input_file, is_training):
    """Read a SQuAD json file into a list of SquadMembershipExample."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        return c in " \t\r\n" or ord(c) == 0x202F

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                is_impossible = None
                label = qa["class"]
                label_id = LABEL_LIST.index(label)
                # Even though answers are not used for membership
                # classification, we keep this to ensure consistency with the
                # training procedure.
                if is_training:
                    if FLAGS.version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset +
                                                           answer_length - 1]
                        # Only add answers where the text can be exactly
                        # recovered from the document. If this CAN'T happen
                        # it's likely due to weird Unicode stuff so we will
                        # just skip the example.
                        #
                        # Note that this means for training mode, every
                        # example is NOT guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = SquadMembershipExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    label_id=label_id)
                examples.append(example)
    random.shuffle(examples)
    return examples