def test_is_control(self):
    self.assertTrue(tokenization._is_control(u"\u0005"))
    self.assertFalse(tokenization._is_control(u"A"))
    self.assertFalse(tokenization._is_control(u" "))
    self.assertFalse(tokenization._is_control(u"\t"))
    self.assertFalse(tokenization._is_control(u"\r"))
def customize_tokenizer(text, do_lower_case=False):
    """Split text into basic tokens, isolating every Chinese character,
    punctuation mark, whitespace, and control character."""
    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
    temp_x = ""
    text = tokenization.convert_to_unicode(text)
    for c in text:
        if (tokenizer._is_chinese_char(ord(c)) or tokenization._is_punctuation(c)
                or tokenization._is_whitespace(c) or tokenization._is_control(c)):
            temp_x += " " + c + " "
        else:
            temp_x += c
    if do_lower_case:
        temp_x = temp_x.lower()
    return temp_x.split()
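# A quick illustration of what customize_tokenizer returns (the sample sentence is
# only an assumption for demonstration, not taken from the dataset): each Chinese
# character and punctuation mark becomes its own token, while contiguous ASCII runs
# stay together, e.g.
#
#   customize_tokenizer(u"BERT模型很好用。")
#   # -> ["BERT", "模", "型", "很", "好", "用", "。"]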
def read_squad_examples(input_file, is_training):
    """Read a SQuAD json file into a list of SquadExample."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            # Tokenize the context into basic tokens (Chinese characters,
            # punctuation, and whitespace-delimited words).
            raw_doc_tokens = customize_tokenizer(paragraph_text, do_lower_case=False)
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            k = 0
            temp_word = ""
            for c in paragraph_text:
                # Treat control characters the same as whitespace: map them to
                # the previous token instead of starting a new one.
                if tokenization._is_whitespace(c) or tokenization._is_control(c):
                    char_to_word_offset.append(k - 1)
                    continue
                else:
                    temp_word += c
                    char_to_word_offset.append(k)
                if temp_word == raw_doc_tokens[k]:
                    doc_tokens.append(temp_word)
                    temp_word = ""
                    k += 1

            print("k:%d;raw_doc_tokens:%d" % (k, len(raw_doc_tokens)))
            print("qas_id:%s" % (paragraph["qas"][0]["id"]))
            if k != len(raw_doc_tokens):
                print("doc_tokens:")
                print(doc_tokens)
                print("raw_doc_tokens:")
                print(raw_doc_tokens)
            assert k == len(raw_doc_tokens)

            # Answer positions expressed as basic-token indices.
            start_positions = []
            end_positions = []
            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                for answer in qa["answers"]:
                    orig_answer_text = answer["text"]
                    if orig_answer_text not in paragraph_text:
                        tf.logging.warning("Could not find answer")
                    else:
                        answer_offset = paragraph_text.index(orig_answer_text)
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]

                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = "".join(doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = "".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning("Could not find answer: '%s' vs. '%s'",
                                               actual_text, cleaned_answer_text)
                            continue
                        start_positions.append(start_position)
                        end_positions.append(end_position)

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_positions,
                    end_position=end_positions)
                examples.append(example)

    tf.logging.info("**********read_squad_examples complete!**********")
    return examples
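# A minimal usage sketch (the file name below is a placeholder, not part of this
# module): load a training split and report how many SquadExample objects were built.
#
#   train_examples = read_squad_examples("train.json", is_training=True)
#   tf.logging.info("Loaded %d examples", len(train_examples))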