def _add_examples(self, examples, example_failures, paragraph, split):
    paragraph_text = paragraph["context"]
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in paragraph_text:
        if is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = self._tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    for qa in paragraph["qas"]:
        qas_id = qa["id"] if "id" in qa else None
        qid = qa["qid"] if "qid" in qa else None
        question_text = qa["question"]
        start_position = None
        end_position = None
        orig_answer_text = None
        is_impossible = False
        plau_answer_text = plau_answer_start_w = plau_answer_end_w = None
        if split == "train":
            if self.v2:
                is_impossible = qa["is_impossible"]
            if not is_impossible:
                if "detected_answers" in qa:  # MRQA format
                    answer = qa["detected_answers"][0]
                    answer_offset = answer["char_spans"][0][0]
                else:  # SQuAD format
                    answer = qa["answers"][0]
                    answer_offset = answer["answer_start"]
                orig_answer_text = answer["text"]
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                if answer_offset + answer_length - 1 >= len(char_to_word_offset):
                    utils.log("End position is out of document!")
                    example_failures[0] += 1
                    continue
                end_position = char_to_word_offset[answer_offset +
                                                   answer_length - 1]

                # Only add answers where the text can be exactly recovered from the
                # document. If this CAN'T happen it's likely due to weird Unicode
                # stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is NOT
                # guaranteed to be preserved.
                actual_text = " ".join(
                    doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(
                    tokenization.whitespace_tokenize(orig_answer_text))
                actual_text = actual_text.lower()
                cleaned_answer_text = cleaned_answer_text.lower()
                if actual_text.find(cleaned_answer_text) == -1:
                    utils.log("Could not find answer: '{:}' in doc vs. "
                              "'{:}' in provided answer".format(
                                  tokenization.printable_text(actual_text),
                                  tokenization.printable_text(
                                      cleaned_answer_text)))
                    example_failures[0] += 1
                    continue
            else:
                start_position = -1
                end_position = -1
                orig_answer_text = ""
                plausible_answers = qa.get("plausible_answers", None)
                if plausible_answers:
                    plau_answer_text = plausible_answers[0]["text"]
                    plau_answer_start = plausible_answers[0]["answer_start"]
                    plau_answer_length = len(plau_answer_text)
                    if plau_answer_start + plau_answer_length - 1 >= len(
                            char_to_word_offset):
                        tf.logging.warning("plausible answer error, pass.")
                        plau_answer_text = plau_answer_start_w = plau_answer_end_w = None
                    else:
                        plau_answer_start_w = char_to_word_offset[
                            plau_answer_start]
                        plau_answer_end_w = char_to_word_offset[
                            plau_answer_start + plau_answer_length - 1]
                        actual_text = " ".join(
                            doc_tokens[plau_answer_start_w:(
                                plau_answer_end_w + 1)])
                        cleaned_answer_text = " ".join(
                            tokenization.whitespace_tokenize(plau_answer_text))
                        actual_text = actual_text.lower()
                        cleaned_answer_text = cleaned_answer_text.lower()
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning("plausible answer error, pass.")
                            plau_answer_text = plau_answer_start_w = plau_answer_end_w = None

        example = QAExample(
            task_name=self.name,
            eid=len(examples),
            qas_id=qas_id,
            qid=qid,
            question_text=question_text,
            doc_tokens=doc_tokens,
            orig_answer_text=orig_answer_text,
            start_position=start_position,
            end_position=end_position,
            is_impossible=is_impossible,
            all_doc_tokens=all_doc_tokens,
            orig_to_tok_index=orig_to_tok_index,
            tok_to_orig_index=tok_to_orig_index,
            plau_answer_start=plau_answer_start_w,
            plau_answer_text=plau_answer_text,
            plau_answer_end=plau_answer_end_w,
        )
        examples.append(example)
def _add_examples(self, examples, example_failures, paragraph, split):
    paragraph_text = paragraph["context"]
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    if self.name in [
            "sacqa", "cmrc2018", "ccks42ee", "ccks42single", "ccks42multi"
    ]:  # for Chinese
        prev_is_chinese = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace or prev_is_chinese or is_chinese_char(c):
                    doc_tokens.append(c)
                    prev_is_chinese = True if is_chinese_char(c) else False
                else:
                    doc_tokens[-1] += c
                    prev_is_chinese = False
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)
    else:
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

    for qa in paragraph["qas"]:
        qas_id = qa["id"] if "id" in qa else None
        qid = qa["qid"] if "qid" in qa else None
        question_text = qa["question"]
        start_position = None
        end_position = None
        orig_answer_text = None
        is_impossible = False
        if split == "train":
            if self.v2:
                is_impossible = qa["is_impossible"]
            if not is_impossible:
                if "detected_answers" in qa:  # MRQA format
                    answer = qa["detected_answers"][0]
                    answer_offset = answer["char_spans"][0][0]
                else:  # SQuAD format
                    answer = qa["answers"][0]
                    answer_offset = answer["answer_start"]
                orig_answer_text = answer["text"]
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                if answer_offset + answer_length - 1 >= len(char_to_word_offset):
                    utils.log("End position is out of document!")
                    example_failures[0] += 1
                    continue
                end_position = char_to_word_offset[answer_offset +
                                                   answer_length - 1]

                # Only add answers where the text can be exactly recovered from the
                # document. If this CAN'T happen it's likely due to weird Unicode
                # stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is NOT
                # guaranteed to be preserved.
                if self.name in [
                        "sacqa", "cmrc2018", "ccks42ee", "ccks42single",
                        "ccks42multi"
                ]:  # for Chinese, no whitespace needed between tokens
                    actual_text = "".join(
                        doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = "".join(
                        tokenization.whitespace_tokenize(orig_answer_text))
                else:
                    actual_text = " ".join(
                        doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(
                        tokenization.whitespace_tokenize(orig_answer_text))
                actual_text = actual_text.lower()
                cleaned_answer_text = cleaned_answer_text.lower()
                if actual_text.find(cleaned_answer_text) == -1:
                    utils.log("Could not find answer: '{:}': '{:}' in doc vs. "
                              "'{:}' in provided answer".format(
                                  qas_id,
                                  tokenization.printable_text(actual_text),
                                  tokenization.printable_text(
                                      cleaned_answer_text)))
                    example_failures[0] += 1
                    continue
            else:
                start_position = -1
                end_position = -1
                orig_answer_text = ""

        example = QAExample(task_name=self.name,
                            eid=len(examples),
                            qas_id=qas_id,
                            qid=qid,
                            question_text=question_text,
                            doc_tokens=doc_tokens,
                            orig_answer_text=orig_answer_text,
                            start_position=start_position,
                            end_position=end_position,
                            is_impossible=is_impossible)
        examples.append(example)
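# The Chinese branch above assumes is_whitespace and is_chinese_char helpers
# defined elsewhere in the codebase. A plausible sketch, modeled on the
# reference BERT/SQuAD preprocessing (an assumption, not necessarily this
# repo's exact definitions):
def is_whitespace(c):
    return c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F

def is_chinese_char(c):
    # CJK Unified Ideograph blocks, as in BERT's tokenizer.
    cp = ord(c)
    return ((0x4E00 <= cp <= 0x9FFF) or (0x3400 <= cp <= 0x4DBF) or
            (0x20000 <= cp <= 0x2A6DF) or (0x2A700 <= cp <= 0x2B73F) or
            (0x2B740 <= cp <= 0x2B81F) or (0x2B820 <= cp <= 0x2CEAF) or
            (0xF900 <= cp <= 0xFAFF) or (0x2F800 <= cp <= 0x2FA1F))

# With these helpers, a mixed-script context such as "小明在Google工作" tokenizes
# to ["小", "明", "在", "Google", "工", "作"]: each CJK character becomes its own
# token while Latin runs stay whole, so char_to_word_offset stays aligned.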
def _add_examples(self, examples, example_failures, paragraph, split):
    paragraph_text = paragraph["context"]
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in paragraph_text:
        if is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

    # def parse(sentence):
    #     """Parse a sentence and return its dependency heads etc."""
    #     doc = nlp(sentence)
    #     heads = []
    #     words = []
    #     for sent in doc.sentences:
    #         heads_tmp = []
    #         num_tmp = sum([len(x) if x else 0 for x in heads])
    #         for word in sent.words:
    #             words.append(word.text)
    #             if word.head == 0:
    #                 heads_tmp.append(0)
    #             else:
    #                 heads_tmp.append(word.head + num_tmp)
    #         heads.append(heads_tmp)
    #     heads = reduce(lambda x, y: x + y, heads)
    #     return heads, words
    #
    # def parse_and_trim(tokens):
    #     """Given a whitespace-tokenized token list, parse it and realign
    #     the heads to the input tokens."""
    #     heads, words = parse(" ".join(tokens))
    #     t2w = {}
    #     w2t = {}
    #     ti = 0
    #     wi = 0
    #     last_move = None  # controls alternating pointer movement
    #     while (ti < len(tokens)) and (wi < len(words)):
    #         if tokens[ti] == words[wi]:
    #             t2w[ti] = wi
    #             w2t[wi] = ti
    #             ti += 1
    #             wi += 1
    #             last_move = None
    #         elif tokens[ti] in words[wi]:
    #             t2w[ti] = wi
    #             if wi not in w2t:
    #                 w2t[wi] = ti
    #             ti += 1
    #             last_move = 't'
    #         elif words[wi] in tokens[ti]:
    #             w2t[wi] = ti
    #             if ti not in t2w:
    #                 t2w[ti] = wi
    #             wi += 1
    #             last_move = 'w'
    #         else:
    #             if last_move == 'w':
    #                 ti += 1
    #                 last_move = 't'
    #             elif last_move == 't':
    #                 wi += 1
    #                 last_move = 'w'
    #             else:
    #                 wi += 1
    #                 ti += 1
    #                 last_move = None
    #     heads_ = []
    #     for ti in range(len(tokens)):
    #         wi = t2w.get(ti, None)
    #         if wi is not None:
    #             h = heads[wi]
    #             if h == 0:
    #                 heads_.append(0)
    #             else:
    #                 h_ = w2t.get(h - 1, None)
    #                 if h_ is not None:
    #                     heads_.append(h_ + 1)
    #                 else:
    #                     heads_.append(ti + 1)
    #         else:
    #             heads_.append(ti + 1)
    #     return heads_
    #
    # def heads_2_dep_matrix(heads):
    #     """Convert dependency heads into a dependency matrix."""
    #     arr = np.diag((1,) * len(heads))
    #     for i, j in enumerate(heads):
    #         if j != 0:
    #             arr[i, j - 1] = 1
    #     while True:  # propagate transitive dependencies
    #         arr1 = np.matmul(arr, arr)
    #         arr1[arr1 > 1] = 1
    #         if (arr1 == arr).all():
    #             break
    #         else:
    #             arr = arr1
    #     return arr

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    # heads = parse_and_trim(doc_tokens)  # dependency heads
    for (i, token) in enumerate(doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = self._tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)
    # heads_piece = []
    # last_orig_index = None
    # for ind in range(len(all_doc_tokens)):
    #     orig_index = tok_to_orig_index[ind]
    #     if orig_index == last_orig_index:
    #         heads_piece.append(ind)
    #     else:
    #         h = heads[orig_index]
    #         if h == 0:
    #             heads_piece.append(0)
    #         else:
    #             heads_piece.append(orig_to_tok_index[h - 1] + 1)
    #     last_orig_index = orig_index
    # all_doc_tokens_dep_mask = heads_2_dep_matrix(heads_piece)

    for qa in paragraph["qas"]:
        qas_id = qa["id"] if "id" in qa else None
        qid = qa["qid"] if "qid" in qa else None
        question_text = qa["question"]
        start_position = None
        end_position = None
        orig_answer_text = None
        is_impossible = False
        plau_answer_text = plau_answer_start_w = plau_answer_end_w = None
        if split == "train":
            if self.v2:
                is_impossible = qa["is_impossible"]
            if not is_impossible:
                if "detected_answers" in qa:  # MRQA format
                    answer = qa["detected_answers"][0]
                    answer_offset = answer["char_spans"][0][0]
                else:  # SQuAD format
                    answer = qa["answers"][0]
                    answer_offset = answer["answer_start"]
                orig_answer_text = answer["text"]
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                if answer_offset + answer_length - 1 >= len(char_to_word_offset):
                    utils.log("End position is out of document!")
                    example_failures[0] += 1
                    continue
                end_position = char_to_word_offset[answer_offset +
                                                   answer_length - 1]

                # Only add answers where the text can be exactly recovered from the
                # document. If this CAN'T happen it's likely due to weird Unicode
                # stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is NOT
                # guaranteed to be preserved.
                actual_text = " ".join(
                    doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(
                    tokenization.whitespace_tokenize(orig_answer_text))
                actual_text = actual_text.lower()
                cleaned_answer_text = cleaned_answer_text.lower()
                if actual_text.find(cleaned_answer_text) == -1:
                    utils.log("Could not find answer: '{:}' in doc vs. "
                              "'{:}' in provided answer".format(
                                  tokenization.printable_text(actual_text),
                                  tokenization.printable_text(
                                      cleaned_answer_text)))
                    example_failures[0] += 1
                    continue
            else:
                start_position = -1
                end_position = -1
                orig_answer_text = ""
                plausible_answers = qa.get("plausible_answers", None)
                if plausible_answers:
                    plau_answer_text = plausible_answers[0]["text"]
                    plau_answer_start = plausible_answers[0]["answer_start"]
                    plau_answer_length = len(plau_answer_text)
                    if plau_answer_start + plau_answer_length - 1 >= len(
                            char_to_word_offset):
                        tf.logging.warning("plausible answer error, pass.")
                        plau_answer_text = plau_answer_start_w = plau_answer_end_w = None
                    else:
                        plau_answer_start_w = char_to_word_offset[
                            plau_answer_start]
                        plau_answer_end_w = char_to_word_offset[
                            plau_answer_start + plau_answer_length - 1]
                        actual_text = " ".join(
                            doc_tokens[plau_answer_start_w:(
                                plau_answer_end_w + 1)])
                        cleaned_answer_text = " ".join(
                            tokenization.whitespace_tokenize(plau_answer_text))
                        actual_text = actual_text.lower()
                        cleaned_answer_text = cleaned_answer_text.lower()
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning("plausible answer error, pass.")
                            plau_answer_text = plau_answer_start_w = plau_answer_end_w = None

        example = QAExample(
            task_name=self.name,
            eid=len(examples),
            qas_id=qas_id,
            qid=qid,
            question_text=question_text,
            doc_tokens=doc_tokens,
            orig_answer_text=orig_answer_text,
            start_position=start_position,
            end_position=end_position,
            is_impossible=is_impossible,
            all_doc_tokens=all_doc_tokens,
            orig_to_tok_index=orig_to_tok_index,
            tok_to_orig_index=tok_to_orig_index,
            # all_doc_tokens_dep_mask=all_doc_tokens_dep_mask,
            plau_answer_start=plau_answer_start_w,
            plau_answer_text=plau_answer_text,
            plau_answer_end=plau_answer_end_w,
        )
        examples.append(example)
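# A runnable sketch of the heads_2_dep_matrix idea kept (commented out) above:
# heads[i] is the 1-based index of token i's syntactic head, with 0 marking
# the root, and the matrix is closed under composition so every token also
# "sees" its ancestors. numpy and the function name come from the dead code.
import numpy as np

def heads_2_dep_matrix(heads):
    arr = np.diag((1,) * len(heads))
    for i, h in enumerate(heads):
        if h != 0:
            arr[i, h - 1] = 1  # token i depends on its (1-based) head
    while True:  # propagate transitive dependencies to a fixed point
        arr1 = np.matmul(arr, arr)
        arr1[arr1 > 1] = 1
        if (arr1 == arr).all():
            break
        arr = arr1
    return arr

# e.g. heads_2_dep_matrix([2, 0, 2]) gives
# [[1, 1, 0],
#  [0, 1, 0],
#  [0, 1, 1]]: tokens 1 and 3 depend on the root token 2.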