def tokenize(self, text): """Tokenizes a piece of text.""" text = self._clean_text(text) # This was added on November 1st, 2018 for the multilingual and Chinese # models. This is also applied to the English models now, but it doesn't # matter since the English models were not trained on any Chinese data # and generally don't have any Chinese data in them (there are Chinese # characters in the vocabulary because Wikipedia does have some Chinese # words in the English Wikipedia.). text = self._tokenize_chinese_chars(text) orig_tokens = btok.whitespace_tokenize(text) split_tokens = [] for token in orig_tokens: # pass MASK forward if MASK in token: split_tokens.append(MASK) if token != MASK: remaining_chars = token.replace(MASK, "").strip() if remaining_chars: split_tokens.append(remaining_chars) continue if self.do_lower_case: token = token.lower() token = self._run_strip_accents(token) split_tokens.extend(self._run_split_on_punc(token)) output_tokens = btok.whitespace_tokenize(" ".join(split_tokens)) return output_tokens
def read_squad_examples(input_file, is_training):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                if is_training:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]
                    answer_length = len(orig_answer_text)
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[answer_offset + answer_length - 1]
                    actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(
                        whitespace_tokenize(orig_answer_text))
                    if actual_text.find(cleaned_answer_text) == -1:
                        logger.warning("Could not find answer: '%s' vs. '%s'",
                                       actual_text, cleaned_answer_text)
                        continue

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position)
                examples.append(example)
    return examples
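The `char_to_word_offset` bookkeeping above is the core trick shared by every SQuAD reader on this page: one table entry per character, mapping each character position to the index of the word containing it, so the dataset's character-level `answer_start` can be converted to word-level positions. A toy walkthrough (independent of any SQuAD file):

context = "Saint Bernadette Soubirous"
# Running the loop above on `context` yields:
#   doc_tokens          == ['Saint', 'Bernadette', 'Soubirous']
#   char_to_word_offset == [0]*6 + [1]*11 + [2]*9   (one entry per char;
#                          whitespace maps to the preceding word)
# so for answer "Bernadette" with answer_start == 6 and length 10:
#   start_position = char_to_word_offset[6]          -> 1
#   end_position   = char_to_word_offset[6 + 10 - 1] -> 1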
def tokenize(self, text):
    orig_tokens = tokenization.whitespace_tokenize(text)
    split_tokens = []
    for token in orig_tokens:
        for sub_token in self.wordpiece_tokenizer.tokenize(token):
            split_tokens.append(sub_token)
    return split_tokens
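Every snippet on this page delegates the initial split to `whitespace_tokenize`. In Google's original BERT `tokenization.py` it is just strip-and-split, reproduced here for reference:

def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens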
def read_many_examples(input_file, is_training):
    '''who was the american in space ? in space'''
    lines_list = span_utils.read_cols_lines(input_file=input_file)
    examples = []
    for i in range(len(lines_list)):
        paragraph_text = lines_list[i][0]
        answer_text = lines_list[i][1]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if span_utils.is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        if is_training:
            qas_id = 'train_' + str(i)
        else:
            qas_id = 'test_' + str(i)

        # question_text = 'abc'  # no use
        start_position = None
        end_position = None
        orig_answer_text = None
        if is_training:
            # if len(answer_text) != 1:
            #     raise ValueError("For training, each question should have exactly 1 answer.")
            orig_answer_text = answer_text
            answer_offset = paragraph_text.find(answer_text)  # answer_start
            answer_length = len(orig_answer_text)
            start_position = char_to_word_offset[answer_offset]
            print(paragraph_text, '\t', answer_text)
            end_position = char_to_word_offset[answer_offset + answer_length - 1]
            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
            if actual_text.find(cleaned_answer_text) == -1:
                continue
        else:
            orig_answer_text = answer_text

        example = SequenceExample(
            qas_id=qas_id,
            doc_tokens=doc_tokens,
            orig_answer_text=orig_answer_text,
            start_position=start_position,
            end_position=end_position)
        examples.append(example)
    return examples
def get_start_end_and_tokens(paragraph_text, question_text, orig_answer_text,
                             answer_offset, tokenizer, improve_flag=False):
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in paragraph_text:
        if is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

    answer_length = len(orig_answer_text)
    start_position = char_to_word_offset[answer_offset]
    end_position = char_to_word_offset[answer_offset + answer_length - 1]
    actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
    cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    tok_start_position = orig_to_tok_index[start_position]
    if end_position < len(doc_tokens) - 1:
        tok_end_position = orig_to_tok_index[end_position + 1] - 1
    else:
        tok_end_position = len(all_doc_tokens) - 1
    if improve_flag:
        tok_start_position, tok_end_position = _improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
            orig_answer_text)

    # Adjustment: shift both positions past "[CLS] question [SEP]" at the
    # front of the packed input; the +2 accounts for [CLS] and [SEP].
    tokenized_question = tokenizer.tokenize(question_text)
    tokenized_para = tokenizer.tokenize(paragraph_text)
    tok_start_position = tok_start_position + 2 + len(tokenized_question)
    tok_end_position = tok_end_position + 2 + len(tokenized_question)
    return tokenized_question, tokenized_para, tok_start_position, tok_end_position
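The `_improve_answer_span` helper called under `improve_flag` (and again in the script further down) matches the function of the same name in the original BERT `run_squad.py`: it nudges the WordPiece span inward so it lines up exactly with the tokenized answer. Sketch of that canonical implementation:

def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
                         orig_answer_text):
    """Returns a token span that better matches the annotated answer."""
    # The word-aligned span can carry extra surrounding WordPieces; scan every
    # sub-span and keep the one whose text equals the tokenized answer, e.g.
    # recovering "1895" out of the word "1895." once both are tokenized.
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)
    return (input_start, input_end)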
def tokenize(self, text): """Tokenizes a piece of text.""" text = self._clean_text(text) # This was added on November 1st, 2018 for the multilingual and Chinese # models. This is also applied to the English models now, but it doesn't # matter since the English models were not trained on any Chinese data # and generally don't have any Chinese data in them (there are Chinese # characters in the vocabulary because Wikipedia does have some Chinese # words in the English Wikipedia.). text = self._tokenize_chinese_chars(text) orig_tokens = whitespace_tokenize(text) split_tokens = [] for token in orig_tokens: if self.do_lower_case and token not in self.never_split: token = token.lower() token = self._run_strip_accents(token) split_tokens.extend(self._run_split_on_punc(token)) output_tokens = whitespace_tokenize(" ".join(split_tokens)) return output_tokens
def icd9_tokenizer_style(vocab, text_a):
    # We follow the same tokenization approach as the original paper.
    # @vocab is an object that converts words to indices in exactly the order
    # of the pretrained word vectors; use
    #   Vocab = load_vocab('/local/datdb/MIMIC3database/format10Jan2019/vocab+icd_index_map.txt')
    # @text_a can be split by space (in the case of preprocessed ICD-9 notes).
    tokens_a = whitespace_tokenize(text_a)
    input_ids = []
    for token in tokens_a:
        if token in vocab:
            input_ids.append(vocab[token])
        else:
            input_ids.append(1)  # unknown token id
    return input_ids, len(input_ids)
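A quick usage sketch with a made-up toy vocab dict (the real `vocab` comes from `load_vocab` as noted in the comment; mapping unknowns to id 1 is this snippet's own convention):

toy_vocab = {'[PAD]': 0, '[UNK]': 1, 'acute': 2, 'renal': 3, 'failure': 4}
ids, n = icd9_tokenizer_style(toy_vocab, "acute renal failure of unknownword")
# 'of' and 'unknownword' are out of vocabulary, so both map to 1.
assert ids == [2, 3, 4, 1, 1] and n == 5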
def tokenize(self, text): """Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform tokenization using the given vocabulary. For example: input = "unaffable" output = ["un", "##aff", "##able"] Args: text: A single token or whitespace separated tokens. This should have already been passed through `BasicTokenizer`. Returns: A list of wordpiece tokens. """ output_tokens = [] for token in whitespace_tokenize(text): chars = list(token) if len(chars) > self.max_input_chars_per_word: output_tokens.append(self.unk_token) continue is_bad = False start = 0 sub_tokens = [] while start < len(chars): end = len(chars) cur_substr = None while start < end: substr = "".join(chars[start:end]) if start > 0: substr = "##" + substr if substr in self.vocab: cur_substr = substr break end -= 1 if cur_substr is None: is_bad = True break sub_tokens.append(cur_substr) start = end if is_bad: output_tokens.append(self.unk_token) else: output_tokens.extend(sub_tokens) return output_tokens
def read_many_examples(input_file, is_training):
    '''2019.06.19'''
    lines_list = span_utils.read_cols_lines(input_file=input_file)
    examples = []
    for i in range(len(lines_list)):
        line_list = lines_list[i]
        paragraph_text = line_list[0]
        question_text = line_list[1]
        answer_text = line_list[2]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if span_utils.is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        if is_training:
            qas_id = 'train_' + str(i)
        else:
            qas_id = 'test_' + str(i)

        start_position = None
        end_position = None
        orig_answer_text = None
        if is_training:
            if len(answer_text) == 0:
                raise ValueError('For training, each question should have exactly 1 answer.')
            orig_answer_text = answer_text
            # answer_offset = paragraph_text.find(answer_text)
            answer_offset = span_utils.duplicate_word(
                paragraph_text=paragraph_text, span=question_text, headword=answer_text)
            answer_length = len(orig_answer_text)
            start_position = char_to_word_offset[answer_offset]
            end_position = char_to_word_offset[answer_offset + answer_length - 1]
            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
            if actual_text.find(cleaned_answer_text) == -1:
                continue
        else:
            orig_answer_text = answer_text

        example = SquadExample(
            qas_id=qas_id,
            question_text=question_text,
            doc_tokens=doc_tokens,
            doc_char_to_word_offset=char_to_word_offset,
            orig_answer_text=orig_answer_text,
            start_position=start_position,
            end_position=end_position)
        examples.append(example)
    return examples
def read_squad_examples(input_file, is_training, version_2_with_negative):
    a = 0  # counts answers that could not be recovered from the context
    quest = []
    abstract = []
    abstract_node = []  # newly added
    answer = []
    with open(input_file, encoding='utf-8') as f:
        content = f.read()
    text = json.loads(content)
    for content in text:
        for key in content:
            if key != "Abstract" and key != "am_id" and key != "transH" and key != "transE" and key != "metapth2vec":
                abstract.append(content["Abstract"])
                if content["am_id"] is None:
                    abstract_node.append([0] * 100)
                else:
                    abstract_node.append(content["metapth2vec"])  # newly added
                    # print(type(content["transH"]))  # list
                    # print(content["transH"])
                quest.append(questionbox[key])
                answer.append(content[key])
    total = len(quest)

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    i = 0
    while i < total:
        paragraph_text = abstract[i]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        # Tokenize the sentence, stripping out whitespace.
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        qas_id = str(i)
        question_text = quest[i]
        node = abstract_node[i]  # newly added
        start_position = None
        end_position = None
        orig_answer_text = None
        is_impossible = False
        if is_training:
            if version_2_with_negative:
                if answer[i] == '':
                    is_impossible = True
            if not is_impossible:
                # answer = qa["answers"][0]
                orig_answer_text = answer[i]
                answer_offset = paragraph_text.find(orig_answer_text)
                if answer_offset == -1:
                    print('-----------------------------')
                    print(i, 'answer not found in paragraph')
                    print(paragraph_text)
                    print(orig_answer_text)
                    print(type(orig_answer_text))
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_offset + answer_length - 1]
                # Only add answers where the text can be exactly recovered from the
                # document. If this CAN'T happen it's likely due to weird Unicode
                # stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is NOT
                # guaranteed to be preserved.
                actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                    a += 1
                    logger.warning("Could not find answer: '%s' vs. '%s'",
                                   actual_text, cleaned_answer_text)
                    i += 1
                    continue
            else:
                start_position = -1
                end_position = -1
                orig_answer_text = ""

        example = SquadExample(
            qas_id=qas_id,
            question_text=question_text,
            doc_tokens=doc_tokens,
            abstract_node=node,  # newly added
            orig_answer_text=orig_answer_text,
            start_position=start_position,
            end_position=end_position,
            is_impossible=is_impossible)
        examples.append(example)
        i += 1
    print(a)
    return examples
def read_thai_qa_examples(input_file, is_training):
    """Read a Thai QA pickle file into a list of ThaiQAExample."""
    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    input_data = pd.read_pickle(input_file)
    examples = []
    for _, entry in input_data.iterrows():
        start_position = None
        end_position = None
        paragraph_text = entry['paragraph'].replace('\n', '')
        question_id = entry['question_id']
        question_text = entry['question']
        try:
            orig_answer_text = entry['answer']
        except KeyError:  # original used a bare except
            orig_answer_text = None

        if entry['lang'] == 'thai':
            doc_tokens = word_tokenize(paragraph_text, engine='ulmfit')
            if is_training:
                char_to_word_offset = []
                for token_index, token in enumerate(doc_tokens):
                    for c in token:
                        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
                            char_to_word_offset.append(token_index - 1)
                        else:
                            char_to_word_offset.append(token_index)
                answer_offset = entry['start_pos']
                orig_answer_text = entry['answer']
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_offset + answer_length - 1]
        elif entry['lang'] != 'thai':
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)
            if is_training:
                answer_offset = entry['start_pos']
                answer_length = len(orig_answer_text)
                try:
                    start_position = char_to_word_offset[answer_offset]
                    end_position = char_to_word_offset[answer_offset + answer_length - 1]
                except IndexError:  # original used a bare except
                    start_position = -1
                    end_position = -1
                    orig_answer_text = ""
                # Only add answers where the text can be exactly recovered from the
                # document. If this CAN'T happen it's likely due to weird Unicode
                # stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is NOT
                # guaranteed to be preserved.
                actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(
                    whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                    logger.warning("Could not find answer: '%s' vs. '%s'",
                                   actual_text, cleaned_answer_text)
                    continue

        example = ThaiQAExample(
            qas_id=question_id,
            question_text=question_text,
            doc_tokens=doc_tokens,
            orig_answer_text=orig_answer_text,
            start_position=start_position,
            end_position=end_position)
        examples.append(example)
    return examples
def read_squad_examples(input_file, is_training=True, version_2_with_negative=True):
    """
    :param input_file: path of the (training) data file to read
    :param is_training: whether this is the training set
    :return: a list of SquadExample
    """
    with open(input_file, "r", encoding='utf-8') as reader:
        # dataset: every case in the data set
        dataset = json.load(reader)["data"]
        # print(len(dataset))  # 2000

    examples = []
    # Walk through and extract every case in dataset.
    for item in dataset:
        for paragraph in item['paragraphs']:
            content_text = paragraph['context']
            qas = paragraph['qas']
            doc_tokens = []
            char_to_word_offset = []
            # doc_tokens: one token per character (character-level tokenization).
            for word in content_text:
                doc_tokens.append(word)
                char_to_word_offset.append(len(doc_tokens) - 1)

            # qas holds all questions for one case; qa is a single question.
            for qa in qas:
                qa_id = qa['id']
                question_text = qa['question']
                # Default values.
                start_position = None
                end_position = None
                answer = None
                is_impossible = False
                is_yes = False
                is_no = False
                # If the data being read is training data:
                if is_training:
                    if version_2_with_negative:
                        if qa['is_impossible'] == 'false':
                            is_impossible = False
                        else:
                            is_impossible = True
                    # For training, each question should have exactly 1 answer.
                    if (len(qa['answers']) != 1) and (not is_impossible):
                        continue
                    if not is_impossible:
                        ans = qa['answers'][0]
                        answer = ans['text']
                        answer_start = ans['answer_start']
                        answer_length = len(answer)
                        start_position = char_to_word_offset[answer_start]
                        end_position = char_to_word_offset[answer_start + answer_length - 1]
                        real_answer = "".join(
                            doc_tokens[start_position:end_position + 1])
                        clean_answer = " ".join(whitespace_tokenize(answer))
                        # If the extracted answer cannot be matched against the context:
                        if real_answer.find(clean_answer) == -1:
                            if clean_answer == 'YES':
                                is_yes = True
                                answer = 'YES'
                                start_position = -1
                                end_position = -1
                            elif clean_answer == 'NO':
                                is_no = True
                                answer = 'NO'
                                start_position = -1
                                end_position = -1
                            else:
                                logger.warning(
                                    "could not find answer: '%s' vs. '%s'",
                                    real_answer, clean_answer)
                                continue
                    else:
                        start_position = -1
                        end_position = -1
                        answer = ""
                # end if is_training

                # example keys: qa_id, question_text, doc_tokens, answer, ...
                example = SquadExample(
                    qa_id=qa_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    answer=answer,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible,
                    is_yes=is_yes,
                    is_no=is_no,
                )
                examples.append(example)
    return examples
def read_squad_examples(input_file, is_training): """Read a SQuAD json file into a list of SquadExample.""" with open(input_file, "r") as reader: input_data = json.load(reader)["data"] def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False examples = [] # wikipidia的一篇文章 for entry in input_data: # 一篇文章中的一段内容 for paragraph in entry["paragraphs"]: # 文章的具体内容 paragraph_text = paragraph["context"] doc_tokens = [] # 从char的index到word的index,需要考虑空白的影响 char_to_word_offset = [] prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False # 每次读一个char都需要记录一次 char_to_word_offset.append(len(doc_tokens) - 1) # 遍历每一个 问题-答案 for qa in paragraph["qas"]: qas_id = qa["id"] # 问题的内容 question_text = qa["question"] start_position = None end_position = None orig_answer_text = None if is_training: # 训练数据集只有一个答案 # dev数据集每一个问题有三个答案,但是有些答案是相同的 if len(qa["answers"]) != 1: raise ValueError( "For training, each question should have exactly 1 answer.") # 答案 answer = qa["answers"][0] orig_answer_text = answer["text"] # 答案开始位置 answer_offset = answer["answer_start"] # 答案的字符长度 answer_length = len(orig_answer_text) # 答案开始的token位置 start_position = char_to_word_offset[answer_offset] # 答案接受的token的位置 end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. # 提取出来的答案token actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) # 这个是真的原始答案 # whitespace_tokenize 仅仅去除了空格,然后split一个array cleaned_answer_text = " ".join( whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: # 应该不会出现吧 logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue example = SquadExample( qas_id=qas_id, # 问答对的唯一id question_text=question_text, # 问题字符串 doc_tokens=doc_tokens, # passage的token数组 orig_answer_text=orig_answer_text, # 原始文本字符串 start_position=start_position, # 开始位置,在token数组中的位置 end_position=end_position) # 结束位置,在token数组中的位置 examples.append(example) return examples
def parse_json_squad(input_data, is_train):
    """Read a SQuAD json file into a list of SquadExample."""
    examples = list()
    for data_entry in input_data:
        for paragraph in data_entry['paragraphs']:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            # Q1. Write the code that tokenizes the context into doc_tokens
            # using is_whitespace(c).
            ###################################################################
            for char in paragraph_text:
                if is_whitespace(char):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(char)
                    else:
                        doc_tokens[-1] += char
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)  # which word is the character in?
            ###################################################################

            for qa in paragraph["qas"]:
                """
                {'answers': [{'answer_start', 'text'}], 'question', 'id'}
                """
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                if is_train:
                    if len(qa["answers"]) != 1:
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    # Q2. Referring to the variables at Line 34, fill in the
                    # parameters needed to build the SquadExample instance at Line 70.
                    ###############################################################
                    qas_id = qa["id"]               # fill in the blank -> assign None
                    question_text = qa["question"]  # fill in the blank
                    # index of word
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    answer_offset = answer["answer_start"]
                    answer_length = len(orig_answer_text)
                    start_position = char_to_word_offset[answer_offset]  # index of word
                    end_position = char_to_word_offset[answer_offset + answer_length - 1]  # index of word
                    ###############################################################

                    # CODE FOR handling exceptions.
                    # Only add answers where the text can be exactly recovered from the
                    # document. If this CAN'T happen it's likely due to weird Unicode
                    # stuff so we will just skip the example.
                    #
                    # Note that this means for training mode, every example is NOT
                    # guaranteed to be preserved.
                    actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(whitespace_tokenize(
                        orig_answer_text))  # split the sentence into words on whitespace
                    if actual_text.find(cleaned_answer_text) == -1:
                        logger.warning("Could not find answer: '%s' vs. '%s'",
                                       actual_text, cleaned_answer_text)
                        continue

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,  # the list of tokens (words) in the context
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position)
                examples.append(example)

    print("successfully converted input data into a set of {} examples".format(len(examples)))
    return examples
def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    # Read all the data in the train file.
    for num, entry in enumerate(tqdm(input_data, desc="Data")):
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            # Added by Yue: extract features from the paragraph text.
            # paragraph_text is the paragraph context.
            paragraph_features = extract_feature_matrix(paragraph_text)
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                # For a given question we have an id, a start position and an end position.
                qas_id = qa["id"]
                question_text = qa["question"]
                # Added by Yue: extract features from the question text.
                question_features = extract_feature_matrix(question_text)
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning("Could not find answer: '%s' vs. '%s'",
                                           actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                total_features = concatenate_features(question_features,
                                                      paragraph_features)
                # `example` holds everything for one example: qas_id,
                # question_text, and so on.
                example = SquadExample(  # Added by Yue: ling_features
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible,
                    ling_features=total_features)
                examples.append(example)
    return examples
def read_squad_examples(input_file, is_training, version_2_with_negative):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)["data"]

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens, char_to_word_offset = split_by_space(paragraph_text)
            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning("Could not find answer: '%s' vs. '%s'",
                                           actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_position,
                                       end_position=end_position,
                                       is_impossible=is_impossible)
                examples.append(example)
    return examples
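`split_by_space` is this variant's factored-out version of the whitespace loop that the other snippets inline; its definition is not shown here, but given how its two return values are used, a plausible sketch is:

def split_by_space(paragraph_text):
    # Presumed implementation (an assumption, not the project's actual code):
    # identical to the inline loops elsewhere on this page, returning word
    # tokens plus the char->word offset table.
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in paragraph_text:
        if c in " \t\r\n" or ord(c) == 0x202F:
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)
    return doc_tokens, char_to_word_offset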
def tokenize(self, text):
    return whitespace_tokenize(text)
def read_squad_examples(input_file, is_training): """Read a SQuAD json file into a list of SquadExample.""" """The following is the arch of the element in the list: ``` { "title": "University_of_Notre_Dame", "paragraphs": [ { "context": "Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.", "qas": [ { "answers": [ { "answer_start": 515, "text": "Saint Bernadette Soubirous" } ], "question": "To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?", "id": "5733be284776f41900661182" }, ... ... ] }, ... ... } ``` """ with open(input_file, "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] # list, len=442 def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False examples = [] for entry in input_data: for paragraph in entry["paragraphs"]: paragraph_text = paragraph["context"] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) for qa in paragraph["qas"]: qas_id = qa["id"] question_text = qa["question"] start_position = None end_position = None orig_answer_text = None if is_training: if len(qa["answers"]) != 1: raise ValueError( "For training, each question should have exactly 1 answer." ) answer = qa["answers"][0] orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join( doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue example = SquadExample(qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position) examples.append(example) return examples
def read(self, input_file, read_state, sample_ratio: float = 0.5,
         dialog_turns: int = 2, extra_sen_file: str = None) -> List[QAFullExample]:
    """
    :param input_file: input file to load data. The format is in CoQA style.
    :param read_state: If read extra sentences from CoQA dataset.
    :param sample_ratio: the ratio of negative sampling.
    :param dialog_turns: Decide how many turns' questions and answers will be
        appended before the current question.
    :param extra_sen_file: If read_extra_self is False, then this parameter
        must be specified as the path of the extra sentence file.
    """
    logger.info('Reading data set from {}...'.format(input_file))
    logger.info('Read parameters:')
    logger.info('Dialog turns: {}'.format(dialog_turns))
    logger.info('Read state: {}'.format(read_state))
    logger.info('Sample ratio: {}'.format(sample_ratio))
    logger.info('Extra sentence file: {}'.format(extra_sen_file))
    assert read_state in ReadState

    with open(input_file, "r", encoding='utf-8') as reader:
        input_data = json.load(reader)['data']

    def is_whitespace(ch):
        if ch == " " or ch == "\t" or ch == "\r" or ch == "\n" or ord(ch) == 0x202F:
            return True
        return False

    all_sentences = []
    if read_state == ReadState.SampleFromSelf:
        for paragraph in input_data:
            for sentence in self.sentence_tokenizer.tokenize(paragraph['story']):
                sentence_tokens = whitespace_tokenize(sentence)
                if sentence_tokens:
                    all_sentences.append(sentence_tokens)
                else:
                    logger.warning('Empty sentence!')
        # all_sentences.extend(
        #     [whitespace_tokenize(sentence) for sentence in self.sentence_tokenizer.tokenize(paragraph['story'])])
    elif read_state == ReadState.SampleFromExternal:
        pass
    logger.info('Read extra sentences: {}'.format(len(all_sentences)))

    examples = []
    for paragraph in input_data:
        paragraph_text = paragraph["story"]
        story_id = paragraph['id']
        doc_tokens = []
        prev_is_whitespace = True
        char_to_word_offset = []
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # Split context into sentences.
        sentence_start_list, sentence_end_list = utils.split_sentence(
            paragraph_text, self.sentence_tokenizer)
        sentence_span_list = []
        for c_start, c_end in zip(sentence_start_list, sentence_end_list):
            t_start = char_to_word_offset[c_start]
            t_end = char_to_word_offset[c_end]
            sentence_span_list.append((t_start, t_end))
        doc_sentence_tokens = [doc_tokens[span[0]:(span[1] + 1)]
                               for span in sentence_span_list]

        questions = paragraph['questions']
        answers = paragraph['answers']
        for i, (question, answer) in enumerate(zip(questions, answers)):
            question_text = question['input_text']

            # We are only concerned about questions with Yes/No as answers.
            answer_type = utils.normalize_answer(answer['input_text'])
            if answer_type not in ['yes', 'no']:
                continue
            if answer_type == 'yes':
                answer_choice = 0
            else:
                answer_choice = 1

            for j in range(dialog_turns):
                pre_idx = i - (j + 1)
                if pre_idx >= 0:
                    question_text = questions[pre_idx]['input_text'] + '<Q>' + \
                        answers[pre_idx]['input_text'] + '<A>' + question_text

            qas_id = story_id + '--' + str(i + 1)

            # Add rationale start and end as extra supervised labels.
            rationale_start_position = char_to_word_offset[answer['span_start']]
            rationale_end_position = char_to_word_offset[answer['span_end'] - 1]

            sentence_id = utils.find_evidence_sentence(
                sentence_span_list, rationale_start_position, rationale_end_position)

            # Add negative samples.
            if read_state != ReadState.NoNegative:
                new_doc_tokens, sentence_label, new_sentence_id, sentence_span_list, orig_token_map = \
                    utils.generate_seq_with_negative_sample(
                        doc_sentence_tokens, all_sentences, sample_ratio,
                        target_index=sentence_id)
                rationale_start_position = orig_token_map[rationale_start_position]
                rationale_end_position = orig_token_map[rationale_end_position]
            else:
                new_doc_tokens = doc_tokens
                sentence_label = [0] * len(sentence_span_list)
                new_sentence_id = sentence_id

            example = QAFullExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=new_doc_tokens,
                sentence_span_list=sentence_span_list,
                orig_answer_text="",
                start_position=None,
                end_position=None,
                sentence_id=new_sentence_id,
                is_impossible=answer_choice,
                ral_start_position=rationale_start_position,
                ral_end_position=rationale_end_position,
                meta_data={'sentence_label': sentence_label})
            examples.append(example)
    return examples
fin = open("/u/scratch/d/datduong/w2vModel1Gram9Jan2019/vocab.txt", "r") counter = 0 for line in tqdm(fin): if counter == 0: counter = 1 # skip header continue pubmed_vocab.append(line.split()[0]) fin.close() pubmed_vocab = set(pubmed_vocab) ## read in def, do white space, intersect with pubmed GOdb_vocab = [] GOdb = pd.read_csv("go_def_in_obo.tsv", sep="\t") for defin in list(GOdb['def']): token = whitespace_tokenize(defin) token = list(set(token)) GOdb_vocab = GOdb_vocab + token GOdb_vocab = set(GOdb_vocab) GOdb_vocab = GOdb_vocab.intersection(pubmed_vocab) GOdb_vocab = list(GOdb_vocab) GOdb_vocab.sort() GOdb_vocab = ['[PAD]', '[UNK]'] + GOdb_vocab ## ADD PADDING fout = open('word_pubmed_intersect_GOdb.txt', 'w') fout.write("\n".join(s for s in GOdb_vocab)) fout.close() ## create init embed.
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--inputs',
        required=True,
        nargs='+',
        help='files to process.',
    )
    parser.add_argument(
        '--output',
        required=True,
        metavar='DIR',
        help='Path for output',
    )
    args = parser.parse_args()
    print(args)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def process(s):
        try:
            return tokenizer.tokenize(s)
        except:
            print('failed on', s)
            raise

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    for inp in args.inputs:
        bad_qs = 0
        num_qs = 0
        filename = os.path.basename(inp)
        base_filename = os.path.splitext(filename)[0]
        s1_filename = base_filename + '_1.txt'
        s2_filename = base_filename + '_2.txt'
        s3_filename = base_filename + '_3.txt'
        s4_filename = base_filename + '_4.txt'
        id_filename = base_filename + '.id'
        label_filename = base_filename + '.lbl'
        with open(inp, 'r') as f_in, \
                open(os.path.join(args.output, s1_filename), 'w') as s1_out, \
                open(os.path.join(args.output, s2_filename), 'w') as s2_out, \
                open(os.path.join(args.output, id_filename), 'w') as id_out, \
                open(os.path.join(args.output, label_filename), 'w') as lbl_out, \
                open(os.path.join(args.output, s3_filename), 'w') as s3_out, \
                open(os.path.join(args.output, s4_filename), 'w') as s4_out:
            data = json.load(f_in)
            for example in data['data']:
                for p in example['paragraphs']:
                    context = p['context']
                    doc_tokens = []
                    char_to_word_offset = []
                    prev_is_whitespace = True
                    for c in context:
                        if is_whitespace(c):
                            prev_is_whitespace = True
                        else:
                            if prev_is_whitespace:
                                doc_tokens.append(c)
                            else:
                                doc_tokens[-1] += c
                            prev_is_whitespace = False
                        char_to_word_offset.append(len(doc_tokens) - 1)

                    orig_to_tok_index = []
                    tok_to_orig_index = []
                    all_doc_tokens = []
                    for (i, token) in enumerate(doc_tokens):
                        orig_to_tok_index.append(len(all_doc_tokens))
                        sub_tokens = process(token)
                        for sub_token in sub_tokens:
                            tok_to_orig_index.append(i)
                            all_doc_tokens.append(sub_token)

                    for qa in p['qas']:
                        num_qs += 1
                        q = process(qa['question'])
                        is_impossible = True  # qa['is_impossible']
                        answer = qa['answers'][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning("Could not find answer: '%s' vs. '%s'",
                                           actual_text, cleaned_answer_text)
                            continue
                        tok_start_position = orig_to_tok_index[start_position]
                        if end_position < len(doc_tokens) - 1:
                            tok_end_position = orig_to_tok_index[end_position + 1] - 1
                        else:
                            tok_end_position = len(all_doc_tokens) - 1
                        (tok_start_position, tok_end_position) = _improve_answer_span(
                            all_doc_tokens, tok_start_position, tok_end_position,
                            process, orig_answer_text)
                        if not is_impossible:
                            # print('bad question:', str(q))
                            bad_qs += 1
                            continue
                        print(' '.join(all_doc_tokens), file=s1_out)
                        print(' '.join(q), file=s2_out)
                        print(' '.join(doc_tokens), file=s3_out)
                        print(' '.join([str(ii) for ii in tok_to_orig_index]), file=s4_out)
                        print(qa['id'], file=id_out)
                        lbl_str = f'{int(is_impossible)}'
                        lbl_str += f' {tok_start_position} {tok_end_position}'
                        print(lbl_str, file=lbl_out)
        print('bad questions:', bad_qs, 'out of', num_qs)
def read_squad_examples(input_file, is_training): """Read a SQuAD json file into a list of SquadExample.""" with open(input_file, "r") as reader: input_data = json.load(reader)["data"] def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False examples = [] for entry in input_data: for paragraph in entry["paragraphs"]: paragraph_text = paragraph["context"] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) for qa in paragraph["qas"]: qas_id = qa["id"] question_text = qa["question"] start_position = None end_position = None orig_answer_text = None if is_training: if len(qa["answers"]) != 1: raise ValueError( "For training, each question should have exactly 1 answer." ) answer = qa["answers"][0] orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join( doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue example = SquadExample(qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position) examples.append(example) return examples
def read_squad_examples(input_data, is_training, version_2_with_negative):
    """Read a SQuAD json file into a list of SquadExample."""
    if type(input_data) == str:
        with open(input_data, "r", encoding='utf-8') as reader:
            input_data = json.load(reader)["data"]
    elif type(input_data) == dict:
        question = input_data['question']
        paragraphs = input_data['paragraphs']
        examples = []
        for p in paragraphs:
            examples.append({
                'context': p,
                'qas': [{
                    u'answers': [],
                    u'id': uuid.uuid4().hex,
                    u'question': question,
                    'is_impossible': True,
                    'plausible_answers': []
                }]
            })
        input_data = [{'title': question, 'paragraphs': examples}]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            # logger.warning("Could not find answer: '%s' vs. '%s'",
                            #                actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_position,
                                       end_position=end_position,
                                       is_impossible=is_impossible)
                examples.append(example)
    return examples
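This last variant also accepts a plain dict for on-the-fly inference, wrapping the question and raw paragraphs in a synthetic SQuAD-v2-style structure. A usage sketch (the question and paragraph strings are made up for illustration):

query = {
    'question': 'Who founded the university?',
    'paragraphs': [
        'The university was founded in 1842 by Rev. Edward Sorin.',
        'The main building is topped by a golden dome.',
    ],
}
# With is_training=False no answers are required, so each paragraph simply
# becomes one unanswered (is_impossible) SquadExample.
examples = read_squad_examples(query, is_training=False,
                               version_2_with_negative=True)
assert len(examples) == 2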