def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % (tokenization.printable_text( self.question_text)) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % self.start_position if self.start_position: s += ", end_position: %d" % self.end_position if self.start_position: s += ", is_impossible: %r" % self.is_impossible return s
def __str__(self):
    s = ''
    s += 'tokens: %s\n' % (' '.join(
        [tokenization.printable_text(x) for x in self.tokens]))
    s += 'segment_ids: %s\n' % (' '.join(
        [str(x) for x in self.segment_ids]))
    s += 'is_random_next: %s\n' % self.is_random_next
    s += 'masked_lm_positions: %s\n' % (' '.join(
        [str(x) for x in self.masked_lm_positions]))
    s += 'masked_lm_labels: %s\n' % (' '.join(
        [tokenization.printable_text(x) for x in self.masked_lm_labels]))
    s += '\n'
    return s
def __str__(self): s = "" s += "tokens: %s\n" % (" ".join( [tokenization.printable_text(x) for x in self.tokens])) s += "segment_ids: %s\n" % (" ".join( [str(x) for x in self.segment_ids])) s += "is_random_next: %s\n" % self.is_random_next s += "masked_lm_positions: %s\n" % (" ".join( [str(x) for x in self.masked_lm_positions])) s += "masked_lm_labels: %s\n" % (" ".join( [tokenization.printable_text(x) for x in self.masked_lm_labels])) s += "\n" return s
def __repr__(self): s = "" s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) s += ", question_text: %s" % ( tokenization.printable_text(self.question_text)) s += ", options: [%s]" % (" ".join(self.options)) if self.combination_options: s += ", combination_options: %s" % self.combination_options if self.evidences: s += ", evidences: %s" % self.evidences if self.answers: s += ", answers: %s" % self.answers return s
def print_tokens(inputs: Inputs, inv_vocab, updates_mask=None):
    """Pretty-print model inputs."""
    pos_to_tokid = {}
    for tokid, pos, weight in zip(
            inputs.masked_lm_ids[0],
            inputs.masked_lm_positions[0],
            inputs.masked_lm_weights[0],
    ):
        if weight == 0:
            pass
        else:
            pos_to_tokid[pos] = tokid

    text = ""
    provided_update_mask = updates_mask is not None
    if not provided_update_mask:
        updates_mask = np.zeros_like(inputs.input_ids)
    for pos, (tokid, um) in enumerate(
            zip(inputs.input_ids[0], updates_mask[0])):
        token = inv_vocab[tokid]
        if token == "[PAD]":
            break
        if pos in pos_to_tokid:
            token = RED + token + " (" + inv_vocab[
                pos_to_tokid[pos]] + ")" + ENDC
            if provided_update_mask:
                assert um == 1
        else:
            if provided_update_mask:
                assert um == 0
        text += token + " "
    utils.log(tokenization.printable_text(text))
def featurize(self, example: InputExample, is_training, log=False):
    """Turn an InputExample into a dict of features."""
    # tokens_a = self._tokenizer.tokenize(example.text_a)
    tokens_b = None
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for seg_id, text in enumerate(example.text_a):
        tokens_a = self._tokenizer.tokenize(text)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(seg_id)
        tokens.append("[SEP]")
        segment_ids.append(seg_id)
    if len(tokens) > self.config.max_seq_length:
        tokens = tokens[:self.config.max_seq_length - 1] + [tokens[-1]]
        segment_ids = (segment_ids[:self.config.max_seq_length - 1] +
                       [segment_ids[-1]])

    input_ids = self._tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < self.config.max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == self.config.max_seq_length
    assert len(input_mask) == self.config.max_seq_length
    assert len(segment_ids) == self.config.max_seq_length

    if log:
        utils.log("  Example {:}".format(example.eid))
        utils.log("  tokens: {:}".format(" ".join(
            [tokenization.printable_text(x) for x in tokens])))
        utils.log("  input_ids: {:}".format(" ".join(map(str, input_ids))))
        utils.log("  input_mask: {:}".format(" ".join(map(str, input_mask))))
        utils.log("  segment_ids: {:}".format(" ".join(map(str, segment_ids))))

    eid = example.eid
    features = {
        "input_ids": input_ids,
        "input_mask": input_mask,
        "segment_ids": segment_ids,
        "task_id": self.config.task_names.index(self.name),
        self.name + "_eid": eid,
    }
    self._add_features(features, example, log)
    return features
def print_tokens(inputs: Inputs, inv_vocab, updates_mask=None):
    """Pretty-print model inputs."""
    batch_size = len(inputs.masked_lm_ids)
    provided_update_mask = (updates_mask is not None)
    if not provided_update_mask:
        updates_mask = np.zeros_like(inputs.input_ids)
    for i in range(batch_size):
        pos_to_tokid = {}
        for tokid, pos, weight in zip(inputs.masked_lm_ids[i],
                                      inputs.masked_lm_positions[i],
                                      inputs.masked_lm_weights[i]):
            if weight == 0:
                pass
            else:
                pos_to_tokid[pos] = tokid

        text = ""
        for pos, (tokid, tag, um) in enumerate(
                zip(inputs.input_ids[i], inputs.tag_ids[i], updates_mask[i])):
            token = inv_vocab[tokid]
            if tag == -1:
                tag = 0
            if token == "[PAD]":
                break
            if pos in pos_to_tokid:
                # token = RED + token + " (" + inv_vocab[pos_to_tokid[pos]] + ")" + ENDC
                token = token + " (" + inv_vocab[pos_to_tokid[pos]] + ")"
                if provided_update_mask:
                    assert um == 1
            else:
                if provided_update_mask:
                    assert um == 0
            # tag_print = GREEN + " _" + NAMES[tag] + "_ " + ENDC
            tag_print = " _" + NAMES[tag] + "_ "
            text += token + tag_print + " "
        utils.log(tokenization.printable_text(text))
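# The two print_tokens variants above reference RED / GREEN / ENDC (ANSI color
# escapes) and a NAMES tag-id-to-name lookup that are defined elsewhere in the
# module. A minimal sketch of plausible definitions, included for reference
# only; the exact values and names in the repository may differ:
ENDC = "\033[0m"    # reset terminal color
RED = "\033[91m"    # highlight masked positions
GREEN = "\033[92m"  # highlight tag annotations
# NAMES is assumed to map tag ids to human-readable tag names, e.g.:
# NAMES = ["O", "B-ENT", "I-ENT"]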
def featurize(self, example: QAExample, is_training, log=False,
              for_eval=False):
    all_features = []
    query_tokens = self._tokenizer.tokenize(example.question_text)

    if len(query_tokens) > self.config.max_query_length:
        query_tokens = query_tokens[0:self.config.max_query_length]

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(example.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = self._tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    tok_start_position = None
    tok_end_position = None
    if is_training and example.is_impossible:
        tok_start_position = -1
        tok_end_position = -1
    if is_training and not example.is_impossible:
        tok_start_position = orig_to_tok_index[example.start_position]
        if example.end_position < len(example.doc_tokens) - 1:
            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
        else:
            tok_end_position = len(all_doc_tokens) - 1
        (tok_start_position, tok_end_position) = _improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position,
            self._tokenizer, example.orig_answer_text)

    # The -3 accounts for [CLS], [SEP] and [SEP]
    max_tokens_for_doc = self.config.max_seq_length - len(query_tokens) - 3

    # We can have documents that are longer than the maximum sequence length.
    # To deal with this we do a sliding window approach, where we take chunks
    # of up to our max length with a stride of `doc_stride`.
    _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
        "DocSpan", ["start", "length"])
    doc_spans = []
    start_offset = 0
    while start_offset < len(all_doc_tokens):
        length = len(all_doc_tokens) - start_offset
        if length > max_tokens_for_doc:
            length = max_tokens_for_doc
        doc_spans.append(_DocSpan(start=start_offset, length=length))
        if start_offset + length == len(all_doc_tokens):
            break
        start_offset += min(length, self.config.doc_stride)

    for (doc_span_index, doc_span) in enumerate(doc_spans):
        tokens = []
        token_to_orig_map = {}
        token_is_max_context = {}
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in query_tokens:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        for i in range(doc_span.length):
            split_token_index = doc_span.start + i
            token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]

            is_max_context = _check_is_max_context(doc_spans, doc_span_index,
                                                   split_token_index)
            token_is_max_context[len(tokens)] = is_max_context
            tokens.append(all_doc_tokens[split_token_index])
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

        input_ids = self._tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < self.config.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == self.config.max_seq_length
        assert len(input_mask) == self.config.max_seq_length
        assert len(segment_ids) == self.config.max_seq_length

        start_position = None
        end_position = None
        if is_training and not example.is_impossible:
            # For training, if our document chunk does not contain an
            # annotation we throw it out, since there is nothing to predict.
            doc_start = doc_span.start
            doc_end = doc_span.start + doc_span.length - 1
            out_of_span = False
            if not (tok_start_position >= doc_start and
                    tok_end_position <= doc_end):
                out_of_span = True
            if out_of_span:
                start_position = 0
                end_position = 0
            else:
                doc_offset = len(query_tokens) + 2
                start_position = tok_start_position - doc_start + doc_offset
                end_position = tok_end_position - doc_start + doc_offset

        if is_training and example.is_impossible:
            start_position = 0
            end_position = 0

        if log:
            utils.log("*** Example ***")
            utils.log("doc_span_index: %s" % doc_span_index)
            utils.log("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in tokens]))
            utils.log("token_to_orig_map: %s" % " ".join([
                "%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)
            ]))
            utils.log("token_is_max_context: %s" % " ".join([
                "%d:%s" % (x, y)
                for (x, y) in six.iteritems(token_is_max_context)
            ]))
            utils.log("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            utils.log("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            utils.log("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            if is_training and example.is_impossible:
                utils.log("impossible example")
            if is_training and not example.is_impossible:
                answer_text = " ".join(tokens[start_position:(end_position + 1)])
                utils.log("start_position: %d" % start_position)
                utils.log("end_position: %d" % end_position)
                utils.log("answer: %s" % (tokenization.printable_text(answer_text)))

        features = {
            "task_id": self.config.task_names.index(self.name),
            self.name + "_eid": (1000 * example.eid) + doc_span_index,
            "input_ids": input_ids,
            "input_mask": input_mask,
            "segment_ids": segment_ids,
        }
        if for_eval:
            features.update({
                self.name + "_doc_span_index": doc_span_index,
                self.name + "_tokens": tokens,
                self.name + "_token_to_orig_map": token_to_orig_map,
                self.name + "_token_is_max_context": token_is_max_context,
            })
        if is_training:
            features.update({
                self.name + "_start_positions": start_position,
                self.name + "_end_positions": end_position,
                self.name + "_is_impossible": example.is_impossible
            })
        all_features.append(features)
    return all_features
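# The featurizer above calls _improve_answer_span and _check_is_max_context,
# which are not included in this section. The sketches below follow the
# standard BERT run_squad.py implementations and are shown for reference;
# the repository's own versions may differ slightly.
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
                         orig_answer_text):
    """Returns tokenized answer spans that better match the annotated answer."""
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)
    return (input_start, input_end)


def _check_is_max_context(doc_spans, cur_span_index, position):
    """Checks if this is the doc span with the most context for the token."""
    # A token appearing in several overlapping spans is scored by how much
    # left/right context each span gives it; the span with the best score wins.
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span.start + doc_span.length - 1
        if position < doc_span.start:
            continue
        if position > end:
            continue
        num_left_context = position - doc_span.start
        num_right_context = end - position
        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index
    return cur_span_index == best_span_index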
def _add_examples(self, examples, example_failures, paragraph, split):
    paragraph_text = paragraph["context"]
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    if self.name in [
            "sacqa", "cmrc2018", "ccks42ee", "ccks42single", "ccks42multi"
    ]:  # for Chinese
        prev_is_chinese = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace or prev_is_chinese or is_chinese_char(c):
                    doc_tokens.append(c)
                    prev_is_chinese = True if is_chinese_char(c) else False
                else:
                    doc_tokens[-1] += c
                    prev_is_chinese = False
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)
    else:
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

    for qa in paragraph["qas"]:
        qas_id = qa["id"] if "id" in qa else None
        qid = qa["qid"] if "qid" in qa else None
        question_text = qa["question"]
        start_position = None
        end_position = None
        orig_answer_text = None
        is_impossible = False
        if split == "train":
            if self.v2:
                is_impossible = qa["is_impossible"]
            if not is_impossible:
                if "detected_answers" in qa:  # MRQA format
                    answer = qa["detected_answers"][0]
                    answer_offset = answer["char_spans"][0][0]
                else:  # SQuAD format
                    answer = qa["answers"][0]
                    answer_offset = answer["answer_start"]
                orig_answer_text = answer["text"]
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                if answer_offset + answer_length - 1 >= len(char_to_word_offset):
                    utils.log("End position is out of document!")
                    example_failures[0] += 1
                    continue
                end_position = char_to_word_offset[answer_offset +
                                                   answer_length - 1]

                # Only add answers where the text can be exactly recovered from
                # the document. If this CAN'T happen it's likely due to weird
                # Unicode stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is NOT
                # guaranteed to be preserved.
                if self.name in [
                        "sacqa", "cmrc2018", "ccks42ee", "ccks42single",
                        "ccks42multi"
                ]:  # for Chinese, no whitespace needed
                    actual_text = "".join(
                        doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = "".join(
                        tokenization.whitespace_tokenize(orig_answer_text))
                else:
                    actual_text = " ".join(
                        doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(
                        tokenization.whitespace_tokenize(orig_answer_text))
                actual_text = actual_text.lower()
                cleaned_answer_text = cleaned_answer_text.lower()
                if actual_text.find(cleaned_answer_text) == -1:
                    utils.log("Could not find answer: '{:}': '{:}' in doc vs. "
                              "'{:}' in provided answer".format(
                                  qas_id,
                                  tokenization.printable_text(actual_text),
                                  tokenization.printable_text(cleaned_answer_text)))
                    example_failures[0] += 1
                    continue
            else:
                start_position = -1
                end_position = -1
                orig_answer_text = ""

        example = QAExample(task_name=self.name,
                            eid=len(examples),
                            qas_id=qas_id,
                            qid=qid,
                            question_text=question_text,
                            doc_tokens=doc_tokens,
                            orig_answer_text=orig_answer_text,
                            start_position=start_position,
                            end_position=end_position,
                            is_impossible=is_impossible)
        examples.append(example)
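# The character-level tokenization above depends on is_whitespace and
# is_chinese_char helpers defined elsewhere. Minimal sketches following the
# standard BERT tokenization logic are included below for reference; they are
# assumptions, not copies of the repository's own helpers.
def is_whitespace(c):
    # Treat standard whitespace plus the narrow no-break space as separators.
    return c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F


def is_chinese_char(c):
    # True if the character falls in a CJK Unicode block
    # (cf. BERT's tokenization._is_chinese_char).
    cp = ord(c)
    return ((0x4E00 <= cp <= 0x9FFF) or (0x3400 <= cp <= 0x4DBF) or
            (0x20000 <= cp <= 0x2A6DF) or (0x2A700 <= cp <= 0x2B73F) or
            (0x2B740 <= cp <= 0x2B81F) or (0x2B820 <= cp <= 0x2CEAF) or
            (0xF900 <= cp <= 0xFAFF) or (0x2F800 <= cp <= 0x2FA1F))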
def convert_examples_to_features(examples, label_list, max_seq_length,
                                 tokenizer, output_file):
    """Loads a data file into a list of `InputBatch`s."""
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i

    writer = tf.python_io.TFRecordWriter(output_file)
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" %
                            (ex_index, len(examples)))

        query_feature = get_features(example.query,
                                     max_seq_length=max_seq_length,
                                     tokenizer=tokenizer)
        cand1_feature = get_features(example.cand1,
                                     max_seq_length=max_seq_length,
                                     tokenizer=tokenizer)
        cand2_feature = get_features(example.cand2,
                                     max_seq_length=max_seq_length,
                                     tokenizer=tokenizer)
        cand3_feature = get_features(example.cand3,
                                     max_seq_length=max_seq_length,
                                     tokenizer=tokenizer)
        label_id = label_map[example.label]

        if ex_index < 20:
            tf.logging.info("*** Example ***")
            tf.logging.info("guid: %s" % (example.guid))
            tf.logging.info("query tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in query_feature[0]]))
            tf.logging.info("query input_ids: %s" %
                            " ".join([str(x) for x in query_feature[1]]))
            tf.logging.info("query input_mask: %s" %
                            " ".join([str(x) for x in query_feature[2]]))
            tf.logging.info("query segment_ids: %s" %
                            " ".join([str(x) for x in query_feature[3]]))
            for idx, cand_feature in enumerate(
                    [cand1_feature, cand2_feature, cand3_feature], start=1):
                tf.logging.info("candidate %d tokens: %s" % (idx, " ".join(
                    [tokenization.printable_text(x) for x in cand_feature[0]])))
                tf.logging.info("candidate %d input_ids: %s" %
                                (idx, " ".join([str(x) for x in cand_feature[1]])))
                tf.logging.info("candidate %d input_mask: %s" %
                                (idx, " ".join([str(x) for x in cand_feature[2]])))
                tf.logging.info("candidate %d segment_ids: %s" %
                                (idx, " ".join([str(x) for x in cand_feature[3]])))
            tf.logging.info("\nlabel: %s (id = %d)" % (example.label, label_id))

        def create_int_feature(values):
            feature = tf.train.Feature(int64_list=tf.train.Int64List(
                value=list(values)))
            return feature

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(query_feature[1])
        features["input_mask"] = create_int_feature(query_feature[2])
        features["segment_ids"] = create_int_feature(query_feature[3])
        # Each candidate gets its own feature columns; the original code
        # mistakenly wrote the query features into all four slots.
        features["input_ids1"] = create_int_feature(cand1_feature[1])
        features["input_mask1"] = create_int_feature(cand1_feature[2])
        features["segment_ids1"] = create_int_feature(cand1_feature[3])
        features["input_ids2"] = create_int_feature(cand2_feature[1])
        features["input_mask2"] = create_int_feature(cand2_feature[2])
        features["segment_ids2"] = create_int_feature(cand2_feature[3])
        features["input_ids3"] = create_int_feature(cand3_feature[1])
        features["input_mask3"] = create_int_feature(cand3_feature[2])
        features["segment_ids3"] = create_int_feature(cand3_feature[3])
        features["label_ids"] = create_int_feature([label_id])

        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        writer.write(tf_example.SerializeToString())
    writer.close()
def convert_single_example(self, ex_index, example, label_list, max_seq_length,
                           tokenizer):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[example.label]
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" %
                        " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label: %s (id = %d)" % (example.label, label_id))

    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id)
    return feature
def featurize(self, example: MQAExample, is_training, log=False,
              for_eval=False):
    tokens = []
    input_ids = []
    segment_ids = []
    input_mask = []

    question_tokens = self._tokenizer.tokenize(example.question_text)
    if len(question_tokens) > self.config.max_len1:
        question_tokens = question_tokens[0:self.config.max_len1]

    options_tags = sorted(example.options)
    for op in options_tags:
        op_info = example.options[op]
        op_info_tokens = self._tokenizer.tokenize(op_info)
        if len(op_info_tokens) > self.config.max_len2:
            op_info_tokens = op_info_tokens[0:self.config.max_len2]

        for ev in example.evidences[op]:
            ev_tokens = self._tokenizer.tokenize(ev)
            ev_max_len = (self.config.max_seq_length - len(op_info_tokens) -
                          len(question_tokens) - 4)
            if len(ev_tokens) > ev_max_len:
                ev_tokens = ev_tokens[0:ev_max_len]

            _tokens = []
            _segment_ids = []
            _tokens.append("[CLS]")
            _segment_ids.append(0)
            for t in question_tokens:
                _tokens.append(t)
                _segment_ids.append(0)
            _tokens.append("[SEP]")
            _segment_ids.append(0)
            for t in op_info_tokens:
                _tokens.append(t)
                _segment_ids.append(0)
            _tokens.append("[SEP]")
            _segment_ids.append(0)
            for t in ev_tokens:
                _tokens.append(t)
                _segment_ids.append(1)
            _tokens.append("[SEP]")
            _segment_ids.append(1)

            _input_ids = self._tokenizer.convert_tokens_to_ids(_tokens)
            _input_mask = [1] * len(_input_ids)
            while len(_input_ids) < self.config.max_seq_length:
                _input_ids.append(0)
                _input_mask.append(0)
                _segment_ids.append(0)

            assert len(_input_ids) == self.config.max_seq_length
            assert len(_input_mask) == self.config.max_seq_length
            assert len(_segment_ids) == self.config.max_seq_length

            tokens.append(_tokens)
            input_ids.append(_input_ids)
            input_mask.append(_input_mask)
            segment_ids.append(_segment_ids)

    # Pad up to the maximum number of options; this may be used for the
    # "combination" case.
    padding_num = self.config.max_options_num - len(options_tags)
    while padding_num:
        for _ in range(self.config.evidences_top_k):
            input_ids.append([0] * self.config.max_seq_length)
            input_mask.append([0] * self.config.max_seq_length)
            segment_ids.append([0] * self.config.max_seq_length)
        padding_num -= 1

    answer_ids = None
    answer_ids_raw = None
    if example.type == "0":
        answer_mask = [0] * (2 ** self.config.max_options_num)
        for i in range(len(options_tags)):
            answer_mask[2 ** i] = 1
        if is_training:
            answer_ids = [0] * (2 ** self.config.max_options_num)
            answer_ids_raw = [0] * self.config.max_options_num
            answer_ids[2 ** (options_tags.index(example.answers[0]))] = 1
            answer_ids_raw[options_tags.index(example.answers[0])] = 1
    elif example.type == "1":
        answer_mask = [0] * (2 ** self.config.max_options_num)
        for _, comb_ops in example.combination_options.items():
            index = 0
            for comb_op in comb_ops:
                index += 2 ** (options_tags.index(comb_op))
            answer_mask[index] = 1
        if is_training:
            answer_ids = [0] * (2 ** self.config.max_options_num)
            answer_ids_raw = [0] * self.config.max_options_num
            index = 0
            for comb_op in example.combination_options[example.answers[0]]:
                index += 2 ** (options_tags.index(comb_op))
                answer_ids_raw[options_tags.index(comb_op)] = 1
            answer_ids[index] = 1
    else:
        raise Exception("Not implemented for _type not in ('0', '1').")

    # Flatten the per-evidence lists into single feature lists.
    def flat(x):
        return reduce(lambda a, b: a + b, x)

    tokens = flat(tokens)
    input_ids = flat(input_ids)
    input_mask = flat(input_mask)
    segment_ids = flat(segment_ids)

    if log:
        utils.log("*** Example ***")
        utils.log("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        utils.log("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        utils.log("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        utils.log("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        utils.log("answer_mask: %s" % " ".join([str(x) for x in answer_mask]))
        if is_training:
            utils.log("answer_ids: %s" % " ".join([str(x) for x in answer_ids]))

    features = {
        "task_id": self.config.task_names.index(self.name),
        self.name + "_eid": example.eid,
        "input_ids": input_ids,
        "input_mask": input_mask,
        "segment_ids": segment_ids,
        self.name + "_answer_mask": answer_mask,
    }
    if for_eval:
        features.update({
            self.name + "_options_tags": options_tags,
            self.name + "_combination_options": example.combination_options,
            self.name + "_type": example.type,
        })
    if is_training:
        features.update({
            self.name + "_answer_ids": answer_ids,
            self.name + "_answer_ids_raw": answer_ids_raw,
        })
    return [features]
def write_instance_to_example_files(instances, tokenizer, max_seq_length,
                                    max_predictions_per_seq, output_files):
    """Create TF example files from `TrainingInstance`s."""
    writers = []
    for output_file in output_files:
        writers.append(tf.python_io.TFRecordWriter(output_file))
    writer_index = 0
    total_written = 0
    for (inst_index, instance) in enumerate(instances):
        input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
        input_mask = [1] * len(input_ids)
        segment_ids = list(instance.segment_ids)
        assert len(input_ids) <= max_seq_length

        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        masked_lm_positions = list(instance.masked_lm_positions)
        masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
        masked_lm_weights = [1.0] * len(masked_lm_ids)

        while len(masked_lm_positions) < max_predictions_per_seq:
            masked_lm_positions.append(0)
            masked_lm_ids.append(0)
            masked_lm_weights.append(0.0)

        next_sentence_label = 1 if instance.is_random_next else 0

        features = collections.OrderedDict()
        features['input_ids'] = create_int_feature(input_ids)
        features['input_mask'] = create_int_feature(input_mask)
        features['segment_ids'] = create_int_feature(segment_ids)
        features['masked_lm_positions'] = create_int_feature(masked_lm_positions)
        features['masked_lm_ids'] = create_int_feature(masked_lm_ids)
        features['masked_lm_weights'] = create_float_feature(masked_lm_weights)
        features['next_sentence_labels'] = create_int_feature(
            [next_sentence_label])

        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))

        writers[writer_index].write(tf_example.SerializeToString())
        writer_index = (writer_index + 1) % len(writers)

        total_written += 1

        if inst_index < 20:
            tf.logging.info('*** Example ***')
            tf.logging.info('tokens: %s' % ' '.join(
                [tokenization.printable_text(x) for x in instance.tokens]))
            for feature_name in features.keys():
                feature = features[feature_name]
                values = []
                if feature.int64_list.value:
                    values = feature.int64_list.value
                elif feature.float_list.value:
                    values = feature.float_list.value
                tf.logging.info(
                    '%s: %s' % (feature_name, ' '.join([str(x) for x in values])))

    for writer in writers:
        writer.close()

    tf.logging.info('Wrote %d total instances', total_written)
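# write_instance_to_example_files assumes module-level create_int_feature and
# create_float_feature helpers (and the module's tensorflow import). The
# sketches below follow the standard BERT create_pretraining_data.py helpers
# and are included for reference; the repository's versions may differ.
def create_int_feature(values):
    # Wrap a list of ints as a tf.train.Feature for TFRecord serialization.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))


def create_float_feature(values):
    # Wrap a list of floats as a tf.train.Feature for TFRecord serialization.
    return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))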
def convert_single_pairexample(ex_index, example, max_seq_length, tokenizer):
    """Converts a single `InputExample` into a single `PairInputFeatures`."""
    tokens_a, pos_a = tokenizer.tokenize(example.text_a)
    tokens_b, pos_b = tokenizer.tokenize(example.text_b)
    assert len(tokens_a) == len(pos_a)
    assert len(tokens_b) == len(pos_b)

    if len(tokens_a) > max_seq_length:
        tokens_a = tokens_a[0:max_seq_length]
        pos_a = pos_a[0:max_seq_length]
    if len(tokens_b) > max_seq_length:
        tokens_b = tokens_b[0:max_seq_length]
        pos_b = pos_b[0:max_seq_length]

    input_ids_a, pos_mask_a = tokenizer.convert_tokens_to_ids(tokens_a, pos_a)
    input_ids_b, pos_mask_b = tokenizer.convert_tokens_to_ids(tokens_b, pos_b)
    segment_ids_a = [0] * len(input_ids_a)
    segment_ids_b = [0] * len(input_ids_b)
    input_mask_a = [1] * len(input_ids_a)
    input_mask_b = [1] * len(input_ids_b)

    # Zero-pad up to the sequence length.
    while len(input_ids_a) < max_seq_length:
        input_ids_a.append(0)
        input_mask_a.append(0)
        segment_ids_a.append(0)
        pos_mask_a.append(0)
    while len(input_ids_b) < max_seq_length:
        input_ids_b.append(0)
        input_mask_b.append(0)
        segment_ids_b.append(0)
        pos_mask_b.append(0)

    assert len(input_ids_a) == max_seq_length
    assert len(input_mask_a) == max_seq_length
    assert len(segment_ids_a) == max_seq_length
    assert len(pos_mask_a) == max_seq_length
    assert len(input_ids_b) == max_seq_length
    assert len(input_mask_b) == max_seq_length
    assert len(segment_ids_b) == max_seq_length
    assert len(pos_mask_b) == max_seq_length

    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens_a: %s" % " ".join([
            tokenization.printable_text(x).encode("utf-8").decode(
                "unicode_escape") for x in tokens_a
        ]))
        tf.logging.info("input_ids_a: %s" %
                        " ".join([str(x) for x in input_ids_a]))
        vocab_tokens_a = tokenizer.convert_ids_to_tokens(input_ids_a)
        tf.logging.info("ids_token_a: %s" % " ".join(vocab_tokens_a))
        tf.logging.info("input_mask_a: %s" %
                        " ".join([str(x) for x in input_mask_a]))
        tf.logging.info("pos_mask_a: %s" % " ".join([str(x) for x in pos_mask_a]))
        tf.logging.info("segment_ids_a: %s" %
                        " ".join([str(x) for x in segment_ids_a]))
        tf.logging.info("tokens_b: %s" % " ".join([
            tokenization.printable_text(x).encode("utf-8").decode(
                "unicode_escape") for x in tokens_b
        ]))
        tf.logging.info("input_ids_b: %s" %
                        " ".join([str(x) for x in input_ids_b]))
        vocab_tokens_b = tokenizer.convert_ids_to_tokens(input_ids_b)
        tf.logging.info("ids_token_b: %s" % " ".join(vocab_tokens_b))
        tf.logging.info("input_mask_b: %s" %
                        " ".join([str(x) for x in input_mask_b]))
        tf.logging.info("pos_mask_b: %s" % " ".join([str(x) for x in pos_mask_b]))
        tf.logging.info("segment_ids_b: %s" %
                        " ".join([str(x) for x in segment_ids_b]))
        tf.logging.info("label: %s" % (str(example.label)))

    feature = PairInputFeatures(input_ids_a=input_ids_a,
                                input_mask_a=input_mask_a,
                                segment_ids_a=segment_ids_a,
                                input_ids_b=input_ids_b,
                                input_mask_b=input_mask_b,
                                segment_ids_b=segment_ids_b,
                                label=example.label)
    return feature
def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i

    tokens_a, pos_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    assert len(tokens_a) == len(pos_a)
    if example.text_b:
        tokens_b, pos_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        # _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length:
            tokens_a = tokens_a[0:max_seq_length]
            pos_a = pos_a[0:max_seq_length]

    tokens = tokens_a
    # pos_mask = pos_a
    # segment_ids = []
    # for token in tokens_a:
    #     tokens.append(token)
    #     # segment_ids.append(0)
    if tokens_b:
        tokens.extend(tokens_b)
        # for token in tokens_b:
        #     tokens.append(token)
        #     segment_ids.append(1)
    assert len(tokens_a) == len(pos_a)
    input_ids, pos_mask = tokenizer.convert_tokens_to_ids(tokens_a, pos_a)
    segment_ids = [0] * len(input_ids)
    if tokens_b:
        input_ids_b, pos_mask_b = tokenizer.convert_tokens_to_ids(tokens_b)
        segment_ids_b = [1] * len(input_ids_b)
        input_ids += input_ids_b
        segment_ids += segment_ids_b
        pos_mask += pos_mask_b

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        pos_mask.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(pos_mask) == max_seq_length

    label_id = label_map[example.label]
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        vocab_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        tf.logging.info("ids_token: %s" % " ".join(vocab_tokens))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("pos_mask: %s" % " ".join([str(x) for x in pos_mask]))
        tf.logging.info("segment_ids: %s" %
                        " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label: %s (id = %d)" % (example.label, label_id))

    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            keyword_mask=pos_mask,
                            segment_ids=segment_ids,
                            label_id=label_id)
    return feature
def _add_examples(self, examples, example_failures, paragraph, split):
    paragraph_text = paragraph["context"]
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in paragraph_text:
        if is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

    # def parse(sentence):
    #     """ Parse a sentence and return dependency heads etc. """
    #     doc = nlp(sentence)
    #     heads = []
    #     words = []
    #     for sent in doc.sentences:
    #         heads_tmp = []
    #         num_tmp = sum([len(x) if x else 0 for x in heads])
    #         for word in sent.words:
    #             words.append(word.text)
    #             if word.head == 0:
    #                 heads_tmp.append(0)
    #             else:
    #                 heads_tmp.append(word.head + num_tmp)
    #         heads.append(heads_tmp)
    #     heads = reduce(lambda x, y: x + y, heads)
    #     return heads, words
    #
    # def parse_and_trim(tokens):
    #     """ Take a whitespace-tokenized token list; parse it and adjust the
    #     heads to line up with the input tokens. """
    #     heads, words = parse(" ".join(tokens))
    #     t2w = {}
    #     w2t = {}
    #     ti = 0
    #     wi = 0
    #     last_move = None  # controls alternating pointer moves
    #     while (ti < len(tokens)) and (wi < len(words)):
    #         if tokens[ti] == words[wi]:
    #             t2w[ti] = wi
    #             w2t[wi] = ti
    #             ti += 1
    #             wi += 1
    #             last_move = None
    #         elif tokens[ti] in words[wi]:
    #             t2w[ti] = wi
    #             if wi not in w2t:
    #                 w2t[wi] = ti
    #             ti += 1
    #             last_move = 't'
    #         elif words[wi] in tokens[ti]:
    #             w2t[wi] = ti
    #             if ti not in t2w:
    #                 t2w[ti] = wi
    #             wi += 1
    #             last_move = 'w'
    #         else:
    #             if last_move == 'w':
    #                 ti += 1
    #                 last_move = 't'
    #             elif last_move == 't':
    #                 wi += 1
    #                 last_move = 'w'
    #             else:
    #                 wi += 1
    #                 ti += 1
    #                 last_move = None
    #     heads_ = []
    #     for ti in range(len(tokens)):
    #         wi = t2w.get(ti, None)
    #         if wi is not None:
    #             h = heads[wi]
    #             if h == 0:
    #                 heads_.append(0)
    #             else:
    #                 h_ = w2t.get(h - 1, None)
    #                 if h_ is not None:
    #                     heads_.append(h_ + 1)
    #                 else:
    #                     heads_.append(ti + 1)
    #         else:
    #             heads_.append(ti + 1)
    #     return heads_
    #
    # def heads_2_dep_matrix(heads):
    #     """ Convert dependency heads into a dependency matrix. """
    #     arr = np.diag((1,) * len(heads))
    #     for i, j in enumerate(heads):
    #         if j != 0:
    #             arr[i, j - 1] = 1
    #     while True:  # propagate transitive dependencies
    #         arr1 = np.matmul(arr, arr)
    #         arr1[arr1 > 1] = 1
    #         if (arr1 == arr).all():
    #             break
    #         else:
    #             arr = arr1
    #     return arr

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    # heads = parse_and_trim(doc_tokens)  # dependency heads
    for (i, token) in enumerate(doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = self._tokenizer.tokenize(token)
        for j, sub_token in enumerate(sub_tokens):
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    # heads_piece = []
    # last_orig_index = None
    # for ind in range(len(all_doc_tokens)):
    #     orig_index = tok_to_orig_index[ind]
    #     if orig_index == last_orig_index:
    #         heads_piece.append(ind)
    #     else:
    #         h = heads[orig_index]
    #         if h == 0:
    #             heads_piece.append(0)
    #         else:
    #             heads_piece.append(orig_to_tok_index[h - 1] + 1)
    #     last_orig_index = orig_index
    # all_doc_tokens_dep_mask = heads_2_dep_matrix(heads_piece)

    for qa in paragraph["qas"]:
        qas_id = qa["id"] if "id" in qa else None
        qid = qa["qid"] if "qid" in qa else None
        question_text = qa["question"]
        start_position = None
        end_position = None
        orig_answer_text = None
        is_impossible = False
        plau_answer_text = plau_answer_start_w = plau_answer_end_w = None
        if split == "train":
            if self.v2:
                is_impossible = qa["is_impossible"]
            if not is_impossible:
                if "detected_answers" in qa:  # MRQA format
                    answer = qa["detected_answers"][0]
                    answer_offset = answer["char_spans"][0][0]
                else:  # SQuAD format
                    answer = qa["answers"][0]
                    answer_offset = answer["answer_start"]
                orig_answer_text = answer["text"]
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                if answer_offset + answer_length - 1 >= len(char_to_word_offset):
                    utils.log("End position is out of document!")
                    example_failures[0] += 1
                    continue
                end_position = char_to_word_offset[answer_offset +
                                                   answer_length - 1]

                # Only add answers where the text can be exactly recovered from
                # the document. If this CAN'T happen it's likely due to weird
                # Unicode stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is NOT
                # guaranteed to be preserved.
                actual_text = " ".join(
                    doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(
                    tokenization.whitespace_tokenize(orig_answer_text))
                actual_text = actual_text.lower()
                cleaned_answer_text = cleaned_answer_text.lower()
                if actual_text.find(cleaned_answer_text) == -1:
                    utils.log("Could not find answer: '{:}' in doc vs. "
                              "'{:}' in provided answer".format(
                                  tokenization.printable_text(actual_text),
                                  tokenization.printable_text(cleaned_answer_text)))
                    example_failures[0] += 1
                    continue
            else:
                start_position = -1
                end_position = -1
                orig_answer_text = ""
                plausible_answers = qa.get("plausible_answers", None)
                if plausible_answers:
                    plau_answer_text = plausible_answers[0]["text"]
                    plau_answer_start = plausible_answers[0]["answer_start"]
                    plau_answer_length = len(plau_answer_text)
                    if plau_answer_start + plau_answer_length - 1 >= len(
                            char_to_word_offset):
                        tf.logging.warning("plausible answer error, pass.")
                        plau_answer_text = plau_answer_start_w = plau_answer_end_w = None
                    else:
                        plau_answer_start_w = char_to_word_offset[plau_answer_start]
                        plau_answer_end_w = char_to_word_offset[
                            plau_answer_start + plau_answer_length - 1]
                        actual_text = " ".join(
                            doc_tokens[plau_answer_start_w:(plau_answer_end_w + 1)])
                        cleaned_answer_text = " ".join(
                            tokenization.whitespace_tokenize(plau_answer_text))
                        actual_text = actual_text.lower()
                        cleaned_answer_text = cleaned_answer_text.lower()
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning("plausible answer error, pass.")
                            plau_answer_text = plau_answer_start_w = plau_answer_end_w = None

        example = QAExample(
            task_name=self.name,
            eid=len(examples),
            qas_id=qas_id,
            qid=qid,
            question_text=question_text,
            doc_tokens=doc_tokens,
            orig_answer_text=orig_answer_text,
            start_position=start_position,
            end_position=end_position,
            is_impossible=is_impossible,
            all_doc_tokens=all_doc_tokens,
            orig_to_tok_index=orig_to_tok_index,
            tok_to_orig_index=tok_to_orig_index,
            # all_doc_tokens_dep_mask=all_doc_tokens_dep_mask,
            plau_answer_start=plau_answer_start_w,
            plau_answer_text=plau_answer_text,
            plau_answer_end=plau_answer_end_w,
        )
        examples.append(example)
def featurize(self, example: InputExample, is_training, log=False):
    """Turn an InputExample into a dict of features."""
    tokens_a = self._tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = self._tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, self.config.max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > self.config.max_seq_length - 2:
            tokens_a = tokens_a[0:(self.config.max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it
    # makes it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

    input_ids = self._tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < self.config.max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == self.config.max_seq_length
    assert len(input_mask) == self.config.max_seq_length
    assert len(segment_ids) == self.config.max_seq_length

    if log:
        utils.log("  Example {:}".format(example.eid))
        utils.log("  tokens: {:}".format(" ".join(
            [tokenization.printable_text(x) for x in tokens])))
        utils.log("  input_ids: {:}".format(" ".join(map(str, input_ids))))
        utils.log("  input_mask: {:}".format(" ".join(map(str, input_mask))))
        utils.log("  segment_ids: {:}".format(" ".join(map(str, segment_ids))))

    eid = example.eid
    features = {
        "input_ids": input_ids,
        "input_mask": input_mask,
        "segment_ids": segment_ids,
        "task_id": self.config.task_names.index(self.name),
        self.name + "_eid": eid,
    }
    self._add_features(features, example, log)
    return features
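# Several featurizers above call _truncate_seq_pair, which is not included in
# this section. A reference sketch matching the standard BERT implementation
# (the repository's version may differ):
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to a maximum total length."""
    # Trim one token at a time from the longer sequence so that both
    # sequences keep a roughly equal share of the length budget.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()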
def _add_examples(self, examples, example_failures, paragraph, split):
    paragraph_text = paragraph["context"]
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in paragraph_text:
        if is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = self._tokenizer.tokenize(token)
        for j, sub_token in enumerate(sub_tokens):
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    for qa in paragraph["qas"]:
        qas_id = qa["id"] if "id" in qa else None
        qid = qa["qid"] if "qid" in qa else None
        question_text = qa["question"]
        start_position = None
        end_position = None
        orig_answer_text = None
        is_impossible = False
        plau_answer_text = plau_answer_start_w = plau_answer_end_w = None
        if split == "train":
            if self.v2:
                is_impossible = qa["is_impossible"]
            if not is_impossible:
                if "detected_answers" in qa:  # MRQA format
                    answer = qa["detected_answers"][0]
                    answer_offset = answer["char_spans"][0][0]
                else:  # SQuAD format
                    answer = qa["answers"][0]
                    answer_offset = answer["answer_start"]
                orig_answer_text = answer["text"]
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                if answer_offset + answer_length - 1 >= len(char_to_word_offset):
                    utils.log("End position is out of document!")
                    example_failures[0] += 1
                    continue
                end_position = char_to_word_offset[answer_offset +
                                                   answer_length - 1]

                # Only add answers where the text can be exactly recovered from
                # the document. If this CAN'T happen it's likely due to weird
                # Unicode stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is NOT
                # guaranteed to be preserved.
                actual_text = " ".join(
                    doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(
                    tokenization.whitespace_tokenize(orig_answer_text))
                actual_text = actual_text.lower()
                cleaned_answer_text = cleaned_answer_text.lower()
                if actual_text.find(cleaned_answer_text) == -1:
                    utils.log("Could not find answer: '{:}' in doc vs. "
                              "'{:}' in provided answer".format(
                                  tokenization.printable_text(actual_text),
                                  tokenization.printable_text(cleaned_answer_text)))
                    example_failures[0] += 1
                    continue
            else:
                start_position = -1
                end_position = -1
                orig_answer_text = ""
                plausible_answers = qa.get("plausible_answers", None)
                if plausible_answers:
                    plau_answer_text = plausible_answers[0]["text"]
                    plau_answer_start = plausible_answers[0]["answer_start"]
                    plau_answer_length = len(plau_answer_text)
                    if plau_answer_start + plau_answer_length - 1 >= len(
                            char_to_word_offset):
                        tf.logging.warning("plausible answer error, pass.")
                        plau_answer_text = plau_answer_start_w = plau_answer_end_w = None
                    else:
                        plau_answer_start_w = char_to_word_offset[plau_answer_start]
                        plau_answer_end_w = char_to_word_offset[
                            plau_answer_start + plau_answer_length - 1]
                        actual_text = " ".join(
                            doc_tokens[plau_answer_start_w:(plau_answer_end_w + 1)])
                        cleaned_answer_text = " ".join(
                            tokenization.whitespace_tokenize(plau_answer_text))
                        actual_text = actual_text.lower()
                        cleaned_answer_text = cleaned_answer_text.lower()
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning("plausible answer error, pass.")
                            plau_answer_text = plau_answer_start_w = plau_answer_end_w = None

        example = QAExample(
            task_name=self.name,
            eid=len(examples),
            qas_id=qas_id,
            qid=qid,
            question_text=question_text,
            doc_tokens=doc_tokens,
            orig_answer_text=orig_answer_text,
            start_position=start_position,
            end_position=end_position,
            is_impossible=is_impossible,
            all_doc_tokens=all_doc_tokens,
            orig_to_tok_index=orig_to_tok_index,
            tok_to_orig_index=tok_to_orig_index,
            plau_answer_start=plau_answer_start_w,
            plau_answer_text=plau_answer_text,
            plau_answer_end=plau_answer_end_w,
        )
        examples.append(example)