def __repr__(self):
    s = ""
    s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    if self.start_position:
        s += ", start_position: %d" % (self.start_position)
    if self.end_position:
        s += ", end_position: %d" % (self.end_position)
    return s
def __str__(self):
    s = ""
    s += "tokens: %s\n" % (" ".join(
        [tokenization.printable_text(x) for x in self.tokens]))
    s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
    s += "is_random_next: %s\n" % self.is_random_next
    s += "masked_lm_positions: %s\n" % (" ".join(
        [str(x) for x in self.masked_lm_positions]))
    s += "masked_lm_labels: %s\n" % (" ".join(
        [tokenization.printable_text(x) for x in self.masked_lm_labels]))
    s += "\n"
    return s
def __repr__(self):
    s = ""
    s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    if self.start_position:
        s += ", start_position: %d" % (self.start_position)
    if self.end_position:
        s += ", end_position: %d" % (self.end_position)
    if self.history_answer_marker:
        s += ', history_answer_marker: {}'.format(json.dumps(self.history_answer_marker))
    if self.metadata:
        s += ', metadata: ' + json.dumps(self.metadata)
    return s
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    labellist = example.label.split(' ')
    # Note: label ids start from 1 here, because sentences shorter than
    # max_seq_length are padded and the padded positions get label id 0.
    label_map = {}
    for (i, label) in enumerate(label_list, 1):
        label_map[label] = i
    # Convert the example into token ids, truncating to fit [CLS] and [SEP].
    if len(example.text_a) > max_seq_length - 2:
        example.text_a = example.text_a[0:(max_seq_length - 2)]
        labellist = labellist[0:(max_seq_length - 2)]
    tokens_a = tokenizer.tokenize(example.text_a)
    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    #
    # If the input were a sentence pair, the special tokens for the second
    # sentence would have to be added here as well.
    tokens = []
    segment_ids = []
    label_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    label_ids.append(label_map["[CLS]"])
    for i, token in enumerate(tokens_a):
        tokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labellist[i]])
    tokens.append("[SEP]")
    segment_ids.append(0)
    label_ids.append(label_map["[SEP]"])
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)
    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        # Labels of padding positions are never used, so 0 is fine.
        label_ids.append(0)
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
    # Create an InputFeatures instance and return it.
    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_ids)
    return feature
def convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, output_fn): """Loads a data file into a list of `InputBatch`s.""" unique_id = 1000000000 for (example_index, example) in enumerate(examples): query_tokens = tokenizer.tokenize(example.question_text) if len(query_tokens) > max_query_length: query_tokens = query_tokens[0:max_query_length] tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] for (i, token) in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) tok_start_position = None tok_end_position = None if is_training and example.is_impossible: tok_start_position = -1 tok_end_position = -1 if is_training and not example.is_impossible: tok_start_position = orig_to_tok_index[example.start_position] if example.end_position < len(example.doc_tokens) - 1: tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 else: tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.orig_answer_text) # The -3 accounts for [CLS], [SEP] and [SEP] max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 # We can have documents that are longer than the maximum sequence length. # To deal with this we do a sliding window approach, where we take chunks # of the up to our max length with a stride of `doc_stride`. _DocSpan = collections.namedtuple( # pylint: disable=invalid-name "DocSpan", ["start", "length"]) doc_spans = [] start_offset = 0 while start_offset < len(all_doc_tokens): length = len(all_doc_tokens) - start_offset if length > max_tokens_for_doc: length = max_tokens_for_doc doc_spans.append(_DocSpan(start=start_offset, length=length)) if start_offset + length == len(all_doc_tokens): break start_offset += min(length, doc_stride) for (doc_span_index, doc_span) in enumerate(doc_spans): tokens = [] token_to_orig_map = {} token_is_max_context = {} segment_ids = [] tokens.append("[CLS]") segment_ids.append(0) for token in query_tokens: tokens.append(token) segment_ids.append(0) tokens.append("[SEP]") segment_ids.append(0) for i in range(doc_span.length): split_token_index = doc_span.start + i token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index) token_is_max_context[len(tokens)] = is_max_context tokens.append(all_doc_tokens[split_token_index]) segment_ids.append(1) tokens.append("[SEP]") segment_ids.append(1) input_ids = tokenizer.convert_tokens_to_ids(tokens) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1] * len(input_ids) # Zero-pad up to the sequence length. while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length start_position = None end_position = None if is_training and not example.is_impossible: # For training, if our document chunk does not contain an annotation # we throw it out, since there is nothing to predict. 
doc_start = doc_span.start doc_end = doc_span.start + doc_span.length - 1 out_of_span = False if not (tok_start_position >= doc_start and tok_end_position <= doc_end): out_of_span = True if out_of_span: start_position = 0 end_position = 0 else: doc_offset = len(query_tokens) + 2 start_position = tok_start_position - doc_start + doc_offset end_position = tok_end_position - doc_start + doc_offset if is_training and example.is_impossible: start_position = 0 end_position = 0 if example_index < 20: tf.logging.info("*** Example ***") tf.logging.info("unique_id: %s" % (unique_id)) tf.logging.info("example_index: %s" % (example_index)) tf.logging.info("doc_span_index: %s" % (doc_span_index)) tf.logging.info("tokens: %s" % " ".join( [tokenization.printable_text(x) for x in tokens])) tf.logging.info("token_to_orig_map: %s" % " ".join( ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)])) tf.logging.info("token_is_max_context: %s" % " ".join([ "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context) ])) tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) tf.logging.info( "input_mask: %s" % " ".join([str(x) for x in input_mask])) tf.logging.info( "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) if is_training and example.is_impossible: tf.logging.info("impossible example") if is_training and not example.is_impossible: answer_text = " ".join(tokens[start_position:(end_position + 1)]) tf.logging.info("start_position: %d" % (start_position)) tf.logging.info("end_position: %d" % (end_position)) tf.logging.info( "answer: %s" % (tokenization.printable_text(answer_text))) feature = InputFeatures( unique_id=unique_id, example_index=example_index, doc_span_index=doc_span_index, tokens=tokens, token_to_orig_map=token_to_orig_map, token_is_max_context=token_is_max_context, input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, start_position=start_position, end_position=end_position, is_impossible=example.is_impossible) # Run callback output_fn(feature) unique_id += 1
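# The span conversion above relies on an _improve_answer_span helper to tighten the
# answer boundaries at the wordpiece level. The sketch below follows the reference
# BERT SQuAD preprocessing; treat it as an illustration rather than the exact helper
# defined in this repository.
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
                         orig_answer_text):
    """Returns a tokenized answer span that better matches the annotated answer."""
    # The annotated answer may be a sub-range of a whitespace-delimited token,
    # e.g. the answer "1895" inside the token "(1895-1943)". Re-tokenize the
    # original answer text and look for a sub-span of the document tokens that
    # matches it exactly; fall back to the original span otherwise.
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)
    return (input_start, input_end)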
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, mode):
    ######################## important
    label_map = {}
    for (i, label) in enumerate(label_list, 1):
        label_map[label] = i
    label2idpath = './output/label2id.pkl'
    if not os.path.exists(label2idpath):
        with open(label2idpath, 'wb') as w:
            pickle.dump(label_map, w)
    textlist = list(example.text)
    labellist = list(example.label)
    tokens = []
    labels = []
    # Record the positions in textlist that were tokenized to [UNK], so the
    # original characters can be restored later.
    unknow_index = []
    for i, word in enumerate(textlist):
        token = tokenizer.tokenize(word)
        tokens.extend(token)
        label_1 = labellist[i]
        for m in range(len(token)):
            if m == 0:
                labels.append(label_1)
            else:
                labels.append("X")
            if token[m] == "[UNK]":
                unknow_index.append(i)
    assert len(tokens) == len(labels)
    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:(max_seq_length - 2)]
        labels = labels[0:(max_seq_length - 2)]
    ntokens = []
    segment_ids = []
    label_ids = []
    ntokens.append("[CLS]")
    segment_ids.append(0)
    label_ids.append(label_map["[CLS]"])
    for i, token in enumerate(tokens):
        ntokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labels[i]])
    ntokens.append("[SEP]")
    segment_ids.append(0)
    label_ids.append(label_map["[SEP]"])
    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    input_mask = [1] * len(input_ids)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)
        ntokens.append("**NULL**")
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info(
            "tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_ids,
    )
    # Restore the original characters for the [UNK] tokens.
    output_tokens = []
    for i, each in enumerate(ntokens):
        if each != "[UNK]":
            output_tokens.append(each)
        else:
            index = unknow_index[0]
            output_tokens.append(textlist[index])
            unknow_index = unknow_index[1:]
    write_tokens(output_tokens, mode)
    return feature  # an InputFeatures instance
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer): """Converts a single `InputExample` into a single `InputFeatures`.""" if isinstance(example, PaddingInputExample): return InputFeatures(input_ids=[0] * max_seq_length, input_mask=[0] * max_seq_length, segment_ids=[0] * max_seq_length, label_id=0, is_real_example=False) label_map = {} for (i, label) in enumerate(label_list): label_map[label] = i tokens_a = tokenizer.tokenize(example.text_a) tokens_b = None if example.text_b: tokens_b = tokenizer.tokenize(example.text_b) if tokens_b: # Modifies `tokens_a` and `tokens_b` in place so that the total # length is less than the specified length. # Account for [CLS], [SEP], [SEP] with "- 3" _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) else: # Account for [CLS] and [SEP] with "- 2" if len(tokens_a) > max_seq_length - 2: tokens_a = tokens_a[0:(max_seq_length - 2)] # The convention in BERT is: # (a) For sequence pairs: # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 # (b) For single sequences: # tokens: [CLS] the dog is hairy . [SEP] # type_ids: 0 0 0 0 0 0 0 # # Where "type_ids" are used to indicate whether this is the first # sequence or the second sequence. The embedding vectors for `type=0` and # `type=1` were learned during pre-training and are added to the wordpiece # embedding vector (and position vector). This is not *strictly* necessary # since the [SEP] token unambiguously separates the sequences, but it makes # it easier for the model to learn the concept of sequences. # # For classification tasks, the first vector (corresponding to [CLS]) is # used as the "sentence vector". Note that this only makes sense because # the entire model is fine-tuned. tokens = [] segment_ids = [] tokens.append("[CLS]") segment_ids.append(0) for token in tokens_a: tokens.append(token) segment_ids.append(0) tokens.append("[SEP]") segment_ids.append(0) if tokens_b: for token in tokens_b: tokens.append(token) segment_ids.append(1) tokens.append("[SEP]") segment_ids.append(1) input_ids = tokenizer.convert_tokens_to_ids(tokens) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1] * len(input_ids) # Zero-pad up to the sequence length. while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length label_id = label_map[example.label] if ex_index < 5: tf.logging.info("*** Example ***") tf.logging.info("guid: %s" % (example.guid)) tf.logging.info( "tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens])) tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) feature = InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id, is_real_example=True) return feature
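# A minimal usage sketch for the classification-style convert_single_example above
# (the variant that returns a single label_id). The vocab path, label list, and
# example fields are illustrative placeholders, and InputExample and
# tokenization.FullTokenizer are assumed to follow the standard BERT
# run_classifier.py / tokenization.py interfaces.
tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
example = InputExample(guid="dev-1", text_a="the dog is hairy .", text_b=None, label="0")
feature = convert_single_example(
    ex_index=0, example=example, label_list=["0", "1"],
    max_seq_length=128, tokenizer=tokenizer)
print(feature.input_ids[:16], feature.label_id)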
def convert_single_example(ex_index, example, label_map, max_seq_length, tokenizer, mode):
    textlist = example.text.split(' ')
    labellist = example.label.split(' ')
    tokens = []
    labels = []
    # print(textlist)
    for i, word in enumerate(textlist):
        token = tokenizer.tokenize(word)
        # print(token)
        tokens.extend(token)
        label_1 = labellist[i]
        # print(label_1)
        for m in range(len(token)):
            if m == 0:
                labels.append(label_1)
            else:
                labels.append("X")
    # print(tokens, labels)
    # tokens = tokenizer.tokenize(example.text)
    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:(max_seq_length - 2)]
        labels = labels[0:(max_seq_length - 2)]
    ntokens = []
    segment_ids = []
    label_ids = []
    ntokens.append("[CLS]")
    segment_ids.append(0)
    # append("O") or append("[CLS]")? Not sure.
    label_ids.append(label_map["[CLS]"])
    for i, token in enumerate(tokens):
        ntokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labels[i]])
    ntokens.append("[SEP]")
    segment_ids.append(0)
    # append("O") or append("[SEP]")? Not sure.
    label_ids.append(label_map["[SEP]"])
    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    input_mask = [1] * len(input_ids)
    # label_mask = [1] * len(input_ids)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        # we don't care about the label of padding positions
        label_ids.append(0)
        ntokens.append("**NULL**")
        # label_mask.append(0)
    # print(len(input_ids))
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    # assert len(label_mask) == max_seq_length
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info(
            "tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
        # tf.logging.info("label_mask: %s" % " ".join([str(x) for x in label_mask]))
    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_ids,
        # label_mask=label_mask
    )
    write_tokens(ntokens, mode)
    return feature
def convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, return_answers, skip_no_answer, verbose=False, save_with_prob=False, msg="Converting examples"): """Loads a data file into a list of `InputBatch`s.""" unique_id = 1000000000 features = [] question_features = [] for (example_index, example) in enumerate(tqdm(examples, desc=msg)): # Tokenize query into (sub)tokens query_tokens = tokenizer.tokenize(example.question_text) if len(query_tokens) > max_query_length: query_tokens = query_tokens[0:max_query_length] # Creating a map between word <=> (sub)token tok_to_word_index = [] word_to_tok_index = [] # word to (start of) subtokens all_doc_tokens = [] for (i, word) in enumerate(example.doc_words): word_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(word) for sub_token in sub_tokens: tok_to_word_index.append(i) all_doc_tokens.append(sub_token) # The -2 accounts for [CLS], [SEP] max_tokens_for_doc = max_seq_length - 2 # Split sequence by max_seq_len with doc_stride, _DocSpan is based on tokens without [CLS], [SEP] _DocSpan = collections.namedtuple( # pylint: disable=invalid-name "DocSpan", ["start", "length"]) doc_spans = [] start_tok_offset = 0 # From all_doc_tokens # Get doc_spans with stride and offset while start_tok_offset < len(all_doc_tokens): length = len(all_doc_tokens) - start_tok_offset if length > max_tokens_for_doc: length = max_tokens_for_doc doc_spans.append(_DocSpan(start=start_tok_offset, length=length)) if start_tok_offset + length == len(all_doc_tokens): break start_tok_offset += min( length, doc_stride) # seems to prefer doc_stride always assert doc_stride < length, "length is no larger than doc_stride for {}".format( doc_spans) # Iterate each doc_span and make out_tokens for (doc_span_index, doc_span) in enumerate(doc_spans): # Find answer position based on new out_tokens start_position = None end_position = None # For no_answer, same (-1, -1) applies if example.start_position is not None and example.start_position < 0: assert example.start_position == -1 and example.end_position == -1 start_position, end_position = NO_ANS, NO_ANS # For existing answers, find answers if exist elif return_answers: # Get token-level start/end position tok_start_position = word_to_tok_index[example.start_position] if example.end_position < len(example.doc_words) - 1: tok_end_position = word_to_tok_index[ example.end_position + 1] - 1 # By backwarding from next word else: assert example.end_position == len(example.doc_words) - 1 tok_end_position = len(all_doc_tokens) - 1 # Improve answer span by subword-level (tok_start_position, tok_end_position) = _improve_answer_span( all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.orig_answer_text) # Throw away training samples without answers (due to doc_span split) doc_start = doc_span.start doc_end = doc_span.start + doc_span.length - 1 if (tok_start_position < doc_start or tok_end_position < doc_start or tok_start_position > doc_end or tok_end_position > doc_end): if skip_no_answer: continue else: # For NQ, only add this in 2% (50 times downsample) if save_with_prob: if np.random.randint(100) < 2: start_position, end_position = NO_ANS, NO_ANS else: continue else: start_position, end_position = NO_ANS, NO_ANS # Training samples with answers else: doc_offset = 1 # For [CLS] start_position = tok_start_position - doc_start + doc_offset end_position = tok_end_position - doc_start + doc_offset assert start_position >= 0 and end_position >= 0, ( start_position, 
end_position) out_tokens = [] # doc out_tokens_ = [] # quesry out_tokens.append("[CLS]") out_tokens_.append("[CLS]") token_to_word_map = { } # The difference with tok_to_word_index is it includes special tokens token_is_max_context = {} # For query tokens, just copy and add [SEP] for token in query_tokens: out_tokens_.append(token) out_tokens_.append("[SEP]") # For each doc token, create token_to_word_map and is_max_context, and add to out_tokens for i in range(doc_span.length): split_token_index = doc_span.start + i token_to_word_map[len( out_tokens)] = tok_to_word_index[split_token_index] is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index) token_is_max_context[len(out_tokens)] = is_max_context out_tokens.append(all_doc_tokens[split_token_index]) out_tokens.append("[SEP]") # Convert to ids and masks input_ids = tokenizer.convert_tokens_to_ids(out_tokens) input_ids_ = tokenizer.convert_tokens_to_ids(out_tokens_) input_mask = [1] * len(input_ids) input_mask_ = [1] * len(input_ids_) # Zero-pad up to the sequence length. while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length while len( input_ids_) < max_query_length + 2: # +2 for [CLS], [SEP] input_ids_.append(0) input_mask_.append(0) assert len(input_ids_) == max_query_length + 2 assert len(input_mask_) == max_query_length + 2 # Printing for debug if example_index < 1 and verbose: logger.info("*** Example ***") logger.info("unique_id: %s" % (unique_id)) logger.info("example_index: %s" % (example_index)) logger.info("doc_span_index: %s" % (doc_span_index)) logger.info("tokens: %s" % " ".join( [tokenization.printable_text(x) for x in out_tokens])) logger.info("q tokens: %s" % " ".join( [tokenization.printable_text(x) for x in out_tokens_])) logger.info("token_to_word_map: %s" % " ".join([ "%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_word_map) ])) logger.info("token_is_max_context: %s" % " ".join([ "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context) ])) logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) if return_answers: answer_text = " ".join( out_tokens[start_position:(end_position + 1)]) logger.info("start_position: %d" % (start_position)) logger.info("end_position: %d" % (end_position)) logger.info("answer: %s" % (tokenization.printable_text(answer_text))) # Append feature features.append( ContextFeatures(unique_id=unique_id, example_index=example_index, doc_span_index=doc_span_index, tokens=out_tokens, token_to_word_map=token_to_word_map, token_is_max_context=token_is_max_context, input_ids=input_ids, input_mask=input_mask, start_position=start_position, end_position=end_position)) question_features.append( QuestionFeatures(unique_id=unique_id, example_index=example_index, tokens_=out_tokens_, input_ids=input_ids_, input_mask=input_mask_)) # Check validity of answer if return_answers: if start_position <= NO_ANS: assert start_position == NO_ANS and end_position == NO_ANS, ( start_position, end_position) else: assert out_tokens[start_position:end_position+1] == \ all_doc_tokens[tok_start_position:tok_end_position+1] orig_text, start_pos, end_pos = get_final_text_( example, features[-1], start_position, end_position, True, False) phrase = orig_text[start_pos:end_pos] try: assert phrase == example.orig_answer_text except Exception as e: # print('diff ans [%s]/[%s]'%(phrase, 
example.orig_answer_text)) pass unique_id += 1 return features, question_features
def write_instance_to_example_files(instances, tokenizer, max_seq_length, max_predictions_per_seq, outputs, log_fn, report_fn): '''Create TF example files from `TrainingInstance`s''' tf_examples = [] writers = [] for _output in outputs: writers.append(tf.io.TFRecordWriter(_output)) writer_index = 0 total_written = 0 for (inst_index, instance) in enumerate(instances): input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) input_mask = [1] * len(input_ids) segment_ids = list(instance.segment_ids) assert len(input_ids) <= max_seq_length while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length masked_lm_positions = list(instance.masked_lm_positions) masked_lm_ids = tokenizer.convert_tokens_to_ids( instance.masked_lm_labels) masked_lm_weights = [1.0] * len(masked_lm_ids) while len(masked_lm_positions) < max_predictions_per_seq: masked_lm_positions.append(0) masked_lm_ids.append(0) masked_lm_weights.append(0.0) next_sentence_label = 1 if instance.is_random_next else 0 features = collections.OrderedDict() features['input_ids'] = create_int_feature(input_ids) features['input_mask'] = create_int_feature(input_mask) features['segment_ids'] = create_int_feature(segment_ids) features['masked_lm_positions'] = create_int_feature( masked_lm_positions) features['masked_lm_ids'] = create_int_feature(masked_lm_ids) features['masked_lm_weights'] = create_float_feature(masked_lm_weights) features['next_sentence_labels'] = create_int_feature( [next_sentence_label]) tf_example = tf.train.Example(features=tf.train.Features( feature=features)) tf_examples.append(tf_example) writers[writer_index].write(tf_example.SerializeToString()) writer_index = (writer_index + 1) % len(writers) total_written += 1 # DEMO if inst_index < constants.INSTANCE_DEMO_SIZE: log_fn('### Example') log_fn('tokens: {}'.format(' '.join( [tokenization.printable_text(x) for x in instance.tokens]))) for feature_name in features.keys(): feature = features[feature_name] values = [] if feature.int64_list.value: values = feature.int64_list.value elif feature.float_list.value: values = feature.float_list.value log_fn('{}: {}'.format(feature_name, ' '.join([str(x) for x in values]))) for writer in writers: writer.close() log_fn('### Generate TF example') log_fn('# {} instances'.format(total_written)) report_fn('[INFO] TF example: {} instances'.format(total_written)) return tf_examples
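# write_instance_to_example_files above depends on create_int_feature and
# create_float_feature helpers. A minimal sketch, assuming the usual
# tf.train.Feature wrappers used by the BERT pretraining scripts:
def create_int_feature(values):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))


def create_float_feature(values):
    return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))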
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, mode): label_map = {} for (i, label) in enumerate(label_list, 1): label_map[label] = i with open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'wb') as w: pickle.dump(label_map, w) textlist = example.text.split(' ') labellist = example.label.split(' ') gztrs_list = example.gazetteer.split(' ') tokens = [] labels = [] gazetteers = [] for i, word in enumerate(textlist): token = tokenizer.tokenize(word) tokens.extend(token) label_1 = labellist[i] gztr = gztrs_list[i] for m in range(len(token)): if token[m] == '': assert False == True if m == 0: labels.append(label_1) gazetteers.append(gztr) else: labels.append("X") gazetteers.append("X") if len(tokens) >= max_seq_length - 1: tokens = tokens[0:(max_seq_length - 2)] labels = labels[0:(max_seq_length - 2)] gazetteers = gazetteers[0:(max_seq_length - 2)] ntokens = [] segment_ids = [] label_ids = [] gazetteer_ids = [] ntokens.append("[CLS]") segment_ids.append(0) label_ids.append(label_map["[CLS]"]) gazetteer_ids.append(label_map["[CLS]"]) for i, token in enumerate(tokens): ntokens.append(token) segment_ids.append(0) label_ids.append(label_map[labels[i]]) gazetteer_ids.append(label_map[gazetteers[i]]) ntokens.append("[SEP]") segment_ids.append(0) label_ids.append(label_map["[SEP]"]) gazetteer_ids.append(label_map["[SEP]"]) input_ids = tokenizer.convert_tokens_to_ids(ntokens) input_mask = [1] * len(input_ids) length = min(FLAGS.max_seq_length, len(label_ids)) while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) label_ids.append(0) gazetteer_ids.append(0) ntokens.append("**NULL**") assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length assert len(label_ids) == max_seq_length if ex_index < 5: tf.logging.info("*** Example ***") tf.logging.info("guid: %s" % (example.guid)) tf.logging.info( "tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens])) tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids])) feature = InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_ids=label_ids, gazetteer=gazetteer_ids, length=length) write_tokens(ntokens, mode) return feature
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, output_file): """Loads a data file into a list of `InputBatch`s.""" label_map = {} for (i, label) in enumerate(label_list): label_map[label] = i writer = tf.python_io.TFRecordWriter(output_file) for (ex_index, example) in enumerate(examples): tokens_a = tokenizer.tokenize(example.text_a) if ex_index % 10000 == 0: tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) tokens_b = None if example.text_b: tokens_b = tokenizer.tokenize(example.text_b) if tokens_b: # Modifies `tokens_a` and `tokens_b` in place so that the total # length is less than the specified length. # Account for [CLS], [SEP], [SEP] with "- 3" _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) else: # Account for [CLS] and [SEP] with "- 2" if len(tokens_a) > max_seq_length - 2: tokens_a = tokens_a[0:(max_seq_length - 2)] # The convention in BERT is: # (a) For sequence pairs: # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 # (b) For single sequences: # tokens: [CLS] the dog is hairy . [SEP] # type_ids: 0 0 0 0 0 0 0 # # Where "type_ids" are used to indicate whether this is the first # sequence or the second sequence. The embedding vectors for `type=0` and # `type=1` were learned during pre-training and are added to the wordpiece # embedding vector (and position vector). This is not *strictly* necessary # since the [SEP] token unambiguously separates the sequences, but it makes # it easier for the model to learn the concept of sequences. # # For classification tasks, the first vector (corresponding to [CLS]) is # used as as the "sentence vector". Note that this only makes sense because # the entire model is fine-tuned. tokens = [] segment_ids = [] tokens.append("[CLS]") segment_ids.append(0) for token in tokens_a: tokens.append(token) segment_ids.append(0) tokens.append("[SEP]") segment_ids.append(0) if tokens_b: for token in tokens_b: tokens.append(token) segment_ids.append(1) tokens.append("[SEP]") segment_ids.append(1) input_ids = tokenizer.convert_tokens_to_ids(tokens) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1] * len(input_ids) # Zero-pad up to the sequence length. 
while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length label_id = label_map[example.label] if ex_index < 5: tf.logging.info("*** Example ***") tf.logging.info("guid: %s" % (example.guid)) tf.logging.info("tokens: %s" % " ".join( [tokenization.printable_text(x) for x in tokens])) tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) tf.logging.info( "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) def create_int_feature(values): feature = tf.train.Feature( int64_list=tf.train.Int64List(value=list(values))) return feature features = collections.OrderedDict() features["input_ids"] = create_int_feature(input_ids) features["input_mask"] = create_int_feature(input_mask) features["segment_ids"] = create_int_feature(segment_ids) features["label_ids"] = create_int_feature([label_id]) tf_example = tf.train.Example(features=tf.train.Features(feature=features)) writer.write(tf_example.SerializeToString())
def predict_from_model(self, task):
    ans = []
    text = task["text"]
    text = text.replace("?", ".")
    text = text.replace("!", ".")
    text = text.replace("<…>", "[MASK]")
    text = text.replace("<...>", "[MASK]")
    text = text.replace("...", "[MASK]")
    text = text.replace("…", "[MASK]")
    key = " "
    for var in self.option:
        if isinstance(var, list):
            if var[0] in text and var[1] in text:
                key = var[0] + var[1]
                break
        else:
            if var in text:
                key = var
                break
    sentences = text.split('.')
    second_sen = ""
    cnt = 0
    for sen in sentences:
        if "[MASK]" in sen:
            second_sen = sen[4:]
            break
        cnt += 1
    first_sen = sentences[cnt - 1][4:]
    first_sen += '.'
    second_sen += '.'
    sentence = first_sen + ' ' + second_sen
    # strip the extra spaces around the mask
    sentence = sentence.replace(' [MASK] ', '[MASK]')
    sentence = sentence.replace('[MASK] ', '[MASK]')
    sentence = sentence.replace(' [MASK]', '[MASK]')
    sentence = sentence.split('[MASK]')
    tokens = ['[CLS]']
    upper_case = False
    for i in range(len(sentence)):
        if i == 0:
            tokens = tokens + self.tokenizer.tokenize(sentence[i])
        else:
            if tokens[-1] == '.':
                upper_case = True
            tokens = tokens + ['[MASK]'] + self.tokenizer.tokenize(sentence[i])
    tokens = tokens + ['[SEP]']
    token_input = self.tokenizer.convert_tokens_to_ids(tokens)
    token_input = token_input + [0] * (512 - len(token_input))
    mask_input = [0] * 512
    for i in range(len(mask_input)):
        if token_input[i] == 103:  # 103 is the [MASK] id in the BERT vocab
            mask_input[i] = 1
    seg_input = [0] * 512
    token_input = np.asarray([token_input])
    mask_input = np.asarray([mask_input])
    seg_input = np.asarray([seg_input])
    predicts = self.model.predict([token_input, seg_input, mask_input])
    predicts = predicts[0]
    if key == " ":
        vals = np.amax(predicts, axis=-1)
        predicts = np.argmax(predicts, axis=-1)
        predicts = predicts[0][:len(tokens)]
        out = []
        for i in range(len(mask_input[0])):
            # [0][i] because the network returns a batch of shape (1, 512),
            # and our result is in the first element
            if mask_input[0][i] == 1:
                out.append(predicts[i])
        out = self.tokenizer.convert_ids_to_tokens(out)  # ids back to text tokens
        out = ' '.join(out)  # join the tokens into a space-separated string
        out = tokenization.printable_text(out)  # into readable text
        out = out.replace(' ##', '')
        return out.lower()
    else:
        word_list = self.option_to_list[key]
        new_word_list = []
        if upper_case:
            for word in word_list:
                new_word_list.append(word[0].upper() + word[1:])
        else:
            new_word_list = word_list
        # print(new_word_list)
        id_word_list = self.tokenizer.convert_tokens_to_ids(new_word_list)
        ID_prob = []
        for i in range(len(mask_input[0])):
            if mask_input[0][i] == 1:
                for ID in id_word_list:
                    ID_prob.append([
                        predicts[0][i][ID],
                        self.tokenizer.convert_ids_to_tokens([ID])
                    ])
        ID_prob = sorted(ID_prob, key=lambda x: x[0], reverse=True)
        return ID_prob[0][1][0].lower()
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer): """Loads a data file into a list of `InputBatch`s.""" label_map = dict() for label in label_list: label_map[label] = len(label_map) reverse_label_map = dict([(v,k) for (k,v) in label_map.items()]) max_len_in_data = 0 features = [] for (ex_index, example) in enumerate(examples): tokens_a, tokens_map_a = tokenizer.tokenize_with_map(example.text_a) example.text_a_map = tokens_map_a if len(tokens_a) > max_len_in_data: max_len_in_data = len(tokens_a) max_len_in_data_tokens = tokens_a tokens_b = None if example.text_b: tokens_b, tokens_map_b = tokenizer.tokenize_with_map(example.text_b) example.text_b_map = tokens_map_b if tokens_b: # Modifies `tokens_a` and `tokens_b` in place so that the total # length is less than the specified length. # Account for [CLS], [SEP], [SEP] with "- 3" _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) else: # Account for [CLS] and [SEP] with "- 2" if len(tokens_a) > max_seq_length - 2: tokens_a = tokens_a[0:(max_seq_length - 2)] # Account for [SEP] with "-1" # if len(tokens_a) > max_seq_length - 1: # tokens_a = tokens_a[0:(max_seq_length - 1)] # The convention in BERT is: # (a) For sequence pairs: # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 # (b) For single sequences: # tokens: [CLS] the dog is hairy . [SEP] # type_ids: 0 0 0 0 0 0 0 # # Where "type_ids" are used to indicate whether this is the first # sequence or the second sequence. The embedding vectors for `type=0` and # `type=1` were learned during pre-training and are added to the wordpiece # embedding vector (and position vector). This is not *strictly* necessary # since the [SEP] token unambigiously separates the sequences, but it makes # it easier for the model to learn the concept of sequences. # # For classification tasks, the first vector (corresponding to [CLS]) is # used as as the "sentence vector". Note that this only makes sense because # the entire model is fine-tuned. tokens = [] segment_ids = [] # try to not add [CLS] tokens.append("[CLS]") segment_ids.append(0) for token in tokens_a: tokens.append(token) segment_ids.append(0) tokens.append("[SEP]") segment_ids.append(0) if tokens_b: for token in tokens_b: tokens.append(token) segment_ids.append(1) tokens.append("[SEP]") segment_ids.append(1) input_ids = tokenizer.convert_tokens_to_ids(tokens) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1] * len(input_ids) label_ids = [label_map["[CLS]"]] # `[CLS]' symbol # label_ids = [] # no `[CLS]' symbol for ori_pos in tokens_map_a: t_l = example.label[ori_pos] label_ids.append(label_map[t_l]) if len(label_ids) == len(input_ids) - 1: # exclude last [SEP] break label_ids.append(label_map["[SEP]"]) # `[SEP]' symbol assert len(label_ids) == len(input_ids), "Label and sent len diff: {} and {}".format(len(label_ids), len(input_ids)) # Zero-pad up to the sequence length. 
while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) label_ids.append(0) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length if ex_index < 3: logger.info("*** Example ***") logger.info("guid: %s" % (example.guid)) logger.info("tokens: %s" % " ".join( [tokenization.printable_text(x) for x in tokens])) logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) logger.info( "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) logger.info("labels: %s (id = %s)" % (' '.join([reverse_label_map[ll] for ll in label_ids]), label_ids)) features.append( InputFeatures( input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_ids)) logger.info("Max length in data {}: {}".format(max_len_in_data, ' '.join(max_len_in_data_tokens))) return features
def convert_examples_to_features(examples, seq_length, tokenizer): """Loads a data file into a list of `InputBatch`s.""" features = [] for (ex_index, example) in enumerate(examples): tokens_a = tokenizer.tokenize(example.text_a) tokens_b = None if example.text_b: tokens_b = tokenizer.tokenize(example.text_b) if tokens_b: # Modifies `tokens_a` and `tokens_b` in place so that the total # length is less than the specified length. # Account for [CLS], [SEP], [SEP] with "- 3" _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) else: # Account for [CLS] and [SEP] with "- 2" if len(tokens_a) > seq_length - 2: tokens_a = tokens_a[0:(seq_length - 2)] # The convention in BERT is: # (a) For sequence pairs: # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 # (b) For single sequences: # tokens: [CLS] the dog is hairy . [SEP] # type_ids: 0 0 0 0 0 0 0 # # Where "type_ids" are used to indicate whether this is the first # sequence or the second sequence. The embedding vectors for `type=0` and # `type=1` were learned during pre-training and are added to the wordpiece # embedding vector (and position vector). This is not *strictly* necessary # since the [SEP] token unambiguously separates the sequences, but it makes # it easier for the model to learn the concept of sequences. # # For classification tasks, the first vector (corresponding to [CLS]) is # used as as the "sentence vector". Note that this only makes sense because # the entire model is fine-tuned. tokens = [] input_type_ids = [] tokens.append("[CLS]") input_type_ids.append(0) for token in tokens_a: tokens.append(token) input_type_ids.append(0) tokens.append("[SEP]") input_type_ids.append(0) if tokens_b: for token in tokens_b: tokens.append(token) input_type_ids.append(1) tokens.append("[SEP]") input_type_ids.append(1) input_ids = tokenizer.convert_tokens_to_ids(tokens) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1] * len(input_ids) # Zero-pad up to the sequence length. while len(input_ids) < seq_length: input_ids.append(0) input_mask.append(0) input_type_ids.append(0) assert len(input_ids) == seq_length assert len(input_mask) == seq_length assert len(input_type_ids) == seq_length if ex_index < 5: tf.logging.info("*** Example ***") tf.logging.info("unique_id: %s" % (example.unique_id)) tf.logging.info("tokens: %s" % " ".join( [tokenization.printable_text(x) for x in tokens])) tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) tf.logging.info( "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids])) features.append( InputFeatures( unique_id=example.unique_id, tokens=tokens, input_ids=input_ids, input_mask=input_mask, input_type_ids=input_type_ids)) return features
def write_instance_to_example_files(instances, tokenizer, max_seq_length,
                                    max_predictions_per_seq, output_files):
    """Create TF example files from `TrainingInstance`s."""
    writers = []
    for output_file in output_files:
        writers.append(tf.python_io.TFRecordWriter(output_file))
    writer_index = 0
    total_written = 0
    for (inst_index, instance) in enumerate(instances):
        input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
        input_mask = [1] * len(input_ids)
        segment_ids = list(instance.segment_ids)
        assert len(input_ids) <= max_seq_length
        # Pad every sequence to the same length: if it is shorter than
        # max_seq_length, pad with zeros.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        masked_lm_positions = list(instance.masked_lm_positions)
        masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
        masked_lm_weights = [1.0] * len(masked_lm_ids)
        while len(masked_lm_positions) < max_predictions_per_seq:
            masked_lm_positions.append(0)
            masked_lm_ids.append(0)
            masked_lm_weights.append(0.0)
        next_sentence_label = 1 if instance.is_random_next else 0
        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(input_ids)
        features["input_mask"] = create_int_feature(input_mask)
        features["segment_ids"] = create_int_feature(segment_ids)
        features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
        features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
        features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
        features["next_sentence_labels"] = create_int_feature([next_sentence_label])
        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
        writers[writer_index].write(tf_example.SerializeToString())
        writer_index = (writer_index + 1) % len(writers)
        total_written += 1
        if inst_index < 20:
            tf.logging.info("*** Example ***")
            tf.logging.info("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in instance.tokens]))
            for feature_name in features.keys():
                feature = features[feature_name]
                values = []
                if feature.int64_list.value:
                    values = feature.int64_list.value
                elif feature.float_list.value:
                    values = feature.float_list.value
                tf.logging.info(
                    "%s: %s" % (feature_name, " ".join([str(x) for x in values])))
    for writer in writers:
        writer.close()
    tf.logging.info("Wrote %d total instances", total_written)
def convert_documents_to_features(examples, tokenizer, max_seq_length, doc_stride): """Loads a data file into a list of `InputBatch`s.""" unique_id = 1000000000 features = [] for (example_index, example) in enumerate(tqdm(examples, desc='Converting documents')): # Creating a map between word <=> (sub)token tok_to_word_index = [] word_to_tok_index = [] # word to (start of) subtokens all_doc_tokens = [] for (i, word) in enumerate(example.doc_words): word_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(word) for sub_token in sub_tokens: tok_to_word_index.append(i) all_doc_tokens.append(sub_token) # The -2 accounts for [CLS], [SEP] max_tokens_for_doc = max_seq_length - 2 # Split sequence by max_seq_len with doc_stride, _DocSpan is based on tokens without [CLS], [SEP] _DocSpan = collections.namedtuple( # pylint: disable=invalid-name "DocSpan", ["start", "length"]) doc_spans = [] start_tok_offset = 0 # From all_doc_tokens # Get doc_spans with stride and offset while start_tok_offset < len(all_doc_tokens): length = len(all_doc_tokens) - start_tok_offset if length > max_tokens_for_doc: length = max_tokens_for_doc doc_spans.append(_DocSpan(start=start_tok_offset, length=length)) if start_tok_offset + length == len(all_doc_tokens): break start_tok_offset += min( length, doc_stride) # seems to prefer doc_stride always assert doc_stride < length, "length is no larger than doc_stride for {}".format( doc_spans) # Iterate each doc_span and make out_tokens for (doc_span_index, doc_span) in enumerate(doc_spans): out_tokens = [] # doc out_tokens.append("[CLS]") token_to_word_map = { } # The difference with tok_to_word_index is it includes special tokens token_is_max_context = {} # For each doc token, create token_to_word_map and is_max_context, and add to out_tokens for i in range(doc_span.length): split_token_index = doc_span.start + i token_to_word_map[len( out_tokens)] = tok_to_word_index[split_token_index] is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index) token_is_max_context[len(out_tokens)] = is_max_context out_tokens.append(all_doc_tokens[split_token_index]) out_tokens.append("[SEP]") # Convert to ids and masks input_ids = tokenizer.convert_tokens_to_ids(out_tokens) input_mask = [1] * len(input_ids) # Zero-pad up to the sequence length. while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length # Printing for debug if example_index < 1 and doc_span_index < 1: logger.info("*** Example ***") logger.info("unique_id: %s" % (unique_id)) logger.info("example_index: %s" % (example_index)) logger.info("doc_span_index: %s" % (doc_span_index)) logger.info("tokens: %s" % " ".join( [tokenization.printable_text(x) for x in out_tokens])) logger.info("token_to_word_map: %s" % " ".join([ "%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_word_map) ])) logger.info("token_is_max_context: %s" % " ".join([ "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context) ])) logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) # Append feature features.append( ContextFeatures(unique_id=unique_id, example_index=example_index, doc_span_index=doc_span_index, tokens=out_tokens, token_to_word_map=token_to_word_map, token_is_max_context=token_is_max_context, input_ids=input_ids, input_mask=input_mask)) unique_id += 1 return features
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, mode): label_map = {} for (i, label) in enumerate(label_list, 1): label_map[label] = i with open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'wb') as w: pickle.dump(label_map, w) textlist = example.text.split(' ') labellist = example.label.split(' ') tokens = [] labels = [] orig_to_tok = [] for i, word in enumerate(textlist): token = tokenizer.tokenize(word) orig_to_tok.append(len(tokens) + 1) # +1 for CLS tokens.extend(token) label_1 = labellist[i] for m in range(len(token)): if m == 0: labels.append(label_1) else: labels.append("X") # tokens = tokenizer.tokenize(example.text) if len(tokens) >= max_seq_length - 1: tokens = tokens[0:(max_seq_length - 2)] labels = labels[0:(max_seq_length - 2)] ntokens = [] segment_ids = [] label_ids = [] my_labels = [] ntokens.append("[CLS]") segment_ids.append(0) # append("O") or append("[CLS]") not sure! label_ids.append(label_map["[CLS]"]) my_labels.append("[CLS]") for i, token in enumerate(tokens): ntokens.append(token) segment_ids.append(0) label_ids.append(label_map[labels[i]]) my_labels.append(labels[i]) ntokens.append("[SEP]") segment_ids.append(0) # append("O") or append("[SEP]") not sure! label_ids.append(label_map["[SEP]"]) my_labels.append("[SEP]") input_ids = tokenizer.convert_tokens_to_ids(ntokens) input_mask = [1] * len(input_ids) #label_mask = [1] * len(input_ids) while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) # we don't concerned about it! label_ids.append(0) my_labels.append(0) ntokens.append("**NULL**") #label_mask.append(0) # print(len(input_ids)) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length assert len(label_ids) == max_seq_length #assert len(label_mask) == max_seq_length if ex_index < 5: tf.logging.info("*** Example ***") tf.logging.info("guid: %s" % (example.guid)) tf.logging.info( "tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens])) tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids])) #tf.logging.info("label_mask: %s" % " ".join([str(x) for x in label_mask])) feature = InputFeatures( input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_ids=label_ids, #label_mask = label_mask ) #write_tokens(ntokens,mode) my_write_tokens(ntokens, my_labels, mode) return feature
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer): """Converts a single `RaceExample` into a single `InputFeatures`.""" if isinstance(example, PaddingInputExample): four_options = [] for i in range(len(label_list)): option = Option(input_ids=[0] * max_seq_length, input_mask=[0] * max_seq_length, segment_ids=[0] * max_seq_length) four_options.append(option) return InputFeature(four_options=four_options, label_id=0, is_real_example=False) label_map = {'A': 0, 'B': 1, 'C': 2, 'D': 3} tokens_article = tokenizer.tokenize(example.article) tokens_question = tokenizer.tokenize(example.question) four_options = [] for option in example.four_options: tokens_option = tokenizer.tokenize(option) max_article_length = max_seq_length - len(tokens_question) - len( tokens_option) - 4 tokens_article_temp = tokens_article[:max_article_length] # The convention in BERT is: # (a) For sequence pairs: # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 # (b) For single sequences: # tokens: [CLS] the dog is hairy . [SEP] # type_ids: 0 0 0 0 0 0 0 # # Where "type_ids" are used to indicate whether this is the first # sequence or the second sequence. The embedding vectors for `type=0` and # `type=1` were learned during pre-training and are added to the wordpiece # embedding vector (and position vector). This is not *strictly* necessary # since the [SEP] token unambiguously separates the sequences, but it makes # it easier for the model to learn the concept of sequences. # # For classification tasks, the first vector (corresponding to [CLS]) is # used as the "sentence vector". Note that this only makes sense because # the entire model is fine-tuned. tokens = [] segment_ids = [] tokens.append("[CLS]") segment_ids.append(0) for token in tokens_article_temp: tokens.append(token) segment_ids.append(0) tokens.append("[SEP]") segment_ids.append(0) for token in tokens_question: tokens.append(token) segment_ids.append(1) tokens.append("[SEP]") segment_ids.append(1) for token in tokens_option: tokens.append(token) segment_ids.append(1) tokens.append("[SEP]") segment_ids.append(1) input_ids = tokenizer.convert_tokens_to_ids(tokens) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1] * len(input_ids) # Zero-pad up to the sequence length. while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length label_id = label_map[example.label] if ex_index < 1: tf.logging.info("*** Example ***") tf.logging.info("id: %s" % example.id) tf.logging.info( "tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens])) tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) token = Option(input_ids, input_mask, segment_ids) four_options.append(token) feature = InputFeature(four_options=four_options, label_id=label_id, is_real_example=True) return feature
def write_instance_to_example_files(instances, tokenizer, max_seq_length, max_predictions_per_seq, output_files): """Create TF example files from `TrainingInstance`s.""" writers = [] for output_file in output_files: writers.append(tf.io.TFRecordWriter(output_file)) writer_index = 0 total_written = 0 for (inst_index, instance) in enumerate(instances): input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) input_mask = [1] * len(input_ids) segment_ids = list(instance.segment_ids) token_boundary = list(instance.token_boundary) assert len(input_ids) <= max_seq_length while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) token_boundary.append(0) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length masked_lm_positions = list(instance.masked_lm_positions) masked_lm_ids = tokenizer.convert_tokens_to_ids( instance.masked_lm_labels) masked_lm_weights = [1.0] * len(masked_lm_ids) multiplier = 1 + int(FLAGS.do_permutation) while len(masked_lm_positions) < max_predictions_per_seq * multiplier: masked_lm_positions.append(0) masked_lm_ids.append(0) masked_lm_weights.append(0.0) sentence_order_label = 1 if instance.is_random_next else 0 features = collections.OrderedDict() features["input_ids"] = create_int_feature(input_ids) features["input_mask"] = create_int_feature(input_mask) features["segment_ids"] = create_int_feature(segment_ids) features["token_boundary"] = create_int_feature(token_boundary) features["masked_lm_positions"] = create_int_feature( masked_lm_positions) features["masked_lm_ids"] = create_int_feature(masked_lm_ids) features["masked_lm_weights"] = create_float_feature(masked_lm_weights) # Note: We keep this feature name `next_sentence_labels` to be compatible # with the original data created by lanzhzh@. However, in the ALBERT case # it does contain sentence_order_label. features["next_sentence_labels"] = create_int_feature( [sentence_order_label]) tf_example = tf.train.Example(features=tf.train.Features( feature=features)) writers[writer_index].write(tf_example.SerializeToString()) writer_index = (writer_index + 1) % len(writers) total_written += 1 if inst_index < 20: logging.info("*** Example ***") logging.info("tokens: %s" % " ".join( [tokenization.printable_text(x) for x in instance.tokens])) for feature_name in features.keys(): feature = features[feature_name] values = [] if feature.int64_list.value: values = feature.int64_list.value elif feature.float_list.value: values = feature.float_list.value logging.info("%s: %s" % (feature_name, " ".join([str(x) for x in values]))) for writer in writers: writer.close() meta_data = { "task_type": "albert_pretraining", "train_data_size": total_written, "max_seq_length": max_seq_length, "max_predictions_per_seq": FLAGS.max_predictions_per_seq } with tf.io.gfile.GFile(FLAGS.meta_data_file_path, "w") as writer: writer.write(json.dumps(meta_data, indent=4) + "\n") logging.info("Wrote %d total instances", total_written)
def convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training): """Loads a data file into a list of `InputBatch`s.""" unique_id = 1000000000 features = [] for (example_index, example) in enumerate(examples): query_tokens = tokenizer.tokenize(example.question_text) if len(query_tokens) > max_query_length: query_tokens = query_tokens[0:max_query_length] tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] for (i, token) in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) tok_start_position = None tok_end_position = None if is_training: tok_start_position = orig_to_tok_index[example.start_position] if example.end_position < len(example.doc_tokens) - 1: tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 else: tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.orig_answer_text) # The -3 accounts for [CLS], [SEP] and [SEP] max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 # We can have documents that are longer than the maximum sequence length. # To deal with this we do a sliding window approach, where we take chunks # of the up to our max length with a stride of `doc_stride`. _DocSpan = collections.namedtuple( # pylint: disable=invalid-name "DocSpan", ["start", "length"]) doc_spans = [] start_offset = 0 while start_offset < len(all_doc_tokens): length = len(all_doc_tokens) - start_offset if length > max_tokens_for_doc: length = max_tokens_for_doc doc_spans.append(_DocSpan(start=start_offset, length=length)) if start_offset + length == len(all_doc_tokens): break start_offset += min(length, doc_stride) for (doc_span_index, doc_span) in enumerate(doc_spans): tokens = [] token_to_orig_map = {} token_is_max_context = {} segment_ids = [] tokens.append("[CLS]") segment_ids.append(0) for token in query_tokens: tokens.append(token) segment_ids.append(0) tokens.append("[SEP]") segment_ids.append(0) for i in range(doc_span.length): split_token_index = doc_span.start + i token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index) token_is_max_context[len(tokens)] = is_max_context tokens.append(all_doc_tokens[split_token_index]) segment_ids.append(1) tokens.append("[SEP]") segment_ids.append(1) input_ids = tokenizer.convert_tokens_to_ids(tokens) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1] * len(input_ids) # Zero-pad up to the sequence length. while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length start_position = None end_position = None if is_training: # For training, if our document chunk does not contain an annotation # we throw it out, since there is nothing to predict. 
doc_start = doc_span.start doc_end = doc_span.start + doc_span.length - 1 if (example.start_position < doc_start or example.end_position < doc_start or example.start_position > doc_end or example.end_position > doc_end): continue doc_offset = len(query_tokens) + 2 start_position = tok_start_position - doc_start + doc_offset end_position = tok_end_position - doc_start + doc_offset if example_index < 20: tf.logging.info("*** Example ***") tf.logging.info("unique_id: %s" % (unique_id)) tf.logging.info("example_index: %s" % (example_index)) tf.logging.info("doc_span_index: %s" % (doc_span_index)) tf.logging.info("tokens: %s" % " ".join( [tokenization.printable_text(x) for x in tokens])) tf.logging.info("token_to_orig_map: %s" % " ".join( ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)])) tf.logging.info("token_is_max_context: %s" % " ".join([ "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context) ])) tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) tf.logging.info( "input_mask: %s" % " ".join([str(x) for x in input_mask])) tf.logging.info( "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) if is_training: answer_text = " ".join(tokens[start_position:(end_position + 1)]) tf.logging.info("start_position: %d" % (start_position)) tf.logging.info("end_position: %d" % (end_position)) tf.logging.info( "answer: %s" % (tokenization.printable_text(answer_text))) features.append( InputFeatures( unique_id=unique_id, example_index=example_index, doc_span_index=doc_span_index, tokens=tokens, token_to_orig_map=token_to_orig_map, token_is_max_context=token_is_max_context, input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, start_position=start_position, end_position=end_position)) unique_id += 1 return features
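The sliding-window logic above (doc_spans built with doc_stride) is easy to isolate. A small self-contained sketch of just that enumeration, with a toy call; enumerate_doc_spans is a hypothetical name for illustration:

import collections

DocSpan = collections.namedtuple("DocSpan", ["start", "length"])

def enumerate_doc_spans(num_doc_tokens, max_tokens_for_doc, doc_stride):
    """Covers a long document with possibly overlapping windows, advancing by
    doc_stride until the final token is included in some window."""
    doc_spans = []
    start_offset = 0
    while start_offset < num_doc_tokens:
        length = min(num_doc_tokens - start_offset, max_tokens_for_doc)
        doc_spans.append(DocSpan(start=start_offset, length=length))
        if start_offset + length == num_doc_tokens:
            break
        start_offset += min(length, doc_stride)
    return doc_spans

# A 1000-token document, a 384-token budget, stride 128:
print(enumerate_doc_spans(1000, 384, 128))
# -> windows starting at 0, 128, 256, 384, 512, 640; the last one ends at token 999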
def convert_single_example(ex_index, example, slot_list, class_types, max_seq_length, tokenizer): """Converts a single `InputExample` into a single `InputFeatures`.""" if isinstance(example, run_classifier.PaddingInputExample): return InputFeatures(input_ids=[0] * max_seq_length, input_mask=[0] * max_seq_length, segment_ids=[0] * max_seq_length, start_pos={slot: 0 for slot in slot_list}, end_pos={slot: 0 for slot in slot_list}, class_label_id={slot: 0 for slot in slot_list}, is_real_example=False, guid="NONE") class_label_id_dict = {} start_pos_dict = {} end_pos_dict = {} for slot in slot_list: tokens_a, token_labels_a = tokenize_text_and_label( example.text_a, example.text_a_label, slot, tokenizer) tokens_b, token_labels_b = tokenize_text_and_label( example.text_b, example.text_b_label, slot, tokenizer) input_text_too_long = util.truncate_length_and_warn( tokens_a, tokens_b, max_seq_length, example.guid) if input_text_too_long: if ex_index < 10: if len(token_labels_a) > len(tokens_a): tf.logging.info(' tokens_a truncated labels: %s' % str(token_labels_a[len(tokens_a):])) if len(token_labels_b) > len(tokens_b): tf.logging.info(' tokens_b truncated labels: %s' % str(token_labels_b[len(tokens_b):])) token_labels_a = token_labels_a[:len(tokens_a)] token_labels_b = token_labels_b[:len(tokens_b)] assert len(token_labels_a) == len(tokens_a) assert len(token_labels_b) == len(tokens_b) token_label_ids = util.get_token_label_ids(token_labels_a, token_labels_b, max_seq_length) class_label_id_dict[slot] = class_types.index( example.class_label[slot]) start_pos_dict[slot], end_pos_dict[slot] = util.get_start_end_pos( example.class_label[slot], token_label_ids, max_seq_length) tokens, input_ids, input_mask, segment_ids = util.get_bert_input( tokens_a, tokens_b, max_seq_length, tokenizer) if ex_index < 10: tf.logging.info("*** Example ***") tf.logging.info("guid: %s" % (example.guid)) tf.logging.info( "tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens])) tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) tf.logging.info("start_pos: %s" % str(start_pos_dict)) tf.logging.info("end_pos: %s" % str(end_pos_dict)) tf.logging.info("class_label_id: %s" % str(class_label_id_dict)) feature = InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, start_pos=start_pos_dict, end_pos=end_pos_dict, class_label_id=class_label_id_dict, is_real_example=True, guid=example.guid) return feature, input_text_too_long
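Unlike the other converters in this file, the slot-filling variant above returns a (feature, input_text_too_long) pair. A sketch of how a caller might use that flag to count truncated examples; examples, slot_list, class_types, max_seq_length and tokenizer are assumed to be in scope, and the bookkeeping names are illustrative only:

features = []
too_long_cnt = 0
for ex_index, example in enumerate(examples):
    feature, too_long = convert_single_example(
        ex_index, example, slot_list, class_types, max_seq_length, tokenizer)
    too_long_cnt += int(too_long)
    features.append(feature)
print("%d of %d examples exceeded max_seq_length and were truncated"
      % (too_long_cnt, len(examples)))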
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer): """Converts a single `InputExample` into a single `InputFeatures`.""" if isinstance(example, PaddingInputExample): return InputFeatures( input_ids=[0] * max_seq_length, input_mask=[0] * max_seq_length, segment_ids=[0] * max_seq_length, label_id=0, is_real_example=False, ) label_map = {} for (i, label) in enumerate(label_list): label_map[label] = i tokens_a = tokenizer.tokenize(example.text_a) tokens_b = None if example.text_b: tokens_b = tokenizer.tokenize(example.text_b) if tokens_b: # Modifies `tokens_a` and `tokens_b` in place so that the total # length is less than the specified length. # Account for [CLS], [SEP], [SEP] with "- 3" _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) else: # Account for [CLS] and [SEP] with "- 2" if len(tokens_a) > max_seq_length - 2: tokens_a = tokens_a[0:(max_seq_length - 2)] tokens = [] segment_ids = [] tokens.append("[CLS]") segment_ids.append(0) for token in tokens_a: tokens.append(token) segment_ids.append(0) tokens.append("[SEP]") segment_ids.append(0) if tokens_b: for token in tokens_b: tokens.append(token) segment_ids.append(1) tokens.append("[SEP]") segment_ids.append(1) input_ids = tokenizer.convert_tokens_to_ids(tokens) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1] * len(input_ids) # Zero-pad up to the sequence length. while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length label_id = label_map[str(example.label)] if ex_index < 5: tf.logging.info("*** Example ***") tf.logging.info("guid: %s" % (example.guid)) tf.logging.info( "tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens])) tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) tf.logging.info("meta: %s" % (example.meta)) feature = InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id, is_real_example=True, meta=example.meta) return feature
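_truncate_seq_pair is referenced above but not included in this excerpt. The standard BERT reference implementation trims the longer of the two sequences one token at a time, which keeps proportionally more of the shorter one; a sketch along those lines:

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a token pair in place so len(tokens_a) + len(tokens_b) <= max_length."""
    while len(tokens_a) + len(tokens_b) > max_length:
        # Always shorten the currently longer sequence rather than cutting
        # both by a fixed fraction.
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()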
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    if isinstance(example, PaddingInputExample):
        # Dummy padding example; label_ids (not label_id) to match the
        # constructor used for real examples below.
        return InputFeatures(input_ids=[0] * max_seq_length,
                             input_mask=[0] * max_seq_length,
                             segment_ids=[0] * max_seq_length,
                             label_ids=[0] * max_seq_length,
                             is_real_example=False)

    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i

    input_tokens = example.text.split(" ")
    token_labels = example.label.split(" ")
    pre_tokens = []
    pre_labels = []
    for i, word in enumerate(input_tokens):
        # Words not in BERT's vocab.txt are split into WordPiece sub-tokens;
        # every sub-token of a word keeps that word's original label.
        token = tokenizer.tokenize(word)
        pre_tokens.extend(token)
        pre_label = token_labels[i]
        for m in range(len(token)):
            pre_labels.append(pre_label)
    assert len(pre_tokens) == len(pre_labels), "{} \t {}".format(pre_tokens, pre_labels)

    # Account for [CLS] and [SEP] with "- 2"
    if len(pre_tokens) > max_seq_length - 2:
        pre_tokens = pre_tokens[0:(max_seq_length - 2)]
        pre_labels = pre_labels[0:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = []
    segment_ids = []
    label_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    label_ids.append(label_map["[CLS]"])
    for i, token in enumerate(pre_tokens):
        tokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[pre_labels[i]])
    tokens.append("[SEP]")
    segment_ids.append(0)
    label_ids.append(label_map["[SEP]"])

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length

    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))

    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_ids=label_ids,
                            is_real_example=True)
    return feature
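The label handling in the function above gives every WordPiece of a word the word's original label (the CRF variant further below instead keeps the label only on the first piece and marks the rest with "X"). A self-contained sketch of that alignment step; align_labels_to_subtokens is a hypothetical name and a stub tokenizer stands in for WordPiece so the example runs without a vocab file:

def align_labels_to_subtokens(words, word_labels, tokenize):
    """Expands word-level labels so each sub-token carries its word's label."""
    sub_tokens, sub_labels = [], []
    for word, label in zip(words, word_labels):
        pieces = tokenize(word)
        sub_tokens.extend(pieces)
        sub_labels.extend([label] * len(pieces))
    return sub_tokens, sub_labels

# Stub "WordPiece" that splits longer words in two, purely for illustration:
def toy_tokenize(word):
    return [word] if len(word) < 6 else [word[:3], "##" + word[3:]]

tokens, labels = align_labels_to_subtokens(
    ["Jim", "Henson", "was", "a", "puppeteer"],
    ["B-PER", "I-PER", "O", "O", "O"], toy_tokenize)
# tokens: ['Jim', 'Hen', '##son', 'was', 'a', 'pup', '##peteer']
# labels: ['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O']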
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer): """Converts a single `InputExample` into a single `InputFeatures`.""" if isinstance(example, PaddingInputExample): return InputFeatures( input_ids=[0] * max_seq_length, input_mask=[0] * max_seq_length, segment_ids=[0] * max_seq_length, label_id=0, label_weight=1., is_real_example=False) label_map = {} for (i, label) in enumerate(label_list): label_map[label] = i tokens_a = tokenizer.tokenize(example.text_a) tokens_b = None if example.text_b: tokens_b = tokenizer.tokenize(example.text_b) if tokens_b: # Modifies `tokens_a` and `tokens_b` in place so that the total # length is less than the specified length. # Account for [CLS], [SEP], [SEP] with "- 3" _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) else: # Account for [CLS] and [SEP] with "- 2" if len(tokens_a) > max_seq_length - 2: tokens_a = tokens_a[0:(max_seq_length - 2)] # The convention in BERT is: # (a) For sequence pairs: # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 # (b) For single sequences: # tokens: [CLS] the dog is hairy . [SEP] # type_ids: 0 0 0 0 0 0 0 # # Where "type_ids" are used to indicate whether this is the first # sequence or the second sequence. The embedding vectors for `type=0` and # `type=1` were learned during pre-training and are added to the wordpiece # embedding vector (and position vector). This is not *strictly* necessary # since the [SEP] token unambiguously separates the sequences, but it makes # it easier for the model to learn the concept of sequences. # # For classification tasks, the first vector (corresponding to [CLS]) is # used as the "sentence vector". Note that this only makes sense because # the entire model is fine-tuned. tokens = [] segment_ids = [] tokens.append("[CLS]") segment_ids.append(0) for token in tokens_a: tokens.append(token) segment_ids.append(0) tokens.append("[SEP]") segment_ids.append(0) if tokens_b: for token in tokens_b: tokens.append(token) segment_ids.append(1) tokens.append("[SEP]") segment_ids.append(1) input_ids = tokenizer.convert_tokens_to_ids(tokens) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1] * len(input_ids) # Zero-pad up to the sequence length. while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length label_id = label_map[example.label] if ex_index < 5: tf.logging.info("*** Example ***") tf.logging.info("guid: %s" % (example.guid)) tf.logging.info("tokens: %s" % " ".join( [tokenization.printable_text(x) for x in tokens])) tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) feature = InputFeatures( input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id, label_weight=example.label_weight, is_real_example=True) return feature
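This variant carries a per-example label_weight through to the feature, but the code that consumes it is not part of this excerpt. A common pattern (an assumption, not this repository's code) is to scale the per-example cross-entropy by that weight:

import tensorflow as tf

def weighted_classification_loss(logits, label_ids, label_weights):
    """Per-example cross-entropy scaled by label_weight, averaged over the batch."""
    per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=label_ids, logits=logits)
    weighted = per_example_loss * label_weights
    # Normalize by the total weight so heavily down-weighted batches do not
    # shrink the loss toward zero.
    return tf.reduce_sum(weighted) / (tf.reduce_sum(label_weights) + 1e-8)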
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, mode):
    """Converts a single `InputExample` into a single `InputFeatures`.

    :param ex_index: example index
    :param example: the `InputExample` to convert
    :param label_list: all labels
    :param max_seq_length: maximum sequence length
    :param tokenizer: WordPiece tokenizer
    :param mode: unused here; kept for API compatibility
    :return: (feature, ntokens, label_ids)

    In this part the input sentence is rebuilt into the following format:
        tokens: [Jim, Hen, ##son, was, a, puppet, ##eer]
        labels: [I-PER, I-PER, X, O, O, O, X]
    """
    # The mapping starts at 0, so "[PAD]" (the first entry of label_list)
    # gets id 0, which matches the zero padding applied below.
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i
    # with open(FLAGS.middle_output + "/label2id.pkl", 'wb') as w:
    #     pickle.dump(label_map, w)

    textlist = example.text.split(' ')
    labellist = example.label.split(' ')
    tokens = []
    labels = []
    for word, label in zip(textlist, labellist):
        word_pieces = tokenizer.tokenize(word)
        tokens.extend(word_pieces)
        for m in range(len(word_pieces)):
            # The first sub-token keeps the word's label; the rest get "X".
            labels.append(label if m == 0 else "X")

    # Only account for [CLS] with "- 1"; no [SEP] is appended (see below).
    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:(max_seq_length - 1)]
        labels = labels[0:(max_seq_length - 1)]

    ntokens = []
    segment_ids = []
    label_ids = []
    ntokens.append("[CLS]")
    segment_ids.append(0)
    label_ids.append(label_map["[CLS]"])
    for i, token in enumerate(tokens):
        ntokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labels[i]])
    # No "[SEP]" is appended: a trailing stop tag adds little for tagging and
    # can even cause problems when a CRF layer is used on top.

    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    mask = [1] * len(input_ids)

    # Zero-pad up to max_seq_length; padded positions use the "[PAD]" token
    # and label id 0.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)
        ntokens.append("[PAD]")

    assert len(input_ids) == max_seq_length
    assert len(mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    assert len(ntokens) == max_seq_length

    if ex_index < 3:
        logging.info("*** Example ***")
        logging.info("guid: %s" % (example.guid))
        logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        logging.info("input_mask: %s" % " ".join([str(x) for x in mask]))
        logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))

    feature = InputFeatures(
        input_ids=input_ids,
        mask=mask,
        segment_ids=segment_ids,
        label_ids=label_ids,
    )
    # ntokens is returned as well so that predictions can be mapped back to
    # the original tokens.
    return feature, ntokens, label_ids
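The commented-out pickle lines above suggest the label map is meant to be persisted so that prediction-time decoding can invert it. A sketch of that round trip; the helper names and the file path are hypothetical:

import pickle

def save_label_map(label_list, path):
    """Persists the label->id mapping used during feature conversion."""
    label_map = {label: i for i, label in enumerate(label_list)}
    with open(path, "wb") as f:
        pickle.dump(label_map, f)
    return label_map

def load_id_to_label(path):
    """Loads the mapping back and inverts it for decoding predicted ids."""
    with open(path, "rb") as f:
        label_map = pickle.load(f)
    return {i: label for label, i in label_map.items()}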
def write_instance_to_example_files(instances, tokenizer, max_seq_length, max_predictions_per_seq, output_files): """Create TF example files from `TrainingInstance`s.""" writers = [] for output_file in output_files: writers.append(tf.python_io.TFRecordWriter(output_file)) writer_index = 0 total_written = 0 for (inst_index, instance) in enumerate(instances): input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) input_mask = [1] * len(input_ids) segment_ids = list(instance.segment_ids) assert len(input_ids) <= max_seq_length while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length masked_lm_positions = list(instance.masked_lm_positions) masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) masked_lm_weights = [1.0] * len(masked_lm_ids) while len(masked_lm_positions) < max_predictions_per_seq: masked_lm_positions.append(0) masked_lm_ids.append(0) masked_lm_weights.append(0.0) next_sentence_label = 1 if instance.is_random_next else 0 features = collections.OrderedDict() features["input_ids"] = create_int_feature(input_ids) features["input_mask"] = create_int_feature(input_mask) features["segment_ids"] = create_int_feature(segment_ids) features["masked_lm_positions"] = create_int_feature(masked_lm_positions) features["masked_lm_ids"] = create_int_feature(masked_lm_ids) features["masked_lm_weights"] = create_float_feature(masked_lm_weights) features["next_sentence_labels"] = create_int_feature([next_sentence_label]) tf_example = tf.train.Example(features=tf.train.Features(feature=features)) writers[writer_index].write(tf_example.SerializeToString()) writer_index = (writer_index + 1) % len(writers) total_written += 1 if inst_index < 20: tf.logging.info("*** Example ***") tf.logging.info("tokens: %s" % " ".join( [tokenization.printable_text(x) for x in instance.tokens])) for feature_name in features.keys(): feature = features[feature_name] values = [] if feature.int64_list.value: values = feature.int64_list.value elif feature.float_list.value: values = feature.float_list.value tf.logging.info( "%s: %s" % (feature_name, " ".join([str(x) for x in values]))) for writer in writers: writer.close() tf.logging.info("Wrote %d total instances", total_written)
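A sketch of the matching read side for the records written above, assuming the same fixed shapes; the parsing spec uses the standard tf.io feature descriptions, and make_pretraining_parser is a hypothetical helper name:

import tensorflow as tf

def make_pretraining_parser(max_seq_length, max_predictions_per_seq):
    """Returns a fn that decodes one serialized tf.train.Example written above."""
    name_to_features = {
        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "masked_lm_positions": tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
        "masked_lm_ids": tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
        "masked_lm_weights": tf.io.FixedLenFeature([max_predictions_per_seq], tf.float32),
        "next_sentence_labels": tf.io.FixedLenFeature([1], tf.int64),
    }
    def parse(record):
        return tf.io.parse_single_example(record, name_to_features)
    return parse

# Example wiring (output_files as written above):
# dataset = tf.data.TFRecordDataset(output_files).map(
#     make_pretraining_parser(max_seq_length=128, max_predictions_per_seq=20))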
def convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, output_fn): """Loads a data file into a list of `InputBatch`s.""" unique_id = 1000000000 for (example_index, example) in enumerate(examples): query_tokens = tokenizer.tokenize(example.question_text) if len(query_tokens) > max_query_length: query_tokens = query_tokens[0:max_query_length] tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] for (i, token) in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) target = example.answer # The -3 accounts for [CLS], [SEP] and [SEP] max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 # We can have documents that are longer than the maximum sequence length. # To deal with this we do a sliding window approach, where we take chunks # of the up to our max length with a stride of `doc_stride`. _DocSpan = collections.namedtuple( # pylint: disable=invalid-name "DocSpan", ["start", "length"]) doc_spans = [] start_offset = 0 while start_offset < len(all_doc_tokens): length = len(all_doc_tokens) - start_offset if length > max_tokens_for_doc: length = max_tokens_for_doc doc_spans.append(_DocSpan(start=start_offset, length=length)) if start_offset + length == len(all_doc_tokens): break start_offset += min(length, doc_stride) for (doc_span_index, doc_span) in enumerate(doc_spans): tokens = [] token_to_orig_map = {} token_is_max_context = {} segment_ids = [] tokens.append("[CLS]") segment_ids.append(0) for token in query_tokens: tokens.append(token) segment_ids.append(0) tokens.append("[SEP]") segment_ids.append(0) for i in range(doc_span.length): split_token_index = doc_span.start + i token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index) token_is_max_context[len(tokens)] = is_max_context tokens.append(all_doc_tokens[split_token_index]) segment_ids.append(1) tokens.append("[SEP]") segment_ids.append(1) input_ids = tokenizer.convert_tokens_to_ids(tokens) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1] * len(input_ids) # Zero-pad up to the sequence length. 
while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length if example_index < 1: tf.logging.info("*** Example ***") tf.logging.info("unique_id: %s" % (unique_id)) # tf.logging.info("example_index: %s" % (example_index)) # tf.logging.info("doc_span_index: %s" % (doc_span_index)) tf.logging.info("tokens: %s" % " ".join( [tokenization.printable_text(x) for x in tokens])) # tf.logging.info("token_to_orig_map: %s" % " ".join( # ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)])) # tf.logging.info("token_is_max_context: %s" % " ".join([ # "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context) # ])) # tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) # tf.logging.info( # "input_mask: %s" % " ".join([str(x) for x in input_mask])) # tf.logging.info( # "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) if is_training: tf.logging.info( "target: %d" % (target)) feature = InputFeatures( unique_id=unique_id, example_index=example_index, doc_span_index=doc_span_index, tokens=tokens, token_to_orig_map=token_to_orig_map, token_is_max_context=token_is_max_context, input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, target=target) # Run callback output_fn(feature) unique_id += 1
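convert_examples_to_features above streams each InputFeatures through the output_fn callback rather than returning a list. One minimal way to use it is to pass a small collector object; FeatureCollector is purely illustrative, and the argument values in the commented call are assumptions:

class FeatureCollector(object):
    """Collects features in memory; a TFRecord writer could be passed instead."""
    def __init__(self):
        self.features = []
    def __call__(self, feature):
        self.features.append(feature)

# collector = FeatureCollector()
# convert_examples_to_features(
#     examples=eval_examples, tokenizer=tokenizer, max_seq_length=384,
#     doc_stride=128, max_query_length=64, is_training=False,
#     output_fn=collector)
# print("converted %d features" % len(collector.features))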