def convert_single_example(ex_index, example, label_map, max_seq_length, tokenizer, mode):
    textlist = example.text.split(' ')
    labellist = example.label.split(' ')
    tokens = []
    labels = []
    for i, word in enumerate(textlist):
        token = tokenizer.tokenize(word)
        # drop a bare SentencePiece word-boundary marker if the tokenizer emits one
        if '▁' in token:
            token.remove('▁')
        tokens.extend(token)
        label_1 = labellist[i]
        for m in range(len(token)):
            if m == 0:
                labels.append(label_1)
            else:
                labels.append("X")
    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:(max_seq_length - 2)]
        labels = labels[0:(max_seq_length - 2)]
    ntokens = []
    segment_ids = []
    label_ids = []
    ntokens.append("[CLS]")
    segment_ids.append(0)
    # "O" would also work here; this variant uses the dedicated [CLS] label
    label_ids.append(label_map["[CLS]"])
    for i, token in enumerate(tokens):
        ntokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labels[i]])
    ntokens.append("[SEP]")
    segment_ids.append(0)
    # "O" would also work here; this variant uses the dedicated [SEP] label
    label_ids.append(label_map["[SEP]"])
    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    input_mask = [1] * len(input_ids)
    # label_mask = [1] * len(input_ids)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)  # padding positions are masked out, so the value here does not matter
        ntokens.append("[PAD]")
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_ids,
    )
    write_tokens(ntokens, mode)
    return feature
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer):
    label_map = {}
    for (i, label) in enumerate(label_list, 1):
        label_map[label] = i
    textlist = example.text.split(' ')
    labellist = example.label.split(' ')
    tokens = []
    labels = []
    for i, word in enumerate(textlist):
        token = tokenizer.tokenize(word)
        tokens.extend(token)
        label_1 = labellist[i]
        for m in range(len(token)):
            if m == 0:
                labels.append(label_1)
            else:
                labels.append("X")
    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:(max_seq_length - 2)]
        labels = labels[0:(max_seq_length - 2)]
    ntokens = []
    segment_ids = []
    label_ids = []
    ntokens.append("[CLS]")
    segment_ids.append(0)
    label_ids.append(0)
    for i, token in enumerate(tokens):
        ntokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labels[i]])
    ntokens.append("[SEP]")
    segment_ids.append(0)
    label_ids.append(0)
    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    input_mask = [1] * len(input_ids)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
    feature = InputFeatures(input_ids=input_ids,
                            input_tokens=ntokens,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_ids=label_ids)
    return feature
def convert_examples_to_features(examples, seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""
    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)
        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > seq_length - 2:
                tokens_a = tokens_a[0:(seq_length - 2)]
        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        input_type_ids = []
        tokens.append("[CLS]")
        input_type_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            input_type_ids.append(0)
        tokens.append("[SEP]")
        input_type_ids.append(0)
        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                input_type_ids.append(1)
            tokens.append("[SEP]")
            input_type_ids.append(1)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)
        # Zero-pad up to the sequence length.
        while len(input_ids) < seq_length:
            input_ids.append(0)
            input_mask.append(0)
            input_type_ids.append(0)
        assert len(input_ids) == seq_length
        assert len(input_mask) == seq_length
        assert len(input_type_ids) == seq_length
        if ex_index < 5:
            tf.logging.info("*** Example ***")
            tf.logging.info("unique_id: %s" % (example.unique_id))
            tf.logging.info("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in tokens]))
            tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            tf.logging.info(
                "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
        features.append(
            InputFeatures(
                unique_id=example.unique_id,
                tokens=tokens,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids))
    return features
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, mode):
    """
    Analyze one example: map tokens to ids and labels to ids, then pack
    everything into an InputFeatures object.
    :param ex_index: index of the example
    :param example: a single example
    :param label_list: list of labels
    :param max_seq_length:
    :param tokenizer:
    :param mode:
    :return:
    """
    label_map = {}
    # start indexing labels from 1
    for (i, label) in enumerate(label_list, 1):
        label_map[label] = i
    # persist the label -> index map
    if not os.path.exists(os.path.join(FLAGS.output_dir, 'label2id.pkl')):
        with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'wb') as w:
            pickle.dump(label_map, w)
    textlist = example.text.split(' ')
    labellist = example.label.split(' ')
    tokens = []
    labels = []
    for i, word in enumerate(textlist):
        # word-piece tokenization; for Chinese this is character-level
        token = tokenizer.tokenize(word)
        tokens.extend(token)
        label_1 = labellist[i]
        for m in range(len(token)):
            if m == 0:
                labels.append(label_1)
            else:
                # sub-word pieces beyond the first get the placeholder label "X"
                labels.append("X")
    # truncate; "- 2" leaves room for the [CLS] and [SEP] markers
    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:(max_seq_length - 2)]
        labels = labels[0:(max_seq_length - 2)]
    ntokens = []
    segment_ids = []
    label_ids = []
    ntokens.append("[CLS]")  # sentence start marker
    segment_ids.append(0)
    # "O" would also work here (and would keep the label set smaller), but using
    # dedicated [CLS]/[SEP] labels for the sentence boundaries is fine as well
    label_ids.append(label_map["[CLS]"])
    for i, token in enumerate(tokens):
        ntokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labels[i]])
    ntokens.append("[SEP]")  # sentence end marker
    segment_ids.append(0)
    label_ids.append(label_map["[SEP]"])
    # convert the tokens (ntokens) to vocabulary ids
    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    input_mask = [1] * len(input_ids)
    # label_mask = [1] * len(input_ids)
    # padding
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)  # padding positions are masked out, so the value here does not matter
        ntokens.append("**NULL**")
        # label_mask.append(0)
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    # assert len(label_mask) == max_seq_length
    # log a few examples
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
        # tf.logging.info("label_mask: %s" % " ".join([str(x) for x in label_mask]))
    # pack into an InputFeatures object
    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_ids,
        # label_mask = label_mask
    )
    # only takes effect when mode == 'test'
    write_tokens(ntokens, mode)
    return feature
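# --- Added usage sketch (not part of the original sources) ---------------------
# A minimal, hypothetical illustration of how an NER-style converter like the one
# above is typically called. `ExampleStub` stands in for the repo's own
# InputExample class; the label list, vocab path, and sentence are placeholders,
# and the converter still relies on its surrounding module (FLAGS, write_tokens,
# InputFeatures) being importable.
import collections

import tokenization  # BERT's tokenization module

ExampleStub = collections.namedtuple("ExampleStub", ["guid", "text", "label"])

def build_demo_feature(vocab_file, max_seq_length=128):
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    label_list = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "X", "[CLS]", "[SEP]"]
    example = ExampleStub(guid="train-0",
                          text="Jim Henson was a puppeteer",
                          label="B-PER I-PER O O O")
    return convert_single_example(0, example, label_list, max_seq_length,
                                  tokenizer, mode="train")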
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    if isinstance(example, PaddingInputExample):
        return InputFeatures(
            input_ids=[0] * max_seq_length,
            input_mask=[0] * max_seq_length,
            segment_ids=[0] * max_seq_length,
            label_id=[0] * max_seq_length,
            is_real_example=False)
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i
    labelfile = os.path.join(FLAGS.output_dir, "nerlabel2id.pkl")
    with open(labelfile, 'wb') as fw:
        pickle.dump(label_map, fw)
    tokens = example.text.split("\t")
    labels = example.label.split("\t")
    # Account for [CLS] and [SEP] with "- 2"
    if len(tokens) > max_seq_length - 2:
        tokens = tokens[0:(max_seq_length - 2)]
        labels = labels[0:(max_seq_length - 2)]
    ntokens = []
    segment_ids = []
    label_id = []
    ntokens.append("[CLS]")
    segment_ids.append(0)
    label_id.append(label_map["[CLS]"])
    for i in range(len(tokens)):
        ntokens.append(tokens[i])
        segment_ids.append(0)
        label_id.append(label_map[labels[i]])
    ntokens.append("[SEP]")
    segment_ids.append(0)
    label_id.append(label_map["[SEP]"])
    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)
    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_id.append(label_map["[PAD]"])
        ntokens.append("[PAD]")
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_id) == max_seq_length
    assert len(ntokens) == max_seq_length
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in ntokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label_id: %s" % " ".join([str(x) for x in label_id]))
    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id,
        is_real_example=True)
    return feature
def convert_single_example(ex_index, example, max_seq_length, tokenizer, mode):
    """
    Analyze one example: map tokens to ids and pack everything into an
    InputFeatures object.
    :param ex_index: index of the example in the examples list
    :param example: an InputExample object holding a source and a target sentence
    :param max_seq_length: maximum sequence length
    :param tokenizer: tokenizer object
    :param mode: train, eval, or predict
    :return:
    """
    # WordPiece-tokenize the source and target sentences
    tokens_source = tokenizer.tokenize(example.source)
    tokens_target = tokenizer.tokenize(example.target)
    # truncate; "- 2" leaves room for the leading [CLS] and trailing [SEP]
    if len(tokens_source) > max_seq_length - 2:
        tokens_source = tokens_source[0:(max_seq_length - 2)]
    if len(tokens_target) > max_seq_length - 2:
        tokens_target = tokens_target[0:(max_seq_length - 2)]
    # add [CLS] and [SEP] around each sequence
    ntokens_source = ["[CLS]"] + tokens_source + ["[SEP]"]
    ntokens_target = ["[CLS]"] + tokens_target + ["[SEP]"]
    # map the tokens to vocabulary ids using the supplied vocab.txt
    source_ids = tokenizer.convert_tokens_to_ids(ntokens_source)
    target_ids = tokenizer.convert_tokens_to_ids(ntokens_target)
    # initialize the segment ids
    source_segment_ids = [0] * len(ntokens_source)
    target_segment_ids = [0] * len(ntokens_target)
    # mask: 1 for real tokens, 0 for padding; only real tokens are attended to
    source_mask = [1] * len(source_ids)
    target_mask = [1] * len(target_ids)
    # pad up to the maximum length
    while len(source_ids) < max_seq_length:
        source_ids.append(0)
        source_mask.append(0)
        source_segment_ids.append(0)
    while len(target_ids) < max_seq_length:
        target_ids.append(0)
        target_mask.append(0)
        target_segment_ids.append(0)
    assert len(source_ids) == max_seq_length
    assert len(source_mask) == max_seq_length
    assert len(source_segment_ids) == max_seq_length
    assert len(target_ids) == max_seq_length
    assert len(target_mask) == max_seq_length
    assert len(target_segment_ids) == max_seq_length
    # log a few examples
    if ex_index < 1:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens_source]))
        tf.logging.info("source_ids: %s" % " ".join([str(x) for x in source_ids]))
        tf.logging.info("source_mask: %s" % " ".join([str(x) for x in source_mask]))
        tf.logging.info("source_segment_ids: %s" % " ".join([str(x) for x in source_segment_ids]))
        tf.logging.info("target_ids: %s" % " ".join([str(x) for x in target_ids]))
        tf.logging.info("target_mask: %s" % " ".join([str(x) for x in target_mask]))
        tf.logging.info("target_segment_ids: %s" % " ".join([str(x) for x in target_segment_ids]))
    # pack into an InputFeatures object
    feature = InputFeatures(source_ids=source_ids,
                            source_mask=source_mask,
                            source_segment_ids=source_segment_ids,
                            target_ids=target_ids,
                            target_mask=target_mask,
                            target_segment_ids=target_segment_ids)
    return feature
def convert_single_example_to_feature(ex_index, example, label_map, max_seq_length, tokenizer, mode):
    textlist = example.tokens
    poslist = example.poss
    chunklist = example.chunks
    labellist = example.labels
    tokens = []
    poss = []
    chunks = []
    labels = []
    for i, word in enumerate(textlist):
        token = tokenizer.tokenize(word)
        tokens.extend(token)
        pos_1 = poslist[i]
        chunk_1 = chunklist[i]
        label_1 = labellist[i]
        for m in range(len(token)):
            if m == 0:
                poss.append(pos_1)
                chunks.append(chunk_1)
                labels.append(label_1)
            else:
                poss.append("X")
                chunks.append("X")
                labels.append("X")
    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:(max_seq_length - 2)]
        poss = poss[0:(max_seq_length - 2)]
        chunks = chunks[0:(max_seq_length - 2)]
        labels = labels[0:(max_seq_length - 2)]
    # save tokens, poss, chunks, labels back to the example
    example.tokenized_tokens = tokens
    example.tokenized_poss = poss
    example.tokenized_chunks = chunks
    example.tokenized_labels = labels
    ntokens = []
    segment_ids = []
    label_ids = []
    ntokens.append("[CLS]")
    segment_ids.append(0)
    # label_ids.append(label_map["[CLS]"])
    label_ids.append(0)
    for i, token in enumerate(tokens):
        ntokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labels[i]])
    ntokens.append("[SEP]")
    segment_ids.append(0)
    # label_ids.append(label_map["[SEP]"])
    label_ids.append(0)
    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    input_mask = [1] * len(input_ids)
    # padding
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)  # padding positions are masked out, so the value here does not matter
        ntokens.append("**NULL**")
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    if ex_index < 5:
        print("*** Example ***")
        print("guid: %s" % (example.guid))
        print("tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens]))
        print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        print("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        print("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        print("label_ids: %s" % " ".join([str(x) for x in label_ids]))
    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_ids,
    )
    return feature
def write_instance_to_example_files(instances, tokenizer, max_seq_length,
                                    max_predictions_per_seq, output_files):
    """Create TF example files from `TrainingInstance`s."""
    writers = []
    for output_file in output_files:
        writers.append(tf.python_io.TFRecordWriter(output_file))
    writer_index = 0
    total_written = 0
    for (inst_index, instance) in enumerate(instances):
        input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
        input_mask = [1] * len(input_ids)
        segment_ids = list(instance.segment_ids)
        assert len(input_ids) <= max_seq_length
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        masked_lm_positions = list(instance.masked_lm_positions)
        masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
        masked_lm_weights = [1.0] * len(masked_lm_ids)
        while len(masked_lm_positions) < max_predictions_per_seq:
            masked_lm_positions.append(0)
            masked_lm_ids.append(0)
            masked_lm_weights.append(0.0)
        next_sentence_label = 1 if instance.is_random_next else 0
        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(input_ids)
        features["input_mask"] = create_int_feature(input_mask)
        features["segment_ids"] = create_int_feature(segment_ids)
        features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
        features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
        features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
        features["next_sentence_labels"] = create_int_feature([next_sentence_label])
        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
        writers[writer_index].write(tf_example.SerializeToString())
        writer_index = (writer_index + 1) % len(writers)
        total_written += 1
        if inst_index < 20:
            tf.logging.info("*** Example ***")
            tf.logging.info("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in instance.tokens]))
            for feature_name in features.keys():
                feature = features[feature_name]
                values = []
                if feature.int64_list.value:
                    values = feature.int64_list.value
                elif feature.float_list.value:
                    values = feature.float_list.value
                tf.logging.info(
                    "%s: %s" % (feature_name, " ".join([str(x) for x in values])))
    for writer in writers:
        writer.close()
    tf.logging.info("Wrote %d total instances", total_written)
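# --- Added sketch (not part of the original sources) ---------------------------
# The helpers referenced above are conventionally thin wrappers around
# tf.train.Feature, and the written TFRecords are usually read back with a feature
# spec that mirrors the shapes written by write_instance_to_example_files. This is
# a hedged sketch assuming TF 1.x and that max_seq_length / max_predictions_per_seq
# match the values used at write time.
import tensorflow as tf

def create_int_feature(values):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

def create_float_feature(values):
    return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))

def read_pretraining_records(input_file, max_seq_length, max_predictions_per_seq):
    name_to_features = {
        "input_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
        "masked_lm_positions": tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
        "masked_lm_ids": tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
        "masked_lm_weights": tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
        "next_sentence_labels": tf.FixedLenFeature([1], tf.int64),
    }
    dataset = tf.data.TFRecordDataset(input_file)
    return dataset.map(lambda record: tf.parse_single_example(record, name_to_features))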
def convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride,
                                 max_query_length, is_training,
                                 verbose_logging=False, logger=None):
    """Loads a data file into a list of `InputBatch`s."""
    unique_id = 1000000000
    features = []
    for (example_index, example) in enumerate(examples):
        query_tokens = tokenizer.tokenize(example.question_text)
        if len(query_tokens) > max_query_length:
            query_tokens = query_tokens[0:max_query_length]
        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)
        tok_start_position = None
        tok_end_position = None
        if is_training:
            tok_start_position = orig_to_tok_index[example.start_position]
            if example.end_position < len(example.doc_tokens) - 1:
                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
            else:
                tok_end_position = len(all_doc_tokens) - 1
            (tok_start_position, tok_end_position) = _improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
                example.orig_answer_text)
        # The -3 accounts for [CLS], [SEP] and [SEP]
        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
        # We can have documents that are longer than the maximum sequence length.
        # To deal with this we do a sliding window approach, where we take chunks
        # of up to our max length with a stride of `doc_stride`.
        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
            "DocSpan", ["start", "length"])
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):
                break
            start_offset += min(length, doc_stride)
        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = []
            token_to_orig_map = {}
            token_is_max_context = {}
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)
            for token in query_tokens:
                tokens.append(token)
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)
            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
                is_max_context = _check_is_max_context(doc_spans, doc_span_index,
                                                       split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)
            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)
            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)
            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length
            start_position = None
            end_position = None
            if is_training:
                # For training, if our document chunk does not contain an annotation
                # we throw it out, since there is nothing to predict.
                doc_start = doc_span.start
                doc_end = doc_span.start + doc_span.length - 1
                out_of_span = False
                if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
                    out_of_span = True
                if out_of_span:
                    start_position = 0
                    end_position = 0
                else:
                    doc_offset = len(query_tokens) + 2
                    start_position = tok_start_position - doc_start + doc_offset
                    end_position = tok_end_position - doc_start + doc_offset
            if example_index < 2 and verbose_logging:
                logger.info("*** Example ***")
                logger.info("unique_id: %s" % (unique_id))
                logger.info("example_index: %s" % (example_index))
                logger.info("doc_span_index: %s" % (doc_span_index))
                logger.info("tokens: %s" % " ".join(
                    [tokenization.printable_text(x) for x in tokens]))
                logger.info("token_to_orig_map: %s" % " ".join(
                    ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)]))
                logger.info("token_is_max_context: %s" % " ".join(
                    ["%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context)]))
                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
                logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
                logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
                if is_training:
                    answer_text = " ".join(tokens[start_position:(end_position + 1)])
                    logger.info("start_position: %d" % (start_position))
                    logger.info("end_position: %d" % (end_position))
                    logger.info("answer: %s" % (tokenization.printable_text(answer_text)))
            features.append(
                InputFeatures(unique_id=unique_id,
                              example_index=example_index,
                              doc_span_index=doc_span_index,
                              tokens=tokens,
                              token_to_orig_map=token_to_orig_map,
                              token_is_max_context=token_is_max_context,
                              input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              start_position=start_position,
                              end_position=end_position))
            unique_id += 1
            if len(features) % 5000 == 0:
                logger.info("Processing features: %d" % (len(features)))
    return features
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s.

    Converts examples into token form as input for the BERT model.
    """
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i
    input_data = []
    guids = []
    for (ex_index, example) in enumerate(examples):
        guid = example.guid
        guids.append(guid)
        tokens_a = tokenizer.tokenize(example.text_a)
        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[0:(max_seq_length - 2)]
        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)
        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        label_id = label_map[example.label]
        if ex_index < 3:
            tf.logging.info("*** Example ***")
            tf.logging.info("guid: %s" % (example.guid))
            tf.logging.info("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in tokens]))
            tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            tf.logging.info("label: %s (id = %d)" % (example.label, label_id))
        features = collections.OrderedDict()
        features["input_ids"] = input_ids
        features["input_mask"] = input_mask
        features["segment_ids"] = segment_ids
        features["label_ids"] = label_id
        features["guid"] = guid
        input_data.append(features)
    return input_data
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, mode):
    """
    :param ex_index: example num
    :param example:
    :param label_list: all labels
    :param max_seq_length:
    :param tokenizer: WordPiece tokenization
    :param mode:
    :return: feature

    In this part we rebuild the input sentence into the following format.
    example: [Jim, Hen, ##son, was, a, puppet, ##eer]
    labels:  [B,   I,   X,     O,   O, O,      X]
    """
    label_map = {}
    # label indices start at 1; 0 is reserved for "[PAD]"
    for i, label in enumerate(label_list, 1):
        label_map[label] = i
    with open(FLAGS.output_dir + '/label2id.pkl', 'wb+') as wf:
        pickle.dump(label_map, wf)
    textlist = example.text.split(' ')
    labellist = example.label.split(' ')
    tokens = []
    labels = []
    for i, (word, label) in enumerate(zip(textlist, labellist)):
        token = tokenizer.tokenize(word)
        tokens.extend(token)
        for m in range(len(token)):
            if m == 0:
                labels.append(label)
            else:
                labels.append('X')
    # truncate, leaving room for the leading [CLS] and trailing [SEP]
    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:max_seq_length - 2]
        labels = labels[0:max_seq_length - 2]
    ntokens = []
    segment_ids = []
    label_ids = []
    # beginning marker [CLS]
    ntokens.append("[CLS]")
    segment_ids.append(0)
    label_ids.append(label_map["[CLS]"])
    for i, token in enumerate(tokens):
        ntokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labels[i]])
    # A trailing [SEP] is arguably unnecessary for sequence labelling, and it can
    # even complicate things when a CRF layer is used, but this variant keeps it.
    # ending marker [SEP]
    ntokens.append("[SEP]")
    segment_ids.append(0)
    label_ids.append(label_map["[SEP]"])
    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    input_mask = [1] * len(input_ids)
    # zero-pad the sequence
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)
        ntokens.append("**NULL**")
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    if ex_index < 5:
        logging.info("*** Example ***")
        logging.info("guid: %s" % (example.guid))
        logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_ids,
    )
    # ntokens is also returned so that predictions can be mapped back to the original tokens.
    return feature, ntokens, label_ids
def convert_single_example(ex_index, example, max_seq_length, tokenizer):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    tokens = tokenizer.tokenize(example.text)
    # Account for [CLS] and [SEP] with "- 2"
    if len(tokens) > max_seq_length - 2:
        tokens = tokens[0:(max_seq_length - 2)]
    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    segment_ids = [0] * len(tokens)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)
    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("labels: %s" % " ".join([str(x) for x in example.labels]))
    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=example.labels)
    return feature
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    if isinstance(example, PaddingInputExample):
        return InputFeatures(input_ids=[0] * max_seq_length,
                             input_mask=[0] * max_seq_length,
                             segment_ids=[0] * max_seq_length,
                             label_id=0,
                             is_real_example=False)
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i
    with open(FLAGS.middle_output + "/label2id.pkl", 'wb') as w:
        pickle.dump(label_map, w)
    tokens_a = tokenizer.tokenize(example.text_a)
    if len(tokens_a) > max_seq_length - 1:
        tokens_a = tokens_a[0:(max_seq_length - 1)]
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)
    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    label_id = label_map[example.label]
    if ex_index < 3:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label: %s (id = %d)" % (example.label, label_id))
    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_id,
                            is_real_example=True)
    return feature, label_id
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, mode):
    label_map = {}
    for (i, label) in enumerate(label_list, 1):
        label_map[label] = i
    with open('./output/label2id.pkl', 'wb') as w:
        pickle.dump(label_map, w)
    textlist = example.text.split(' ')
    labellist = example.label.split(' ')
    tokens = []
    labels = []
    for i, word in enumerate(textlist):
        token = tokenizer.tokenize(word)
        tokens.extend(token)
        label_1 = labellist[i]
        for m in range(len(token)):
            if m == 0:
                labels.append(label_1)
            else:
                labels.append("X")
    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:(max_seq_length - 2)]
        labels = labels[0:(max_seq_length - 2)]
    ntokens = []
    segment_ids = []
    label_ids = []
    ntokens.append("[CLS]")
    segment_ids.append(0)
    # "O" would also work here; this variant uses the dedicated [CLS] label
    label_ids.append(label_map["[CLS]"])
    for i, token in enumerate(tokens):
        ntokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labels[i]])
    ntokens.append("[SEP]")
    segment_ids.append(0)
    # "O" would also work here; this variant uses the dedicated [SEP] label
    label_ids.append(label_map["[SEP]"])
    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    input_mask = [1] * len(input_ids)
    # label_mask = [1] * len(input_ids)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)  # padding positions are masked out, so the value here does not matter
        ntokens.append("**NULL**")
        # label_mask.append(0)
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    # assert len(label_mask) == max_seq_length
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
        # tf.logging.info("label_mask: %s" % " ".join([str(x) for x in label_mask]))
    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_ids,
        # label_mask = label_mask
    )
    write_tokens(ntokens, mode)
    write_labels(labels, mode)
    return feature
def convert_single_example(ex_index, example, token_label_list, predicate_label_list,
                           max_seq_length, tokenizer):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    if isinstance(example, PaddingInputExample):
        return InputFeatures(
            input_ids=[0] * max_seq_length,
            input_mask=[0] * max_seq_length,
            segment_ids=[0] * max_seq_length,
            token_label_ids=[0] * max_seq_length,
            predicate_label_id=[0],
            is_real_example=False)
    token_label_map = {}
    for (i, label) in enumerate(token_label_list):
        token_label_map[label] = i
    predicate_label_map = {}
    for (i, label) in enumerate(predicate_label_list):
        predicate_label_map[label] = i
    text_token = example.text_token.split("\t")[0].split(" ")
    if example.token_label is not None:
        token_label = example.token_label.split("\t")[0].split(" ")
    else:
        token_label = ["O"] * len(text_token)
    assert len(text_token) == len(token_label)
    text_predicate = example.text_token.split("\t")[1]
    if example.token_label is not None:
        token_predicate = example.token_label.split("\t")[1]
    else:
        token_predicate = text_predicate
    assert text_predicate == token_predicate
    tokens_b = [text_predicate] * len(text_token)
    predicate_id = predicate_label_map[text_predicate]
    _truncate_seq_pair(text_token, tokens_b, max_seq_length - 3)
    tokens = []
    token_label_ids = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    token_label_ids.append(token_label_map["[CLS]"])
    for token, label in zip(text_token, token_label):
        tokens.append(token)
        segment_ids.append(0)
        token_label_ids.append(token_label_map[label])
    tokens.append("[SEP]")
    segment_ids.append(0)
    token_label_ids.append(token_label_map["[SEP]"])
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    # bert_tokenizer.convert_tokens_to_ids(["[SEP]"]) ---> [102]
    bias = 1  # vocab indices 1-100 are unused, so biased predicate ids stay clear of real tokens
    for token in tokens_b:
        # offset predicate ids so they do not collide with the word-piece vocab
        input_ids.append(predicate_id + bias)
        segment_ids.append(1)
        token_label_ids.append(token_label_map["[category]"])
    input_ids.append(tokenizer.convert_tokens_to_ids(["[SEP]"])[0])  # 102
    segment_ids.append(1)
    token_label_ids.append(token_label_map["[SEP]"])
    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)
    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        token_label_ids.append(0)
        tokens.append("[Padding]")
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(token_label_ids) == max_seq_length
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("token_label_ids: %s" % " ".join([str(x) for x in token_label_ids]))
        tf.logging.info("predicate_id: %s" % str(predicate_id))
    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        token_label_ids=token_label_ids,
        predicate_label_id=[predicate_id],
        is_real_example=True)
    return feature
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer):
    '''Converts a single `InputClass` into a single `InputFeatures`.'''
    if isinstance(example, PaddingClass):
        return InputFeatures(input_ids=[0] * max_seq_length,
                             input_mask=[0] * max_seq_length,
                             segment_ids=[0] * max_seq_length,
                             label_id=0,
                             is_real_example=False)
    label_map = {label: i for (i, label) in enumerate(label_list)}
    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = tokenizer.tokenize(example.text_b) if example.text_b else None
    if tokens_b:
        # pair input: reserve 3 positions for [CLS] and the two [SEP] markers
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # single input: reserve 2 positions for [CLS] and [SEP]
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    tokens = ['[CLS]', *[ii for ii in tokens_a], '[SEP]']
    segment_ids = [0] * (len(tokens_a) + 2)  # segment id 0 for [CLS], sentence A, and its [SEP]
    if tokens_b:
        tokens = [*tokens, *[ii for ii in tokens_b], '[SEP]']
        segment_ids = [*segment_ids, *[1] * (len(tokens_b) + 1)]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)  # mask 0: pad; 1: real
    pad_len = max_seq_length - len(input_ids)
    if pad_len:
        # zero-pad up to the sequence length
        input_ids = [*input_ids, *[0] * pad_len]
        input_mask = [*input_mask, *[0] * pad_len]
        segment_ids = [*segment_ids, *[0] * pad_len]
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    label_id = label_map[example.label]
    if ex_index < 5:
        tf.logging.info(">>>>> Example >>>>>")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label: %s (id = %d)" % (example.label, label_id))
    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_id,
                            is_real_example=True)
    return feature
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    if isinstance(example, PaddingInputExample):
        return InputFeatures(input_ids=[0] * max_seq_length,
                             input_mask=[0] * max_seq_length,
                             segment_ids=[0] * max_seq_length,
                             label_id=0,
                             is_real_example=False)
    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = tokenizer.tokenize(example.text_b)
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_seq_length - 3:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)
    for token in tokens_b:
        tokens.append(token)
        segment_ids.append(1)
    tokens.append("[SEP]")
    segment_ids.append(1)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)
    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    if example.label in label_list:
        label_id = int(example.label)
    else:
        label_id = -1
    if ex_index < 3:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label: %s (id = %d)" % (example.label, label_id))
    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_id,
                            is_real_example=True)
    raw_tokens = example.text_a + '[SEP]' + example.text_b
    return feature, raw_tokens, label_id
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    if isinstance(example, PaddingInputExample):
        return InputFeatures(input_ids=[0] * max_seq_length,
                             input_mask=[0] * max_seq_length,
                             segment_ids=[0] * max_seq_length,
                             label_id=0,
                             is_real_example=False)
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i
    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)
    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]
    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)
    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)
    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    label_id = label_map[example.label]
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label: %s (id = %d)" % (example.label, label_id))
    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_id,
                            is_real_example=True)
    return feature
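# --- Added serialization sketch (not part of the original sources) -------------
# A hedged illustration of how the classifier-style `InputFeatures` returned above
# is commonly turned into a tf.train.Example for file-based training. The attribute
# names follow the constructor call above; the helper name and the OrderedDict
# layout are conventional rather than taken from this code.
import collections
import tensorflow as tf

def feature_to_tf_example(feature):
    def create_int_feature(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(feature.input_ids)
    features["input_mask"] = create_int_feature(feature.input_mask)
    features["segment_ids"] = create_int_feature(feature.segment_ids)
    features["label_ids"] = create_int_feature([feature.label_id])
    features["is_real_example"] = create_int_feature([int(feature.is_real_example)])
    return tf.train.Example(features=tf.train.Features(feature=features))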
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer,
                           forward, backward, mode):
    """
    :param ex_index: example num
    :param example:
    :param label_list: all labels
    :param max_seq_length:
    :param tokenizer: WordPiece tokenization
    :param mode:
    :return: feature

    In this part we rebuild the input sentence into the following format.
    example: [Jim, Hen, ##son, was, a, puppet, ##eer]
    labels:  [B,   I,   X,     O,   O, O,      X]
    """
    label_map = dict()
    # label indices start at 1; 0 is reserved for "[PAD]"
    for i, label in enumerate(label_list, 1):
        label_map[label] = i
    with open('./data/{}/label2id.pkl'.format(FLAGS.dataset), 'wb+') as wf:
        pickle.dump(label_map, wf)
    with open('./data/{}/pos2id.pkl'.format(FLAGS.dataset), 'rb') as rf:
        pos2id = pickle.load(rf)
    with open('./data/{}/mark2id.pkl'.format(FLAGS.dataset), 'rb') as rf2:
        mark2id = pickle.load(rf2)
    textlist = example.text.split(' ')
    labellist = example.label.split(' ')
    poslist = example.pos.split(' ')
    marklist = example.mark.split(' ')
    tokens = []
    labels = []
    poses = []
    marks = []
    for i, (word, label, pos, mark) in enumerate(zip(textlist, labellist, poslist, marklist)):
        token = tokenizer.tokenize(word)
        tokens.extend(token)
        for m in range(len(token)):
            if m == 0:
                labels.append(label)
                poses.append(pos)
                marks.append(mark)
            else:
                labels.append('X')
                poses.append(0)
                marks.append(0)
    # truncate, leaving room for the leading [CLS] and trailing [SEP]
    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:max_seq_length - 2]
        labels = labels[0:max_seq_length - 2]
        poses = poses[0:max_seq_length - 2]
        marks = marks[0:max_seq_length - 2]
    ntokens = []
    segment_ids = []
    label_ids = []
    pos_ids = []
    mark_ids = []
    # beginning marker [CLS]
    ntokens.append("[CLS]")
    segment_ids.append(0)
    label_ids.append(label_map["[CLS]"])
    pos_ids.append(pos2id["[CLS]"])
    mark_ids.append(mark2id["[CLS]"])
    for i, token in enumerate(tokens):
        ntokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labels[i]])
        pos_ids.append(int(poses[i]))
        mark_ids.append(int(marks[i]))
    # ending marker [SEP]
    ntokens.append("[SEP]")
    segment_ids.append(0)
    label_ids.append(label_map["[SEP]"])
    pos_ids.append(pos2id["[SEP]"])
    mark_ids.append(mark2id["[SEP]"])
    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    input_mask = [1] * len(input_ids)
    # zero-pad the sequence
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)
        pos_ids.append(0)
        mark_ids.append(0)
        ntokens.append("**NULL**")
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    assert len(pos_ids) == max_seq_length
    assert len(mark_ids) == max_seq_length
    if ex_index < 5:
        logging.info("*** Example ***")
        logging.info("guid: %s" % example.guid)
        logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
        logging.info("pos_ids: %s" % " ".join([str(x) for x in pos_ids]))
        logging.info("mark_ids: %s" % " ".join([str(x) for x in mark_ids]))
        logging.info("ntokens: %s" % " ".join([str(x) for x in ntokens]))
    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_ids,
        pos_ids=pos_ids,
        mark_ids=mark_ids,
        forward=forward,
        backward=backward,
    )
    # ntokens is also returned so that predictions can be mapped back to the original tokens.
    return feature, ntokens, label_ids
def convert_single_example(ex_index, example, entity_label_list, class_label_list,
                           max_seq_length, tokenizer, mode):
    class_label_map = {}
    for (i, label) in enumerate(class_label_list):
        class_label_map[label] = i
    label_map = {}
    label_map_reference = {}
    for (i, label) in enumerate(entity_label_list, 1):
        label_map[label] = i
        label_map_reference[label] = i
    label_map_reference['x'] = len(entity_label_list) + 1
    label2idpath = os.path.join(FLAGS.output_dir, 'label2id.json')
    label2idpath_reference = os.path.join(FLAGS.output_dir, 'label2id_reference.json')
    if not os.path.exists(label2idpath_reference):
        with open(label2idpath_reference, 'w', encoding='utf8') as w:
            json.dump(label_map_reference, w)
    textlist = example.text.split()
    labellist = list(example.entity_label)
    tokens = []
    labels = []
    reference_labels = []
    unknow_index = []
    for i, word in enumerate(textlist):
        token = tokenizer.tokenize(word)
        tokens.extend(token)
        label_1 = labellist[i]
        for m in range(len(token)):
            if m == 0:
                labels.append(label_1)
                reference_labels.append(label_1)
            else:
                if label_1 == 'o':
                    labels.append('o')
                else:
                    labels.append('m')
                reference_labels.append('x')
            if token[m] == "[UNK]":
                unknow_index.append(i)
    assert len(tokens) == len(labels) == len(reference_labels)
    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:(max_seq_length - 2)]
        labels = labels[0:(max_seq_length - 2)]
        reference_labels = reference_labels[0:(max_seq_length - 2)]
    ntokens = []
    segment_ids = []
    label_ids = []
    reference_label_ids = []
    ntokens.append("[CLS]")
    segment_ids.append(0)
    label_ids.append(label_map['o'])
    reference_label_ids.append(label_map_reference['o'])
    for i, token in enumerate(tokens):
        ntokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labels[i]])
        reference_label_ids.append(label_map_reference[reference_labels[i]])
    ntokens.append("[SEP]")
    segment_ids.append(0)
    label_ids.append(label_map['o'])
    reference_label_ids.append(label_map_reference['o'])
    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    input_mask = [1] * len(input_ids)
    class_label = class_label_map[example.class_label]
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)
        reference_label_ids.append(0)
        ntokens.append("**NULL**")
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    assert len(reference_label_ids) == max_seq_length
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_ids=label_ids,
                            reference_label_ids=reference_label_ids,
                            class_label=class_label)
    # map [UNK] tokens back to the original words before writing them out
    output_tokens = []
    for i, each in enumerate(ntokens):
        if each != "[UNK]":
            output_tokens.append(each)
        else:
            index = unknow_index[0]
            output_tokens.append(textlist[index])
            unknow_index = unknow_index[1:]
    write_tokens(output_tokens, mode)
    return feature
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer):
    """
    Analyze one example: map tokens to ids and labels to ids, then pack
    everything into an InputFeatures object.
    :param ex_index: index
    :param example: a single example
    :param label_list: list of labels
    :param max_seq_length:
    :param tokenizer:
    :return:
    """
    label_map = {}
    # start indexing labels from 1
    for (i, label) in enumerate(label_list, 1):
        label_map[label] = i
    tokens_a = tokenizer.tokenize(example.text_a)
    label_a = example.label.split(' ')
    if len(tokens_a) > max_seq_length:
        tokens_a = tokens_a[0:max_seq_length]
        label_a = label_a[0:max_seq_length]
    tokens = []
    label_ids = []
    segment_ids = []
    for index, token in enumerate(tokens_a):
        tokens.append(token)
        label_ids.append(label_map[label_a[index]])
        segment_ids.append(0)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        label_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_ids=label_ids)
    return feature
def convert_single_example_train(ex_index, example, label_list, max_seq_length,
                                 tokenizer, output_dir, mode):
    label_map = {}
    for (i, label) in enumerate(label_list, 1):
        label_map[label] = i
    if not os.path.exists(os.path.join(output_dir, 'label2id.pkl')):
        with codecs.open(os.path.join(output_dir, 'label2id.pkl'), 'wb') as w:
            pickle.dump(label_map, w)

    textlist = example.text.split(' ')
    labellist = example.label.split(' ')
    tokens = []
    labels = []
    for i, word in enumerate(textlist):
        # Tokenize; for Chinese this splits into characters, but characters
        # missing from BERT's vocab.txt are still WordPiece-processed.
        token = tokenizer.tokenize(word)
        tokens.extend(token)
        label_1 = labellist[i]
        for m in range(len(token)):
            if m == 0:
                labels.append(label_1)
            else:
                # The else branch is rarely reached.
                labels.append("X")

    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:(max_seq_length - 2)]
        labels = labels[0:(max_seq_length - 2)]

    ntokens = ["[CLS]"]
    segment_ids = [0]
    label_ids = [label_map["[CLS]"]]
    for i, token in enumerate(tokens):
        ntokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labels[i]])
    ntokens.append("[SEP]")
    segment_ids.append(0)
    label_ids.append(label_map["[SEP]"])

    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    input_mask = [1] * len(input_ids)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)
        ntokens.append("**NULL**")

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length

    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))

    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_ids=label_ids)
    # Only takes effect when mode == 'test'.
    write_tokens(ntokens, output_dir, mode)
    return feature
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer):
    """
    :param ex_index: example num
    :param example:
    :param label_list: all labels
    :param max_seq_length:
    :param tokenizer: WordPiece tokenization
    :return: feature

    In this part we rebuild the input sentence into the following format.
    example: [Jim, Hen, ##son, was, a, puppet, ##eer]
    labels:  [I-PER, I-PER, X, O, O, O, X]
    """
    label_map = {}
    # Ids start at zero, so "[PAD]" maps to 0.
    for (i, label) in enumerate(label_list):
        label_map[label] = i
    with open(info.middle_output + "/label2id.pkl", 'wb') as w:
        pickle.dump(label_map, w)

    textlist = example.text.split(' ')
    labellist = example.label.split(' ')
    tokens = []
    labels = []
    for i, (word, label) in enumerate(zip(textlist, labellist)):
        token = tokenizer.tokenize(word)
        tokens.extend(token)
        for m, _ in enumerate(token):
            if m == 0:
                labels.append(label)
            else:
                labels.append("X")

    # Only account for [CLS] with "- 1"; no [SEP] is appended below.
    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:(max_seq_length - 1)]
        labels = labels[0:(max_seq_length - 1)]

    ntokens = []
    segment_ids = []
    label_ids = []
    ntokens.append("[CLS]")
    segment_ids.append(0)
    label_ids.append(label_map["[CLS]"])
    for i, token in enumerate(tokens):
        ntokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labels[i]])
    # No "[SEP]" is added: a stop tag is not really necessary, and adding one
    # can even cause problems, especially when a CRF layer is used.

    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    mask = [1] * len(input_ids)
    # Pad with zeros.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)
        ntokens.append("[PAD]")

    assert len(input_ids) == max_seq_length
    assert len(mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    assert len(ntokens) == max_seq_length

    if ex_index < 3:
        logging.info("*** Example ***")
        logging.info("guid: %s" % (example.guid))
        logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        logging.info("input_mask: %s" % " ".join([str(x) for x in mask]))
        logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))

    feature = InputFeatures(
        input_ids=input_ids,
        mask=mask,
        segment_ids=segment_ids,
        label_ids=label_ids,
    )
    # We also need ntokens: at prediction time it lets us map back to the
    # original tokens.
    # return feature, ntokens, label_ids
    return feature
def convert_single_example_dev(ex_index, text, label, label2id, max_seq_length, tokenizer):
    """
    Analyze one example: convert characters to ids and labels to ids,
    then pack everything into an InputFeatures object.
    :param ex_index: index
    :param text: text of one example
    :param label: label string of one example
    :param label2id: label-to-id map
    :param max_seq_length:
    :param tokenizer:
    :return:
    """
    O_index = label2id["O"]
    # L: ['B-ORG', 'M-ORG', 'M-ORG', 'M-ORG']
    # W: ['中', '共', '中', '央']
    textlist = text.split(' ')
    labellist = label.split(' ')
    tokens = []
    labels = []
    for i, word in enumerate(textlist):
        # Tokenize each character/word; returns a list.
        token = tokenizer.tokenize(word)
        tokens.extend(token)
        label_1 = labellist[i]
        for m in range(len(token)):
            if m == 0:
                labels.append(label_1)
            else:
                # The else branch is rarely reached.
                labels.append("X")
    # Truncate; "- 2" leaves room for the start and end markers.
    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:(max_seq_length - 2)]
        labels = labels[0:(max_seq_length - 2)]

    ntokens = []
    segment_ids = []
    label_ids = []
    ntokens.append("[CLS]")  # sentence start marker
    segment_ids.append(0)
    label_ids.append(label2id["[CLS]"])
    for i, token in enumerate(tokens):
        ntokens.append(token)
        segment_ids.append(0)
        label_ids.append(label2id[labels[i]])
    ntokens.append("[SEP]")  # sentence end marker
    segment_ids.append(0)
    label_ids.append(label2id["[SEP]"])

    # Convert the tokens (ntokens) to their ids.
    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    input_mask = [1] * len(input_ids)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        # Pad labels with "O".
        label_ids.append(O_index)
        ntokens.append("[PAD]")
        # label_mask.append(0)

    # print(len(input_ids))
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    # assert len(label_mask) == max_seq_length

    # Log a few examples.
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
        # tf.logging.info("label_mask: %s" % " ".join([str(x) for x in label_mask]))

    # Pack everything into an InputFeatures object.
    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_ids,
        # label_mask=label_mask
    )
    return feature
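What distinguishes this variant is the padding convention: label_ids is padded with the id of the "O" tag instead of 0. A tiny sketch, with a made-up label2id map and illustrative token ids:

# Hypothetical label2id built the same way as above (ids start at 1).
label2id = {"O": 1, "B-ORG": 2, "M-ORG": 3, "[CLS]": 4, "[SEP]": 5}
max_seq_length = 8

input_ids = [101, 704, 1066, 102]   # illustrative ids for [CLS] 中 共 [SEP]
input_mask = [1] * len(input_ids)
segment_ids = [0] * len(input_ids)
label_ids = [label2id["[CLS]"], label2id["B-ORG"], label2id["M-ORG"], label2id["[SEP]"]]

while len(input_ids) < max_seq_length:
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)
    label_ids.append(label2id["O"])  # pad labels with "O", not 0

assert len(input_ids) == len(input_mask) == len(segment_ids) == len(label_ids) == max_seq_length
# label_ids -> [4, 2, 3, 5, 1, 1, 1, 1]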
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    if isinstance(example, PaddingInputExample):
        return InputFeatures(input_ids=[0] * max_seq_length,
                             input_mask=[0] * max_seq_length,
                             segment_ids=[0] * max_seq_length,
                             label_id=0,
                             is_real_example=False)

    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[example.label]
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label: %s (id = %d)" % (example.label, label_id))

    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_id,
                            is_real_example=True)
    return feature
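The function above calls a `_truncate_seq_pair` helper that is not shown in this snippet. The sketch below follows the convention of the original BERT `run_classifier.py`: trim one token at a time from whichever sequence is currently longer, so truncation is spread across both sequences rather than cutting only the second one.

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a pair of token lists in place to a maximum total length."""
    while len(tokens_a) + len(tokens_b) > max_length:
        # Remove from the end of the longer sequence so both sides share
        # the truncation roughly equally.
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()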
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, mode):
    """
    Analyze one example: convert characters to ids and labels to ids,
    then pack everything into an InputFeatures object.
    :param ex_index: index
    :param example: a single example
    :param label_list: list of labels
    :param max_seq_length:
    :param tokenizer:
    :param mode:
    :return:
    """
    label_map = {}
    # Index labels starting from 1.
    for (i, label) in enumerate(label_list, 1):
        label_map[label] = i
    # Save the label->index map.
    with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'wb') as w:
        pickle.dump(label_map, w)

    textlist = example.text.split(' ')
    labellist = example.label.split(' ')
    tokens = []
    labels = []
    for i, word in enumerate(textlist):
        # Tokenize; for Chinese this splits into characters.
        token = tokenizer.tokenize(word)
        tokens.extend(token)
        label_1 = labellist[i]
        for m in range(len(token)):
            if m == 0:
                labels.append(label_1)
            else:
                # The else branch is rarely reached.
                labels.append("X")
    # tokens = tokenizer.tokenize(example.text)

    # Truncate; "- 2" leaves room for the start and end markers.
    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:(max_seq_length - 2)]
        labels = labels[0:(max_seq_length - 2)]

    ntokens = []
    segment_ids = []
    label_ids = []
    ntokens.append("[CLS]")  # sentence start marker
    segment_ids.append(0)
    # append("O") or append("[CLS]") not sure!
    # Either works; O would keep the label set smaller, but marking the
    # sentence start and end with their own tags is fine as well.
    label_ids.append(label_map["[CLS]"])
    for i, token in enumerate(tokens):
        ntokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labels[i]])
    ntokens.append("[SEP]")  # sentence end marker
    segment_ids.append(0)
    # append("O") or append("[SEP]") not sure!
    label_ids.append(label_map["[SEP]"])

    # Convert the tokens (ntokens) to their ids.
    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    input_mask = [1] * len(input_ids)
    # label_mask = [1] * len(input_ids)

    # Padding.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        # We are not concerned about the padded label values.
        label_ids.append(0)
        ntokens.append("**NULL**")
        # label_mask.append(0)

    # print(len(input_ids))
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    # assert len(label_mask) == max_seq_length

    # Log a few examples.
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
        # tf.logging.info("label_mask: %s" % " ".join([str(x) for x in label_mask]))

    # Pack everything into an InputFeatures object.
    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_ids,
        # label_mask=label_mask
    )
    # Only takes effect when mode == 'test'.
    write_tokens(ntokens, mode)
    return feature
def convert_single_example(ex_index, example, token_label_list, sent_label_list,
                           max_seq_length, tokenizer):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    if isinstance(example, PaddingInputExample):
        return InputFeatures(
            input_ids=[0] * max_seq_length,
            input_masks=[0] * max_seq_length,
            segment_ids=[0] * max_seq_length,
            token_label_ids=[0] * max_seq_length,
            sent_label_id=0)

    token_label_map = {}
    for (i, token_label) in enumerate(token_label_list):
        token_label_map[token_label] = i

    sent_label_map = {}
    for (i, sent_label) in enumerate(sent_label_list):
        sent_label_map[sent_label] = i

    token_items = example.text.split(" ")
    token_label_items = example.token_label.split(" ")

    tokens = []
    token_labels = []
    for token, token_label in zip(token_items, token_label_items):
        token_subitems = tokenizer.tokenize(token)
        token_label_subitems = [token_label] + ["X"] * (len(token_subitems) - 1)
        tokens.extend(token_subitems)
        token_labels.extend(token_label_subitems)

    if len(tokens) > max_seq_length - 2:
        tokens = tokens[0:(max_seq_length - 2)]
    if len(token_labels) > max_seq_length - 2:
        token_labels = token_labels[0:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    input_tokens = []
    segment_ids = []
    token_label_ids = []
    sent_label_id = sent_label_map[example.sent_label]

    input_tokens.append("[CLS]")
    segment_ids.append(0)
    token_label_ids.append(token_label_map["[CLS]"])

    for i, token in enumerate(tokens):
        input_tokens.append(token)
        segment_ids.append(0)
        token_label_ids.append(token_label_map[token_labels[i]])

    input_tokens.append("[SEP]")
    segment_ids.append(0)
    token_label_ids.append(token_label_map["[SEP]"])

    input_ids = tokenizer.convert_tokens_to_ids(input_tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_masks = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_masks.append(0)
        segment_ids.append(0)
        token_label_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_masks) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(token_label_ids) == max_seq_length

    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_masks: %s" % " ".join([str(x) for x in input_masks]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("token_label_ids: %s" % " ".join([str(x) for x in token_label_ids]))
        tf.logging.info("sent_label_id: %s" % str(sent_label_id))

    feature = InputFeatures(
        input_ids=input_ids,
        input_masks=input_masks,
        segment_ids=segment_ids,
        token_label_ids=token_label_ids,
        sent_label_id=sent_label_id)
    return feature
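At prediction time the zero-based token_label_map above can be inverted to turn predicted ids back into tag strings. The helper below is a hypothetical sketch (not part of the original code), assuming the same label list is available; it drops the padding, special-token, and "X" continuation ids so only word-initial tags remain.

def decode_token_labels(pred_ids, token_label_list):
    """Map predicted ids back to tags, skipping padding, specials, and 'X'."""
    # Rebuild the same zero-based map used above, then invert it.
    id2label = {i: label for i, label in enumerate(token_label_list)}
    tags = []
    for pid in pred_ids:
        label = id2label.get(pid, "O")
        if label in ("[PAD]", "[CLS]", "[SEP]", "X"):
            continue
        tags.append(label)
    return tags


# Hypothetical label list; index 0 is whatever the task reserves for padding.
token_label_list = ["[PAD]", "O", "B-PER", "I-PER", "X", "[CLS]", "[SEP]"]
print(decode_token_labels([5, 2, 3, 4, 1, 6, 0, 0], token_label_list))
# ['B-PER', 'I-PER', 'O']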