def get_bert_embeddings(self, flattened_input_ids, flattened_input_mask,
                        is_training: bool):
    """Apply BERT to each sliding window and gather the embeddings of the
    original (non-padding) tokens.

    :param flattened_input_ids: [-1]
    :param flattened_input_mask: [-1]
    :param is_training: whether dropout is active inside BERT
    :return: (num_tokens, embed_size)
    """
    input_ids = tf.reshape(flattened_input_ids,
                           [-1, self.config.sliding_window_size])
    input_mask = tf.reshape(flattened_input_mask,
                            [-1, self.config.sliding_window_size])
    # Padding positions hold pad_idx; BERT expects a 0/1 attention mask.
    actual_mask = tf.cast(tf.not_equal(input_mask, self.config.pad_idx),
                          tf.int32)
    with tf.variable_scope('bert', reuse=tf.AUTO_REUSE):
        bert_model = BertModel(self.bert_config, is_training, input_ids,
                               actual_mask, scope='bert')
    bert_embeddings = bert_model.get_sequence_output()  # (num_windows, window_size, embed_size)
    flattened_embeddings = tf.reshape(bert_embeddings,
                                      [-1, self.bert_config.hidden_size])
    # Mask values >= 0 mark the tokens each window is responsible for.
    flattened_mask = tf.greater_equal(flattened_input_mask, 0)
    output_embeddings = tf.boolean_mask(flattened_embeddings, flattened_mask)
    return output_embeddings
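A minimal, self-contained sketch of the flatten-and-mask gather used above (TF1 graph mode assumed; the toy mask marks padding with -1, matching the greater_equal(mask, 0) test):

# Toy example: 2 windows of size 4; -1 marks padding/overlap positions.
import tensorflow as tf

mask = tf.constant([0, 1, 2, -1, 3, 4, -1, -1])               # flattened mask
emb = tf.reshape(tf.range(8 * 2, dtype=tf.float32), [8, 2])   # flattened embeddings
kept = tf.boolean_mask(emb, tf.greater_equal(mask, 0))        # real tokens only

with tf.Session() as sess:
    print(sess.run(kept).shape)  # (5, 2): the three padding rows are dropped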
class CheckpointSmallBERT(AbstractBase):
    def __init__(self, path, training=False, max_seq_length=512):
        self.max_seq_length = max_seq_length
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.input_ids = tf.compat.v1.placeholder(
                tf.int32, shape=(None, self.max_seq_length))
            self.input_mask = tf.compat.v1.placeholder(
                tf.int32, shape=(None, self.max_seq_length))
            self.segment_ids = tf.compat.v1.placeholder(
                tf.int32, shape=(None, self.max_seq_length))
            self.bert_config = BertConfig.from_json_file(
                path + '/bert_config.json')
            self.bert_module = BertModel(config=self.bert_config,
                                         is_training=training,
                                         input_ids=self.input_ids,
                                         input_mask=self.input_mask,
                                         token_type_ids=self.segment_ids,
                                         use_one_hot_embeddings=False)
            assignment_map, initialized_variable_names = \
                get_assignment_map_from_checkpoint(tf.trainable_variables(),
                                                   path + '/bert_model.ckpt')
            tf.train.init_from_checkpoint(path + '/bert_model.ckpt',
                                          assignment_map)
            self.sess = tf.compat.v1.Session()
            self.sess.run(
                tf.group(tf.compat.v1.global_variables_initializer(),
                         tf.compat.v1.tables_initializer()))
            self.bert_outputs = {
                'sequence_output': self.bert_module.get_sequence_output(),
                'pooled_output': self.bert_module.get_pooled_output(),
            }
            self.tok = tokenization.FullTokenizer(
                vocab_file=path + '/vocab.txt', do_lower_case=True)
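Hypothetical usage sketch for CheckpointSmallBERT; the model path and the manual padding below are assumptions for illustration, not part of the class:

bert = CheckpointSmallBERT('/path/to/small_bert')  # dir with ckpt, config, vocab
tokens = ['[CLS]'] + bert.tok.tokenize('hello world') + ['[SEP]']
ids = bert.tok.convert_tokens_to_ids(tokens)
pad = bert.max_seq_length - len(ids)
feed = {bert.input_ids: [ids + [0] * pad],
        bert.input_mask: [[1] * len(ids) + [0] * pad],
        bert.segment_ids: [[0] * bert.max_seq_length]}
outputs = bert.sess.run(bert.bert_outputs, feed_dict=feed)
print(outputs['pooled_output'].shape)    # (1, hidden_size)
print(outputs['sequence_output'].shape)  # (1, max_seq_length, hidden_size)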
def __init__(self):
    bert_pretrained_dir = args.pretrain_models_path + args.bert_model_name
    self.do_lower_case = args.bert_model_name.startswith('uncased')
    self.vocab_file = os.path.join(bert_pretrained_dir, 'vocab.txt')
    self.config_file = os.path.join(bert_pretrained_dir, 'bert_config.json')
    self.tokenizer = FullTokenizer(vocab_file=self.vocab_file,
                                   do_lower_case=self.do_lower_case)
    self.input_id = tf.placeholder(tf.int64, [None, None], 'input_ids')
    self.input_mask = tf.placeholder(tf.int64, [None, None], 'input_mask')
    self.segment_ids = tf.placeholder(tf.int64, [None, None], 'segment_ids')
    bert_config = BertConfig.from_json_file(self.config_file)
    model = BertModel(config=bert_config,
                      is_training=False,
                      input_ids=self.input_id,
                      input_mask=self.input_mask,
                      token_type_ids=self.segment_ids,
                      use_one_hot_embeddings=True,
                      scope='bert')
    self.output_layer = model.get_sequence_output()
    self.embedding_layer = model.get_embedding_output()
    saver = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.session = tf.Session(config=config)
    saver.restore(self.session, bert_pretrained_dir + '/bert_model.ckpt')
def model_fn(features, labels, mode, params):
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    # Derive the training flag from the Estimator mode instead of
    # hard-coding it, so dropout is disabled at eval/predict time.
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    model = BertModel(config, is_training, input_ids, input_mask, segment_ids)
    final_hidden = model.get_sequence_output()
    return final_hidden
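A common follow-on (an assumption here, mirroring the SQuAD-style span heads built on this pattern): project the final hidden states to per-token start/end logits.

final_hidden = model_fn(features, None, tf.estimator.ModeKeys.TRAIN, None)
span_logits = tf.layers.dense(
    final_hidden, 2,
    kernel_initializer=tf.truncated_normal_initializer(stddev=0.02))
start_logits, end_logits = tf.unstack(span_logits, axis=2)  # each [batch, seq_len]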
def qa_loop_body(i, starts, ends, labels, scores):
    input_ids = tf.reshape(
        flattened_input_ids,
        [-1, self.config.sliding_window_size])  # (num_windows, window_size)
    input_mask = tf.reshape(flattened_input_mask,
                            [-1, self.config.sliding_window_size])
    actual_mask = tf.cast(tf.not_equal(input_mask, self.config.pad_idx),
                          tf.int32)  # (num_windows, window_size)
    num_windows = tf.shape(actual_mask)[0]
    # Build the question for the i-th top span and prepend it to every window.
    question_tokens = self.get_question_token_ids(
        sentence_map, flattened_input_ids, flattened_input_mask,
        top_span_starts[i], top_span_ends[i])  # (num_question_tokens)
    tiled_question = tf.tile(tf.expand_dims(question_tokens, 0),
                             [num_windows, 1])  # (num_windows, num_ques_tokens)
    question_ones = tf.ones_like(tiled_question, dtype=tf.int32)
    question_zeros = tf.zeros_like(tiled_question, dtype=tf.int32)
    qa_input_ids = tf.concat([tiled_question, input_ids],
                             1)  # (num_windows, num_ques_tokens + window_size)
    qa_input_mask = tf.concat([question_ones, actual_mask],
                              1)  # (num_windows, num_ques_tokens + window_size)
    # Segment 0 for the question, segment 1 for non-padding context tokens.
    token_type_ids = tf.concat([question_zeros, actual_mask], 1)
    with tf.variable_scope('bert', reuse=tf.AUTO_REUSE):
        bert_model = BertModel(self.bert_config, is_training, qa_input_ids,
                               qa_input_mask, token_type_ids, scope='bert')
    bert_embeddings = bert_model.get_sequence_output(
    )  # (num_windows, num_ques_tokens + window_size, embed_size)
    flattened_embeddings = tf.reshape(bert_embeddings,
                                      [-1, self.bert_config.hidden_size])
    # Mark question positions with -1 so the mask below drops them.
    output_mask = tf.concat([-1 * question_ones, input_mask],
                            1)  # (num_windows, num_ques_tokens + window_size)
    flattened_mask = tf.reshape(tf.greater_equal(output_mask, 0), [-1])
    qa_embeddings = tf.boolean_mask(flattened_embeddings,
                                    flattened_mask)  # (num_tokens, embed_size)
    qa_scores, qa_indices, qa_starts, qa_ends, qa_embs = \
        self.filter_by_mention_scores(qa_embeddings, candidate_starts,
                                      candidate_ends, dropout, c)
    qa_cluster_ids = self.get_top_span_cluster_ids(candidate_starts,
                                                   candidate_ends, span_starts,
                                                   span_ends, cluster_ids,
                                                   qa_indices)
    return (i + 1,
            tf.concat([starts, tf.expand_dims(qa_starts, axis=0)], axis=0),
            tf.concat([ends, tf.expand_dims(qa_ends, axis=0)], axis=0),
            tf.concat([labels, tf.expand_dims(qa_cluster_ids, axis=0)], axis=0),
            tf.concat([scores, tf.expand_dims(qa_scores, axis=0)], axis=0))
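Hedged sketch of driving qa_loop_body with tf.while_loop; the loop bound num_top_spans, the per-iteration width c, and the initial empty tensors are assumptions inferred from the concat pattern in the body:

i0 = tf.constant(0)
loop_vars = [i0,
             tf.zeros([0, c], tf.int32),    # starts
             tf.zeros([0, c], tf.int32),    # ends
             tf.zeros([0, c], tf.int32),    # labels
             tf.zeros([0, c], tf.float32)]  # scores
shape_invariants = [i0.get_shape()] + [tf.TensorShape([None, None])] * 4
_, starts, ends, labels, scores = tf.while_loop(
    lambda i, *_: i < num_top_spans, qa_loop_body,
    loop_vars=loop_vars, shape_invariants=shape_invariants)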
def body(self, features, mode):
    """Body of the model, aka Bert

    Arguments:
        features {dict} -- feature dict, keys: input_ids, input_mask, segment_ids
        mode {mode} -- estimator mode

    Returns:
        dict -- features extracted from bert, keyed by:
            seq: tensor, [batch_size, seq_length, hidden_size]
            pooled: tensor, [batch_size, hidden_size]
            all: list of tensors, num_hidden_layers * [batch_size, seq_length, hidden_size]
            embed: tensor, [batch_size, seq_length, hidden_size]
            embed_table: the word embedding table, [vocab_size, hidden_size]
    """
    config = self.config
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    model = BertModel(config=config.bert_config,
                      is_training=is_training,
                      input_ids=input_ids,
                      input_mask=input_mask,
                      token_type_ids=segment_ids,
                      use_one_hot_embeddings=config.use_one_hot_embeddings)

    feature_dict = {
        'seq': model.get_sequence_output(),      # [batch_size, seq_length, hidden_size]
        'pooled': model.get_pooled_output(),     # [batch_size, hidden_size]
        'all': model.get_all_encoder_layers(),   # num_hidden_layers * [batch_size, seq_length, hidden_size]
        'embed': model.get_embedding_output(),   # for residual connections
        'embed_table': model.get_embedding_table(),
    }
    return feature_dict
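Hedged sketch of consuming the feature dict with a classification head; num_labels and the instance name model_body are assumptions for illustration:

feature_dict = model_body.body(features, tf.estimator.ModeKeys.TRAIN)
logits = tf.layers.dense(
    feature_dict['pooled'], num_labels,  # num_labels: assumed task size
    kernel_initializer=tf.truncated_normal_initializer(stddev=0.02))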
def get_model(self):
    logging.info("get bert model")
    graph = tf.Graph()
    with graph.as_default():
        # + 2 leaves room for the [CLS] and [SEP] tokens.
        ph_input_ids = tf.placeholder(dtype=tf.int32,
                                      shape=[None, self._seq_length + 2],
                                      name="ph_input_ids")
        con = BertConfig.from_json_file(config.PROJECT_ROOT +
                                        "/bert_config.json")
        bert_model = BertModel(config=con,
                               is_training=False,
                               input_ids=ph_input_ids,
                               use_one_hot_embeddings=True)
        output = bert_model.get_sequence_output()
        init = tf.global_variables_initializer()
    sess = tf.Session(graph=graph)
    # NOTE: this only runs the random initializer; restore a checkpoint on
    # top of it if pretrained weights are required.
    sess.run(init)
    return sess, ph_input_ids, output
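Usage sketch (model is an assumed instance of the enclosing class, and batch_ids an int32 array already padded to seq_length + 2):

sess, ph_input_ids, output = model.get_model()
vectors = sess.run(output, feed_dict={ph_input_ids: batch_ids})
# vectors: [batch_size, seq_length + 2, hidden_size]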
class BertEncoder(object):
    def __init__(self,
                 config,
                 is_training,
                 input_ids,
                 input_mask=None,
                 token_type_ids=None):
        self.model = BertModel(config=config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=token_type_ids)
        self.embeddings_table = self.model.get_embedding_table()

    def encode(self):
        # sequence_output shape: [batch_size, seq_length, hidden_size]
        output = self.model.get_sequence_output()
        # One mean-pooled [batch_size, hidden_size] tensor per encoder layer.
        states = ()
        for layer in self.model.get_all_encoder_layers():
            states += (tf.reduce_mean(layer, axis=1),)
        return output, states
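Sketch of wiring BertEncoder into a graph; the config path and sequence length are assumptions for illustration:

input_ids = tf.placeholder(tf.int32, [None, 128])
encoder = BertEncoder(config=BertConfig.from_json_file('bert_config.json'),
                      is_training=False,
                      input_ids=input_ids)
sequence_output, layer_means = encoder.encode()
# sequence_output: [batch, 128, hidden]; layer_means: one mean-pooled
# [batch, hidden] tensor per encoder layer.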
def main(_):
    logging.set_verbosity(logging.INFO)

    for i in range(_NUM_PARTITIONS):
        tf.io.gfile.makedirs(
            os.path.join(FLAGS.output_bert_feature_dir, '%02d' % i))

    # Create Bert model.
    bert_tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.bert_vocab_file, do_lower_case=FLAGS.do_lower_case)

    # Bert prediction.
    input_placeholder = tf.placeholder(shape=[None], dtype=tf.string)
    token_to_id_layer = token_to_id.TokenToIdLayer(FLAGS.bert_vocab_file,
                                                   unk_token_id=UNK)
    bert_config = BertConfig.from_json_file(FLAGS.bert_config_file)
    bert_model = BertModel(bert_config,
                           is_training=False,
                           input_ids=token_to_id_layer(
                               tf.expand_dims(input_placeholder, 0)))
    sequence_output = bert_model.get_sequence_output()[0]
    pooled_output = bert_model.get_pooled_output()[0]
    saver = tf.compat.v1.train.Saver()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.compat.v1.Session(config=config)
    sess.run(tf.compat.v1.tables_initializer())
    saver.restore(sess, FLAGS.bert_checkpoint_file)
    for name in sess.run(tf.compat.v1.report_uninitialized_variables()):
        logging.warning('%s is uninitialized!', name)

    def _bert_fn(sequence):
        return sess.run([sequence_output, pooled_output],
                        feed_dict={input_placeholder: sequence})

    # Load annotations.
    annots = _load_annotations(FLAGS.annotations_jsonl_file)
    logging.info('Loaded %i annotations.', len(annots))

    shard_id, num_shards = FLAGS.shard_id, FLAGS.num_shards
    assert 0 <= shard_id < num_shards

    for idx, annot in enumerate(annots):
        if (idx + 1) % 1000 == 0:
            logging.info('On example %i/%i.', idx + 1, len(annots))

        # Only process annotations belonging to this shard.
        annot_id = int(annot['annot_id'].split('-')[-1])
        if annot_id % num_shards != shard_id:
            continue

        # Skip examples whose npy file already exists.
        part_id = get_partition_id(annot['annot_id'])
        output_file = os.path.join(FLAGS.output_bert_feature_dir,
                                   '%02d' % part_id,
                                   annot['annot_id'] + '.npy')
        if os.path.isfile(output_file):
            logging.info('%s is there.', output_file)
            continue

        # Create TF example.
        bert_outputs = _create_bert_embeddings(annot, bert_tokenizer,
                                               FLAGS.do_lower_case, _bert_fn)
        with open(output_file, 'wb') as f:
            np.save(f, bert_outputs)

    logging.info('Done')
def build(self, data_iter, bert_config_file):
    # get the inputs
    with tf.variable_scope('inputs'):
        input_map = data_iter.get_next()
        usrid, prdid, input_x, input_y, doc_len = \
            (input_map['usr'], input_map['prd'], input_map['content'],
             input_map['rating'], input_map['doc_len'])
        input_x = tf.cast(input_x, tf.int32)

    self.usr = lookup(self.embeddings['usr_emb'], usrid,
                      name='cur_usr_embedding')
    self.prd = lookup(self.embeddings['prd_emb'], prdid,
                      name='cur_prd_embedding')
    input_x = tf.reshape(input_x, [-1, self.max_doc_len])
    input_mask = tf.sequence_mask(doc_len, self.max_doc_len)
    input_mask = tf.cast(input_mask, tf.int32)

    bert_config = BertConfig.from_json_file(bert_config_file)
    bert = BertModel(bert_config,
                     is_training=True,
                     input_ids=input_x,
                     input_mask=input_mask,
                     token_type_ids=None,
                     use_one_hot_embeddings=False)
    pooled_output = bert.get_pooled_output()
    sequence_output = bert.get_sequence_output()
    # Attention-pool the sequence output, then concatenate it with the
    # pooled [CLS] vector before the classifier.
    alphas = attention(sequence_output, None, self.max_doc_len,
                       self.max_doc_len)
    sequence_output = tf.matmul(alphas, sequence_output)
    sequence_output = tf.squeeze(sequence_output, axis=1)
    bert_output = tf.concat([pooled_output, sequence_output], axis=1)
    logits = tf.layers.dense(
        bert_output, self.cls_cnt,
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.02))
    self.bert_output = bert_output
    self.logits = logits

    # build the process of model
    prediction = tf.argmax(logits, 1, name='prediction')
    self.prediction = prediction

    with tf.variable_scope("loss"):
        log_probs = tf.nn.log_softmax(logits)
        self.probs = tf.nn.softmax(logits)
        loss = -tf.reduce_sum(
            tf.one_hot(input_y, self.cls_cnt, dtype=tf.float32) * log_probs,
            axis=-1)
        self.loss = tf.reduce_mean(loss)
        self.total_loss = tf.reduce_sum(loss)

    with tf.variable_scope("metrics"):
        correct_prediction = tf.equal(prediction, input_y)
        self.correct = correct_prediction
        # Sum of squared errors over the batch; divide by batch size for MSE.
        mse = tf.reduce_sum(tf.square(prediction - input_y), name="mse")
        correct_num = tf.reduce_sum(tf.cast(correct_prediction,
                                            dtype=tf.int32),
                                    name="correct_num")
        # Count of correct predictions; divide by batch size for accuracy.
        accuracy = tf.reduce_sum(tf.cast(correct_prediction, "float"),
                                 name="accuracy")

    return self.total_loss, mse, correct_num, accuracy
class ChatModel:
    def __init__(self, chatmodel_config):
        self.chatmodel_config = chatmodel_config
        self.max_x_len = chatmodel_config.max_x_len
        self.max_y_len = chatmodel_config.max_y_len
        self.decode_max_len = chatmodel_config.max_decode_len
        self.vocab = chatmodel_config.vocab
        self.config_file = chatmodel_config.config_file
        self.ckpt_file = chatmodel_config.ckpt_file
        self.beam_width = chatmodel_config.beam_width
        self.dropout_rate = chatmodel_config.dropout_rate
        self.coverage_penalty_weight = chatmodel_config.coverage_penalty_weight
        self.length_penalty_weight = chatmodel_config.length_penalty_weight

        self.x = tf.placeholder(tf.int32, shape=[None, self.max_x_len],
                                name='x')
        self.x_mask = tf.placeholder(tf.int32, shape=[None, self.max_x_len],
                                     name='x_mask')
        self.x_seg = tf.placeholder(tf.int32, shape=[None, self.max_x_len],
                                    name='x_seg')
        self.x_len = tf.placeholder(tf.int32, shape=[None], name='x_len')
        self.y = tf.placeholder(tf.int32, shape=[None, self.max_y_len],
                                name='y')
        self.y_len = tf.placeholder(tf.int32, shape=[None], name='y_len')

    def create_model(self):
        self.bert_config = BertConfig.from_json_file(self.config_file)
        self.vocab_size = self.bert_config.vocab_size
        self.hidden_size = self.bert_config.hidden_size
        self.bert_model = BertModel(config=self.bert_config,
                                    input_ids=self.x,
                                    input_mask=self.x_mask,
                                    token_type_ids=self.x_seg,
                                    is_training=True,
                                    use_one_hot_embeddings=False)
        if self.ckpt_file is not None:
            tvars = tf.trainable_variables()
            self.assignment_map, self.initialized_variable_map = \
                modeling.get_assignment_map_from_checkpoint(
                    tvars, self.ckpt_file)

        X = self.bert_model.get_sequence_output()
        self.embeddings = self.bert_model.get_embedding_table()
        # Use the [CLS] vector as the initial decoder state and the remaining
        # positions as the attention memory.
        encoder_output = X[:, 1:, :]
        encoder_state = X[:, 0, :]

        batch_size = tf.shape(self.x)[0]
        start_token = tf.ones([batch_size], dtype=tf.int32) * self.vocab['<S>']
        train_output = tf.concat([tf.expand_dims(start_token, 1), self.y], 1)
        output_emb = tf.nn.embedding_lookup(self.embeddings, train_output)
        output_len = self.y_len
        train_helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
            output_emb, output_len, self.embeddings, 0.1)
        input_len = self.x_len - 2  # drop [CLS] and [SEP]
        cell = tf.contrib.rnn.GRUCell(num_units=self.hidden_size)

        def decode(scope):
            # Training branch: scheduled-sampling decoder over gold targets.
            with tf.variable_scope(scope):
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    num_units=self.hidden_size,
                    memory=encoder_output,
                    memory_sequence_length=input_len)
                attention_cell = tf.contrib.seq2seq.AttentionWrapper(
                    cell=cell,
                    attention_mechanism=attention_mechanism,
                    attention_layer_size=self.hidden_size)
                out_cell = MyOutputProjectionWrapper(attention_cell,
                                                     self.vocab_size,
                                                     self.embeddings,
                                                     reuse=False)
                initial_state = out_cell.zero_state(dtype=tf.float32,
                                                    batch_size=batch_size)
                initial_state = initial_state.clone(cell_state=encoder_state)
                decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell=out_cell,
                    helper=train_helper,
                    initial_state=initial_state)
                t_final_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder=decoder,
                    output_time_major=False,
                    impute_finished=True,
                    maximum_iterations=self.decode_max_len)

            # Inference branch: beam search over the same (reused) weights.
            with tf.variable_scope(scope, reuse=True):
                tiled_encoder_output = tf.contrib.seq2seq.tile_batch(
                    encoder_output, multiplier=self.beam_width)
                tiled_encoder_state = tf.contrib.seq2seq.tile_batch(
                    encoder_state, multiplier=self.beam_width)
                tiled_input_len = tf.contrib.seq2seq.tile_batch(
                    input_len, multiplier=self.beam_width)
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    num_units=self.hidden_size,
                    memory=tiled_encoder_output,
                    memory_sequence_length=tiled_input_len)
                attention_cell = tf.contrib.seq2seq.AttentionWrapper(
                    cell=cell,
                    attention_mechanism=attention_mechanism,
                    attention_layer_size=self.hidden_size)
                out_cell = MyOutputProjectionWrapper(attention_cell,
                                                     self.vocab_size,
                                                     self.embeddings,
                                                     reuse=True)
                initial_state = out_cell.zero_state(
                    dtype=tf.float32, batch_size=batch_size * self.beam_width)
                initial_state = initial_state.clone(
                    cell_state=tiled_encoder_state)
                self.end_token = self.vocab['<T>']
                beam_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=out_cell,
                    embedding=self.embeddings,
                    start_tokens=start_token,
                    end_token=self.end_token,
                    initial_state=initial_state,
                    beam_width=self.beam_width,
                    coverage_penalty_weight=self.coverage_penalty_weight,
                    length_penalty_weight=self.length_penalty_weight)
                p_final_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder=beam_decoder,
                    output_time_major=False,
                    maximum_iterations=self.decode_max_len)
            return t_final_output, p_final_output

        t_output, p_output = decode('decode')
        p_output = tf.identity(p_output.predicted_ids[:, :, 0],
                               name='predictions')
        return t_output, p_output

    def loss(self):
        t_output, p_output = self.create_model()
        decode_len = tf.shape(t_output.sample_id)[-1]
        y_target = self.y[:, :decode_len]
        mask_len = tf.maximum(decode_len, self.y_len)
        y_mask = tf.sequence_mask(mask_len, self.max_y_len, dtype=tf.float32)
        y_mask = y_mask[:, :decode_len]
        loss = tf.contrib.seq2seq.sequence_loss(t_output.rnn_output, y_target,
                                                weights=y_mask)
        # Edit distance between beam predictions and references, as an
        # auxiliary evaluation signal.
        p_output_sparse = self._convert_tensor_to_sparse(p_output,
                                                         self.end_token)
        y_output_sparse = self._convert_tensor_to_sparse(self.y,
                                                         self.end_token)
        distance = tf.reduce_sum(
            tf.edit_distance(p_output_sparse, y_output_sparse,
                             normalize=False))
        return loss, distance, p_output, t_output.sample_id

    def _convert_tensor_to_sparse(self, a, end_token):
        # Drop padding (0) and end tokens before building the SparseTensor.
        indices = tf.where(tf.not_equal(a, 0) & tf.not_equal(a, end_token))
        values = tf.gather_nd(a, indices)
        sparse_a = tf.SparseTensor(indices, values,
                                   tf.shape(a, out_type=tf.int64))
        return sparse_a
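Hedged usage sketch for ChatModel: building a training objective from loss(). The optimizer and learning rate are assumptions, and checkpoint initialization from the stored assignment_map is left to the caller, as in create_model above:

model = ChatModel(chatmodel_config)
loss, distance, predictions, sample_ids = model.loss()
train_op = tf.train.AdamOptimizer(1e-5).minimize(loss)  # lr is an assumption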