def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Compute the masked-LM loss and log-probabilities.

    The rows selected from `input_tensor` by `gather_indexes(..., positions)`
    go through a dense + layer-norm transform, are cast to float32, and are
    then multiplied by the transposed `output_weights` with an output-only
    bias added.

    Args:
      bert_config: config providing `hidden_size`, `hidden_act`,
        `initializer_range` and `vocab_size`.
      input_tensor: encoder output handed to `gather_indexes`.
      output_weights: projection matrix, used transposed.
      positions: indices of the masked tokens.
      label_ids: target token ids (flattened internally).
      label_weights: per-prediction weights (flattened internally).

    Returns:
      Tuple `(loss, per_example_loss, log_probs)`.
    """
    vocab_size = bert_config.vocab_size
    masked_states = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # One more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            transformed = tf.layers.dense(
                masked_states,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            transformed = modeling.layer_norm(transformed)

        # The output weights are the same as the input embeddings, but there
        # is an output-only bias for each token.
        vocab_bias = tf.get_variable(
            "output_bias",
            shape=[vocab_size],
            initializer=tf.zeros_initializer())

        # The projection is carried out on a float32 copy of the activations.
        transformed_fp32 = tf.cast(transformed, tf.float32)
        scores = tf.matmul(transformed_fp32, output_weights, transpose_b=True)
        scores = tf.nn.bias_add(scores, vocab_bias)
        log_probs = tf.nn.log_softmax(scores, axis=-1)

        flat_ids = tf.reshape(label_ids, [-1])
        flat_weights = tf.reshape(label_weights, [-1])
        targets = tf.one_hot(flat_ids, depth=vocab_size, dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). `label_weights`
        # is 1.0 for every real prediction and 0.0 for padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * targets, axis=[-1])
        weighted_total = tf.reduce_sum(flat_weights * per_example_loss)
        weight_total = tf.reduce_sum(flat_weights) + 1e-5
        loss = weighted_total / weight_total

    return (loss, per_example_loss, log_probs)
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Return the weighted masked-LM loss plus its per-example pieces.

    Args:
      bert_config: config with `hidden_size`, `hidden_act`,
        `initializer_range` and `vocab_size`.
      input_tensor: encoder output; the rows at `positions` are selected with
        `gather_indexes`.
      output_weights: matrix multiplied (transposed) with the transformed
        activations to obtain vocabulary logits.
      positions: indices of the masked tokens.
      label_ids: target token ids.
      label_weights: per-prediction weights.

    Returns:
      `(loss, per_example_loss, log_probs)`, where `loss` is the
      weight-normalised sum of `per_example_loss`.
    """
    gathered = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        with tf.variable_scope("transform"):
            # Extra non-linear transformation ahead of the output layer; this
            # matrix is not used after pre-training.
            activation_fn = modeling.get_activation(bert_config.hidden_act)
            kernel_init = modeling.create_initializer(
                bert_config.initializer_range)
            gathered = tf.layers.dense(
                gathered,
                units=bert_config.hidden_size,
                activation=activation_fn,
                kernel_initializer=kernel_init)
            gathered = modeling.layer_norm(gathered)

        # Output weights are shared with the input embeddings; only the bias
        # is specific to the output layer.
        bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.nn.bias_add(
            tf.matmul(gathered, output_weights, transpose_b=True), bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        ids_1d = tf.reshape(label_ids, [-1])
        weights_1d = tf.reshape(label_weights, [-1])
        one_hot = tf.one_hot(
            ids_1d, depth=bert_config.vocab_size, dtype=tf.float32)

        # `positions` may be zero-padded when a sequence has fewer than the
        # maximum number of predictions; `label_weights` is 1.0 for real
        # predictions and 0.0 for the padding ones.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot, axis=[-1])
        loss = (tf.reduce_sum(weights_1d * per_example_loss) /
                (tf.reduce_sum(weights_1d) + 1e-5))

    return (loss, per_example_loss, log_probs)
def get_classification_loss(model_config, pool_output, class_label, n_class):
    """Softmax cross-entropy head with `n_class` outputs on `pool_output`.

    Variables are created under the `cls/seq_relationship` scope.

    Args:
      model_config: config providing `hidden_size` and `initializer_range`.
      pool_output: features fed to the linear classifier.
      class_label: integer class labels (flattened internally).
      n_class: number of classes.

    Returns:
      `(loss, per_example_loss, log_probs)`; `loss` is the mean of
      `per_example_loss`.
    """
    with tf.variable_scope("cls/seq_relationship"):
        kernel = tf.get_variable(
            "output_weights",
            shape=[n_class, model_config.hidden_size],
            initializer=modeling.create_initializer(
                model_config.initializer_range))
        bias = tf.get_variable(
            "output_bias",
            shape=[n_class],
            initializer=tf.zeros_initializer())

        scores = tf.nn.bias_add(
            tf.matmul(pool_output, kernel, transpose_b=True), bias)
        log_probs = tf.nn.log_softmax(scores, axis=-1)

        flat_labels = tf.reshape(class_label, [-1])
        targets = tf.one_hot(flat_labels, depth=n_class, dtype=tf.float32)

        per_example_loss = -tf.reduce_sum(targets * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

    return loss, per_example_loss, log_probs
def __call__(self, x, y, sequence_length):
    """Build the output layer, two masked squared-error metrics and the loss.

    Results are stored on the instance (`logits`, `targets`, `preds`,
    `accuracy`, `accuracy2`, `loss`); nothing is returned.

    Args:
      x: input features, reshaped to `(-1, self.hidden_dim)`.
      y: targets, converted to float.
      sequence_length: converted to float and used as the weighting for
        `accuracy2`; also forwarded to `self.loss_layer`.
    """
    flat_inputs = tf.reshape(x, (-1, self.hidden_dim))
    self.logits = tf.layers.dense(
        flat_inputs,
        self.output_shape,
        activation=self.activation,
        kernel_initializer=modeling.create_initializer(
            self.initializer_range))
    self.targets = tf.to_float(y)
    self.preds = tf.reshape(self.logits, [-1, self.max_length])

    # Despite the attribute name, this is a mean squared error restricted to
    # the entries whose target is non-zero.
    nonzero_mask = tf.to_float(tf.not_equal(self.targets, 0))
    squared_error = tf.to_float(
        tf.square(tf.subtract(self.preds, self.targets)))
    masked_error_sum = tf.reduce_sum(squared_error * nonzero_mask)
    self.accuracy = masked_error_sum / (tf.reduce_sum(nonzero_mask))

    # Same quantity, weighted by `sequence_length` instead of the mask.
    length_weights = tf.to_float(sequence_length)
    squared_error_v2 = tf.to_float(
        tf.square(tf.subtract(self.preds, self.targets)))
    weighted_error_sum = tf.reduce_sum(squared_error_v2 * length_weights)
    self.accuracy2 = weighted_error_sum / (tf.reduce_sum(length_weights))

    self.loss = self.loss_layer(self.logits, self.targets, sequence_length)
    return
def get_mlm_output(input_tensor, albert_config, mlm_positions, output_weights,
                   label_ids, label_weights):
    """Compute the masked-LM loss (adapted from run_pretraining.py).

    The rows of `input_tensor` selected by `gather_indexes(..., mlm_positions)`
    are transformed (dense + layer norm), projected with the transposed
    `output_weights` plus an output-only bias, and scored against `label_ids`
    with a weighted cross-entropy.

    Args:
      input_tensor: encoder output handed to `gather_indexes`.
      albert_config: config providing `embedding_size`, `hidden_act`,
        `initializer_range` and `vocab_size`.
      mlm_positions: indices of the masked tokens.
      output_weights: projection matrix, used transposed.
      label_ids: target token ids (flattened internally).
      label_weights: per-prediction weights (flattened internally).

    Returns:
      `(loss, per_example_loss)`, where `loss` is the sum of the weighted
      per-example losses divided by the sum of the weights (plus 1e-5).
    """
    input_tensor = gather_indexes(input_tensor, mlm_positions)
    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=albert_config.embedding_size,
                activation=modeling.get_activation(albert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    albert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there
        # is an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[albert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        # Flatten to rank 1 so the product with `per_example_loss` is
        # element-wise rather than relying on implicit broadcasting of a
        # [1, N] tensor; both reductions below yield the same values.
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(
            label_ids, depth=albert_config.vocab_size, dtype=tf.float32)

        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        # The epsilon keeps the division finite when all weights are zero.
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return loss, per_example_loss
def get_next_sentence_output(bert_config, input_tensor, labels):
    """Binary next-sentence head: loss, per-example loss and log-probs.

    Label 0 means "next sentence" and label 1 means "random sentence". The
    weight matrix created here is not used after pre-training.

    Args:
      bert_config: config providing `hidden_size` and `initializer_range`.
      input_tensor: features fed to the two-way linear classifier.
      labels: integer labels (flattened internally).

    Returns:
      Tuple `(loss, per_example_loss, log_probs)`.
    """
    num_outputs = 2
    with tf.variable_scope("cls/seq_relationship"):
        weight_init = modeling.create_initializer(bert_config.initializer_range)
        nsp_weights = tf.get_variable(
            "output_weights",
            shape=[num_outputs, bert_config.hidden_size],
            initializer=weight_init)
        nsp_bias = tf.get_variable(
            "output_bias",
            shape=[num_outputs],
            initializer=tf.zeros_initializer())

        scores = tf.matmul(input_tensor, nsp_weights, transpose_b=True)
        scores = tf.nn.bias_add(scores, nsp_bias)
        log_probs = tf.nn.log_softmax(scores, axis=-1)

        flat_labels = tf.reshape(labels, [-1])
        targets = tf.one_hot(flat_labels, depth=num_outputs, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(targets * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, log_probs)
def get_next_sentence_output(bert_config, input_tensor, labels, num_classes):
    """Sentence-level classification head with `num_classes` outputs.

    Variables live under the `cls/seq_relationship` scope.

    Args:
      bert_config: config providing `hidden_size` and `initializer_range`.
      input_tensor: features fed to the linear classifier.
      labels: integer class labels (flattened internally).
      num_classes: size of the output layer and depth of the one-hot targets.

    Returns:
      Tuple `(loss, per_example_loss, log_probs)`; `loss` is the mean of
      `per_example_loss`.
    """
    with tf.variable_scope('cls/seq_relationship'):
        class_weights = tf.get_variable(
            'output_weights',
            shape=[num_classes, bert_config.hidden_size],
            initializer=modeling.create_initializer(
                bert_config.initializer_range))
        class_bias = tf.get_variable(
            'output_bias',
            shape=[num_classes],
            initializer=tf.zeros_initializer())

        projected = tf.matmul(input_tensor, class_weights, transpose_b=True)
        log_probs = tf.nn.log_softmax(
            tf.nn.bias_add(projected, class_bias), axis=-1)

        one_hot = tf.one_hot(
            tf.reshape(labels, [-1]), depth=num_classes, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, log_probs)
def __init__(self, config, tf_dtype, input_hidden, embedding_table):
    """Build the token-prediction head on top of `input_hidden`.

    Sets `self.transformed_output` (dense + layer norm), `self.final_output`
    (product with the transposed `embedding_table` plus a bias) and
    `self.probs` (softmax of the final output, op name 'token_probs').

    Args:
      config: config providing `hidden_size`, `hidden_act`,
        `initializer_range` and `vocab_size`.
      tf_dtype: dtype of the output bias variable.
      input_hidden: hidden states fed to the transform layer.
      embedding_table: matrix whose transpose projects onto the vocabulary.
    """
    # Scope names are kept the same as BERT's.
    with tf.variable_scope("cls"), tf.variable_scope("predictions"):
        with tf.variable_scope("transform"):
            dense_out = tf.layers.dense(
                input_hidden,
                config.hidden_size,
                activation=modeling.get_activation(config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    config.initializer_range))
            self.transformed_output = dense_out
            self.transformed_output = modeling.layer_norm(
                self.transformed_output)

        bias_init = tf.zeros([config.vocab_size], dtype=tf_dtype)
        vocab_bias = tf.Variable(bias_init, name="output_bias", dtype=tf_dtype)

        vocab_scores = tf.matmul(
            self.transformed_output, tf.transpose(embedding_table))
        self.final_output = tf.add(vocab_scores, vocab_bias)
        self.probs = tf.nn.softmax(self.final_output, name='token_probs')
def get_mlm_logits(input_tensor, albert_config, mlm_positions, output_weights):
    """Return masked-LM vocabulary logits (adapted from run_pretraining.py).

    Args:
      input_tensor: encoder output; the rows at `mlm_positions` are selected
        with `gather_indexes`.
      albert_config: config providing `embedding_size`, `hidden_act`,
        `initializer_range` and `vocab_size`.
      mlm_positions: indices of the masked tokens.
      output_weights: projection matrix, used transposed.

    Returns:
      The logits after the output-only bias has been added.
    """
    selected = gather_indexes(input_tensor, mlm_positions)

    with tf.variable_scope("cls/predictions"):
        # Extra non-linear transformation before the output layer; this
        # matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            selected = modeling.layer_norm(
                tf.layers.dense(
                    selected,
                    units=albert_config.embedding_size,
                    activation=modeling.get_activation(
                        albert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        albert_config.initializer_range)))

        # The output weights are the same as the input embeddings, but there
        # is an output-only bias for each token.
        vocab_bias = tf.get_variable(
            "output_bias",
            shape=[albert_config.vocab_size],
            initializer=tf.zeros_initializer())
        unbiased = tf.matmul(selected, output_weights, transpose_b=True)
        return tf.nn.bias_add(unbiased, vocab_bias)
def get_shuffle_loss(model_config, seq_output, label_ids, label_weights):
    """Weighted cross-entropy over `seq_length` classes for every token.

    `seq_output` must be rank 3; it is flattened to `[-1, width]`, passed
    through a dense layer with `seq_length` units plus layer norm, biased,
    and scored against `label_ids`.

    Args:
      model_config: config providing `hidden_act` and `initializer_range`.
      seq_output: rank-3 sequence output.
      label_ids: target indices (flattened internally).
      label_weights: per-token weights, cast to float32 and flattened.

    Returns:
      Tuple `(loss, per_example_loss, log_probs)`.
    """
    dims = modeling.get_shape_list(seq_output, expected_rank=[3])
    num_slots, feature_dim = dims[1], dims[2]
    flat_output = tf.reshape(seq_output, [-1, feature_dim])

    with tf.variable_scope("cls/shuffle"):
        with tf.variable_scope("transform"):
            flat_output = tf.layers.dense(
                flat_output,
                units=num_slots,
                activation=modeling.get_activation(model_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    model_config.initializer_range))
            flat_output = modeling.layer_norm(flat_output)

        slot_bias = tf.get_variable(
            "output_bias",
            shape=[num_slots],
            initializer=tf.zeros_initializer())
        log_probs = tf.nn.log_softmax(
            tf.nn.bias_add(flat_output, slot_bias), axis=-1)

        flat_ids = tf.reshape(label_ids, [-1])
        flat_weights = tf.reshape(tf.cast(label_weights, tf.float32), [-1])
        targets = tf.one_hot(flat_ids, depth=num_slots, dtype=tf.float32)

        per_example_loss = -tf.reduce_sum(log_probs * targets, axis=[-1])
        weighted_sum = tf.reduce_sum(flat_weights * per_example_loss)
        weight_sum = tf.reduce_sum(flat_weights) + 1e-5
        loss = weighted_sum / weight_sum

    return loss, per_example_loss, log_probs
def init(max_sequence_length, bert_config_file, model_path, vocab_file):
    """Create a session holding a restored BERT next-sentence classifier.

    Builds int32 placeholders named 'input_ids', 'input_mask' and
    'segment_ids' (each `[None, max_sequence_length]`), a `BertModel` in
    inference mode, and a two-way softmax whose op is named 'probs'. All
    variables are then initialised and restored from `model_path`.

    Args:
      max_sequence_length: fixed second dimension of the placeholders.
      bert_config_file: JSON file read by `BertConfig.from_json_file`.
      model_path: checkpoint handed to `Saver.restore`.
      vocab_file: vocabulary file for the lower-casing `FullTokenizer`.

    Returns:
      `(sess, tokenizer)`.
    """
    sess = tf.Session()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=True)
    bert_config = modeling.BertConfig.from_json_file(bert_config_file)

    def _int_placeholder(name):
        return tf.placeholder(
            tf.int32, shape=[None, max_sequence_length], name=name)

    input_ids = _int_placeholder('input_ids')
    input_mask = _int_placeholder('input_mask')
    segment_ids = _int_placeholder('segment_ids')

    with sess.as_default():
        model = modeling.BertModel(
            config=bert_config,
            is_training=False,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=False)
        pooled = model.get_pooled_output()

        with tf.variable_scope("cls/seq_relationship"):
            nsp_weights = tf.get_variable(
                "output_weights",
                shape=[2, bert_config.hidden_size],
                initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            nsp_bias = tf.get_variable(
                "output_bias", shape=[2], initializer=tf.zeros_initializer())
            scores = tf.nn.bias_add(
                tf.matmul(pooled, nsp_weights, transpose_b=True), nsp_bias)
            # The result is not returned; the op carries the explicit name
            # 'probs' (presumably so callers can fetch it by name - confirm).
            tf.nn.softmax(scores, axis=-1, name='probs')

        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, model_path)

    return sess, tokenizer
def build_model(self):
    """Blend the DCMN matching vector with `self.mark0` through a gate.

    `self.DCMN` is applied to the two sentence tensors and their masks. A
    768-unit sigmoid gate is computed from the concatenation of that result
    with `self.mark0`, and the output is the gated interpolation
    `mark0 * gate + (1 - gate) * d_vec`.

    Returns:
      The gated combination tensor.
    """
    from modeling import create_attention_mask_from_input_mask

    # NOTE(review): these masks were only consumed by per-sentence
    # transformer layers that are currently disabled. The calls are kept so
    # graph construction is unchanged; confirm whether they can be dropped.
    attention1_mask = create_attention_mask_from_input_mask(
        self.sent1, self.sent1_mask)
    attention2_mask = create_attention_mask_from_input_mask(
        self.sent2, self.sent2_mask)

    sent1 = self.sent1
    sent2 = self.sent2

    d_vec = self.DCMN(sent1, sent2, self.sent1_mask, self.sent2_mask)

    # NOTE(review): `create_initializer` is not imported in this function;
    # it is expected to be available at module level - confirm.
    gate = tf.layers.dense(
        tf.concat([d_vec, self.mark0], axis=1),
        768,
        activation=tf.sigmoid,
        kernel_initializer=create_initializer(0.02))

    # gate -> 1 keeps `mark0`; gate -> 0 keeps the DCMN vector.
    refer_output = self.mark0 * gate + (1 - gate) * d_vec
    return refer_output
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions):
    """Return masked-LM log-probabilities for the tokens at `positions`.

    Args:
      bert_config: config providing `hidden_size`, `hidden_act`,
        `initializer_range` and `vocab_size`.
      input_tensor: encoder output handed to `gather_indexes`.
      output_weights: projection matrix, used transposed.
      positions: indices of the masked tokens.

    Returns:
      The log-softmax (over the last axis) of the biased vocabulary logits.
    """
    picked = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # One more non-linear transformation before the output layer; this
        # matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            picked = tf.layers.dense(
                picked,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            picked = modeling.layer_norm(picked)

        token_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        vocab_logits = tf.nn.bias_add(
            tf.matmul(picked, output_weights, transpose_b=True), token_bias)
        return tf.nn.log_softmax(vocab_logits, axis=-1)
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids):
    """Return the unreduced masked-LM loss, one row per example.

    The per-prediction cross-entropy is reshaped to
    `[-1, tf.shape(positions)[1]]`; no weighting or averaging is applied.

    Args:
      bert_config: config providing `hidden_size`, `hidden_act`,
        `initializer_range` and `vocab_size`.
      input_tensor: encoder output handed to `gather_indexes`.
      output_weights: projection matrix, used transposed.
      positions: indices of the masked tokens; its second dimension sets the
        width of the returned tensor.
      label_ids: target token ids (flattened internally).

    Returns:
      The reshaped per-prediction loss tensor.
    """
    vocab_size = bert_config.vocab_size
    hidden = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # Extra non-linear transformation before the output layer; this
        # matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            hidden = tf.layers.dense(
                hidden,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            hidden = modeling.layer_norm(hidden)

        # The output weights are the same as the input embeddings, but there
        # is an output-only bias for each token.
        vocab_bias = tf.get_variable(
            "output_bias",
            shape=[vocab_size],
            initializer=tf.zeros_initializer())
        vocab_logits = tf.matmul(hidden, output_weights, transpose_b=True)
        vocab_logits = tf.nn.bias_add(vocab_logits, vocab_bias)
        log_probs = tf.nn.log_softmax(vocab_logits, axis=-1)

        targets = tf.one_hot(
            tf.reshape(label_ids, [-1]), depth=vocab_size, dtype=tf.float32)
        flat_loss = -tf.reduce_sum(log_probs * targets, axis=[-1])

        predictions_per_example = tf.shape(positions)[1]
        loss = tf.reshape(flat_loss, [-1, predictions_per_example])
        # TODO: dynamic gather from per_example_loss

    return loss
def get_sentence_direction_output(bert_config, input_tensor, labels):
    """Three-way sentence-direction head: loss, per-example loss, log-probs.

    Label convention: backward = 0, forward = 1, unrelated = 2.

    Args:
      bert_config: config providing `hidden_size` and `initializer_range`.
      input_tensor: features fed to the three-way linear classifier.
      labels: integer direction labels (flattened internally).

    Returns:
      Tuple `(loss, per_example_loss, log_probs)`.
    """
    num_directions = 3
    with tf.variable_scope("cls/seq_direction2"):
        direction_weights = tf.get_variable(
            "output_weights2",
            shape=[num_directions, bert_config.hidden_size],
            initializer=modeling.create_initializer(
                bert_config.initializer_range))
        direction_bias = tf.get_variable(
            "output_bias2",
            shape=[num_directions],
            initializer=tf.zeros_initializer())

        scores = tf.nn.bias_add(
            tf.matmul(input_tensor, direction_weights, transpose_b=True),
            direction_bias)
        log_probs = tf.nn.log_softmax(scores, axis=-1)

        flat_labels = tf.reshape(labels, [-1])
        targets = tf.one_hot(
            flat_labels, depth=num_directions, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(targets * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, log_probs)
def greedy_decode_8steps(
        self,
        cls_vector,  # batch, 1, hid_size
        sequence_output):  # batch, seq_len, hid_size
    """Greedily decode one theorem and seven premises with a static unroll.

    Step 0 feeds `cls_vector` to the cached decoder and predicts the theorem
    with a dense layer; the theorem's embedding becomes the next decoder
    input. Steps 1-7 each predict a premise as the argmax of the dot product
    between the decoder output and `sequence_output`, and feed the vector
    selected by `premise_gather_nd` back in.

    Args:
      cls_vector: first decoder input; [batch, 1, hid_size] per the inline
        annotation.
      sequence_output: encoder states scored against for premise prediction;
        [batch, seq_len, hid_size] per the inline annotation.

    Returns:
      `(logits, losses)`: `logits` is a dict with the argmax `theorem` and
      the concatenated `premises` indices; `losses` is a dict whose
      `training` entry is a constant 0.0.
    """
    # NOTE(review): block nesting was reconstructed from a flattened source;
    # confirm that the 'prediction' and 'decoder/embeddings' scopes are
    # siblings of (not nested inside) the 'decoder' scope.
    hparams = self.hparams
    # When features into self.body() doesn't have 'targets' and 'theorem'
    # then we are in predict/infer mode. Since there is only a small
    # number of unrolling steps for the output, (1 for predicting theorem
    # and another 7 for the theorem premise), we build a static graph
    # to do greedy decode.

    # Here we cache the activations during decoding.
    # for each layer of the decoding transformer, we store
    # a tensor of size [batch, current_length, hidden_dim]
    # at first current_length = 0:
    cached_layers = [
        tf.zeros_like(cls_vector[:, :0, :])  # [batch, 0, hid_size]
        for _ in range(hparams.num_decode_layers)
    ]

    # We also store all the premise prediction into a tensor
    # of shape [batch, current_length]
    premises = tf.zeros_like(
        cls_vector[:, :0, 0],  # [batch, 0]
        dtype=tf.int32)

    # The first token to be processed is the CLS vector.
    decoder_input = cls_vector

    # Now we build the static unrolling of 8-step decoding,
    # each step update a new value for decoder_input
    for count in range(8):
        current_lengths = [
            layer.shape.as_list()[1] for layer in cached_layers
        ]
        # All layer caches must have the same length. Indexing [0] below
        # also requires at least one decoder layer.
        assert current_lengths[1:] == current_lengths[:-1]
        current_length = current_lengths[0]
        with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
            # cached_layers will be updated inside this method.
            # Feed this single token into the decoder transformer.
            output_vector = self.one_column_cached_transformer(
                decoder_input,  # batch, 1, hid_size
                # list of num_hid_layers tensors, each of shape
                # [batch, current_length, hidden_size]
                cached_layers)  # [batch, 1, hid_size]

        # After this step, all tensors in cached_layers
        # increased 1 in length:
        assert cached_layers[0].shape.as_list()[1] == current_length + 1

        # Next the output vector is used to predict theorem
        # if we are at step 0, otherwise predict premise.
        with tf.variable_scope('prediction', reuse=tf.AUTO_REUSE):
            if count == 0:
                theorem_logits = tf.keras.layers.Dense(  # [batch, 1, num_theorems]
                    name='theorem',
                    units=hparams.num_theorems,
                    use_bias=True,
                    kernel_initializer=modeling.create_initializer(
                        hparams.initializer_range))(output_vector)
                theorem = tf.argmax(  # [batch, 1]
                    theorem_logits,  # [batch, 1, num_theorems]
                    axis=-1,
                    output_type=tf.int32)
            else:
                premise_logits = tf.matmul(  # batch, 1, seq_len
                    a=output_vector,  # [batch, 1, hid_size]
                    b=sequence_output,  # [batch, sequence_len, hid_size]
                    transpose_b=True,
                )  # [batch, 1, sequence_len]
                premise = tf.argmax(  # [batch, 1]
                    premise_logits,  # [batch, 1, seq_len]
                    axis=-1,
                    output_type=tf.int32)

                # [batch, current_len + 1]
                premises = tf.concat([premises, premise], axis=1)

                # [batch, 1, hid_size]
                decoder_input = premise_gather_nd(sequence_output, premise)
                # Premise steps skip the theorem-embedding lookup below.
                continue

        # For theorem prediction, we need to go back to variable scope
        # decoder/embedding to get the new decoder_input
        with tf.variable_scope('decoder/embeddings', reuse=tf.AUTO_REUSE):
            # [batch, 1, hid_size] and [num_theorems, hid_size]
            # from the theorem_embedding lookup table.
            decoder_input, _ = modeling.embedding_lookup(
                input_ids=theorem,  # [batch, 1]
                vocab_size=hparams.num_theorems,
                embedding_size=hparams.hidden_size,
                initializer_range=hparams.initializer_range,
                word_embedding_name='theorem_embedding',
            )

    logits = dict(
        theorem=theorem,  # [batch, 1]
        premises=premises)  # [batch, 7]
    # No training loss is computed on this inference-only path.
    losses = dict(training=tf.constant(0.0))
    return logits, losses
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM.

    Shapes below follow the original author's annotations:
      input_tensor:   [batch_size, seq_length, hidden_size]
      positions:      [batch_size, mask_num]
      output_weights: [vocab_size, embedding_size]

    Returns:
      `(loss, per_example_loss, log_probs)`, where `loss` is the sum of the
      weighted per-example losses divided by the sum of the weights.
    """
    # Annotated as [batch_size * mask_num, hidden_size] after the gather.
    masked_hidden = gather_indexes(input_tensor, positions)
    vocab_size = bert_config.vocab_size

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output
        # layer. This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            # Add a non-linear transformation before the output; it only
            # plays a role during pre-training.
            masked_hidden = tf.layers.dense(
                masked_hidden,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            masked_hidden = modeling.layer_norm(masked_hidden)
            tf.logging.info("input tensor shape after transform:{}".format(
                masked_hidden.shape))

        # The output weights are the same as the input embeddings, but there
        # is an output-only bias for each token.
        # vocab_bias: [vocab_size]
        vocab_bias = tf.get_variable(
            "output_bias",
            shape=[vocab_size],
            initializer=tf.zeros_initializer())

        # [batch_size * mask_num, hidden_size] x [vocab_size, hidden_size]^T
        # -> [batch_size * mask_num, vocab_size]
        vocab_logits = tf.matmul(
            masked_hidden, output_weights, transpose_b=True)
        vocab_logits = tf.nn.bias_add(vocab_logits, vocab_bias)
        log_probs = tf.nn.log_softmax(vocab_logits, axis=-1)

        # Both are flattened to rank 1: [batch_size * mask_num].
        flat_ids = tf.reshape(label_ids, [-1])
        flat_weights = tf.reshape(label_weights, [-1])

        # one_hot: [batch_size * mask_num, vocab_size]
        one_hot = tf.one_hot(flat_ids, depth=vocab_size, dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). `label_weights`
        # has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        # Cross-entropy per prediction: [batch_size * mask_num]
        per_example_loss = -tf.reduce_sum(log_probs * one_hot, axis=[-1])

        # Multiply by the sample weights, then normalise by their sum.
        weighted_loss_sum = tf.reduce_sum(flat_weights * per_example_loss)
        weight_sum = tf.reduce_sum(flat_weights) + 1e-5
        loss = weighted_loss_sum / weight_sum

    return (loss, per_example_loss, log_probs)
def get_masked_span_output(bert_config, input_tensor, input_mask, positions,
                           start_labels, end_labels, label_weights):
    """Get loss and per-example loss for the recurring span masking.

    For every masked position a query vector is gathered and scored against
    all sequence positions twice, once for the span start and once for the
    span end, via a bilinear form `query . classifier . key^T`. Positions
    where `input_mask` is 0 receive an additive -10000 on their logits.

    Args:
      bert_config: config providing `hidden_size`, `hidden_act` and
        `initializer_range`.
      input_tensor: rank-3 encoder output.
      input_mask: mask over sequence positions.
      positions: rank-2 indices of the masked positions.
      start_labels: target start indices (flattened internally).
      end_labels: target end indices (flattened internally).
      label_weights: per-position weights (flattened internally).

    Returns:
      `(loss, per_example_loss)`; `per_example_loss` is the mean of the
      start and end cross-entropies, and `loss` is its weight-normalised sum.
    """
    shape = modeling.get_shape_list(input_tensor, expected_rank=3)
    batch_size, seq_length, width = shape[0], shape[1], shape[2]
    num_positions = modeling.get_shape_list(positions, expected_rank=2)[1]
    hidden_size = bert_config.hidden_size

    # [batch_size * num_positions, width]
    query_tensor = gather_indexes(input_tensor, positions)

    def _transform(scope_name, tensor):
        """Dense + layer norm inside its own variable scope."""
        with tf.variable_scope(scope_name):
            projected = tf.layers.dense(
                tensor,
                units=hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            return modeling.layer_norm(projected)

    def _bilinear_matrix(name):
        """Square [hidden_size, hidden_size] classifier variable."""
        return tf.get_variable(
            name,
            shape=[hidden_size, hidden_size],
            initializer=modeling.create_initializer(
                bert_config.initializer_range))

    def _span_logits(queries, classifier, keys, mask_bias):
        """Score every query against every sequence position."""
        # [batch_size * num_positions, width]
        projected = tf.matmul(queries, classifier)
        # [batch_size, num_positions, width]
        projected = tf.reshape(projected, [batch_size, num_positions, width])
        # [batch_size, width, seq_length]
        keys_t = tf.transpose(keys, perm=[0, 2, 1])
        # [batch_size, num_positions, seq_length]
        scores = tf.matmul(projected, keys_t)
        scores += mask_bias
        return tf.reshape(scores, [batch_size * num_positions, seq_length])

    def _cross_entropy(scores, labels):
        """Per-row negative log-likelihood of `labels` under `scores`."""
        log_probs = tf.nn.log_softmax(scores, axis=-1)
        targets = tf.one_hot(
            tf.reshape(labels, [-1]), depth=seq_length, dtype=tf.float32)
        return -tf.reduce_sum(log_probs * targets, axis=[-1])

    with tf.variable_scope("cls/span_predictions"):
        # One more non-linear transformation before the output layer; these
        # matrices are not used after pre-training.
        query_start_tensor = _transform("query_start_transform", query_tensor)
        query_end_tensor = _transform("query_end_transform", query_tensor)
        start_tensor = _transform("start_transform", input_tensor)
        end_tensor = _transform("end_transform", input_tensor)

        start_classifier = _bilinear_matrix("start_classifier")
        end_classifier = _bilinear_matrix("end_classifier")

        # [batch_size, 1, seq_length]
        expanded_mask = tf.expand_dims(input_mask, axis=1)
        adder = (1.0 - tf.cast(expanded_mask, tf.float32)) * -10000.0

        start_logits = _span_logits(
            query_start_tensor, start_classifier, start_tensor, adder)
        end_logits = _span_logits(
            query_end_tensor, end_classifier, end_tensor, adder)

        # [batch_size * num_positions]
        flat_weights = tf.reshape(label_weights, [-1])

        start_per_example_loss = _cross_entropy(start_logits, start_labels)
        end_per_example_loss = _cross_entropy(end_logits, end_labels)
        per_example_loss = (start_per_example_loss + end_per_example_loss) / 2

        weighted_sum = tf.reduce_sum(flat_weights * per_example_loss)
        weight_sum = tf.reduce_sum(flat_weights) + 1e-5
        loss = weighted_sum / weight_sum

    return loss, per_example_loss
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Builds the student BERT model with its masked-LM and next-sentence
    heads, optionally a frozen teacher model with distillation losses,
    wires checkpoint initialisation, and returns a `TPUEstimatorSpec` for
    TRAIN or EVAL.

    Besides its parameters, the function reads names that are not defined
    in this block: `bert_config`, `use_one_hot_embeddings`,
    `init_checkpoint`, `learning_rate`, `num_train_steps`,
    `num_warmup_steps`, `use_tpu` and `FLAGS`.

    Args:
      features: dict with keys "input_ids", "input_mask", "segment_ids",
        "masked_lm_positions", "masked_lm_ids", "masked_lm_weights" and
        "next_sentence_labels".
      labels: unused.
      mode: a `tf.estimator.ModeKeys` value; only TRAIN and EVAL are handled.
      params: unused.

    Returns:
      A `tf.contrib.tpu.TPUEstimatorSpec`.

    Raises:
      ValueError: if `mode` is neither TRAIN nor EVAL.
    """
    # NOTE(review): block nesting was reconstructed from a flattened source;
    # confirm in particular that the `if use_tpu:` section is at function
    # level rather than nested under a checkpoint branch.
    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
        tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    masked_lm_positions = features["masked_lm_positions"]
    masked_lm_ids = features["masked_lm_ids"]
    masked_lm_weights = features["masked_lm_weights"]
    next_sentence_labels = features["next_sentence_labels"]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # Student model.
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # Both heads are expected to return four values here (the raw logits
    # come last).
    (masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs,
     student_masked_lm_logits) = get_masked_lm_output(
         bert_config, model.get_sequence_output(), model.get_embedding_table(),
         masked_lm_positions, masked_lm_ids, masked_lm_weights)

    (next_sentence_loss, next_sentence_example_loss, next_sentence_log_probs,
     student_next_sentence_logits) = get_next_sentence_output(
         bert_config, model.get_pooled_output(), next_sentence_labels)

    if FLAGS.distill:
        teacher_config = modeling.BertConfig.from_json_file(
            FLAGS.teacher_config_file)
        # The teacher is built in inference mode under its own scope.
        with tf.variable_scope("teacher"):
            teacher = modeling.BertModel(
                config=teacher_config,
                is_training=False,
                input_ids=input_ids,
                input_mask=input_mask,
                token_type_ids=segment_ids,
                use_one_hot_embeddings=use_one_hot_embeddings)
        # map every g layers of the teacher model to the student model
        g = int(teacher_config.num_hidden_layers / bert_config.num_hidden_layers)
        # project teacher hidden layers down to student hidden layers dims
        # with tf.variable_scope('loss'):
        teacher_hidden_layers = teacher.get_all_encoder_layers()
        # NOTE(review): `hidden_loss` is built (creating one dense layer per
        # student layer) but is not part of any `total_loss` below.
        hidden_loss = tf.add_n([
            tf.reduce_sum(
                tf.squared_difference(
                    tf.layers.dense(
                        teacher_hidden_layers[i * g],
                        units=bert_config.hidden_size,
                        kernel_initializer=modeling.create_initializer(
                            bert_config.initializer_range)), student_hidden))
            for i, student_hidden in enumerate(
                model.get_all_encoder_layers())
        ])
        # Same comparison without the projection, so the teacher and student
        # layers must already be compatible in shape.
        hidden_loss_same_size = tf.add_n([
            tf.reduce_sum(
                tf.squared_difference(teacher_hidden_layers[i * g],
                                      student_hidden))
            for i, student_hidden in enumerate(model.get_all_encoder_layers())
        ])
        embedding_loss = tf.reduce_mean(
            tf.squared_difference(teacher.get_embedding_output(),
                                  model.get_embedding_output()))
        attention_loss = tf.add_n([
            tf.reduce_sum(
                tf.squared_difference(teacher.attention_scores[i * g],
                                      student_scores))
            for i, student_scores in enumerate(model.attention_scores)
        ])
        if FLAGS.pred_distill:
            # Teacher prediction heads, created under the teacher scope.
            with tf.variable_scope('teacher'):
                (teacher_masked_lm_loss, teacher_masked_lm_example_loss,
                 teacher_masked_lm_log_probs,
                 teacher_masked_lm_logits) = get_masked_lm_output(
                     teacher_config, teacher.get_sequence_output(),
                     teacher.get_embedding_table(), masked_lm_positions,
                     masked_lm_ids, masked_lm_weights)

                (teacher_next_sentence_loss, teacher_next_sentence_example_loss,
                 teacher_next_sentence_log_probs,
                 teacher_next_sentence_logits) = get_next_sentence_output(
                     teacher_config, teacher.get_pooled_output(),
                     next_sentence_labels)
            # Soft cross-entropy between teacher and student distributions.
            masked_lm_distill_loss = tf.reduce_mean(
                -tf.nn.softmax(teacher_masked_lm_logits) *
                tf.nn.log_softmax(student_masked_lm_logits))
            # NOTE(review): computed but not added to `total_loss`.
            next_sentence_distill_loss = tf.reduce_mean(
                tf.squared_difference(teacher_next_sentence_logits,
                                      student_next_sentence_logits))
            total_loss = masked_lm_distill_loss
        else:
            total_loss = hidden_loss_same_size + embedding_loss + attention_loss + masked_lm_loss
    else:
        total_loss = masked_lm_loss + next_sentence_loss

    tvars = tf.trainable_variables()

    scaffold_fn = None
    # Parallel lists: one assignment map per checkpoint to load.
    checkpoints = []
    assignment_maps = []
    student_variable_names = {}
    teacher_variable_names = {}
    # Distillation requires a teacher checkpoint.
    assert FLAGS.teacher_checkpoint or not FLAGS.distill
    if init_checkpoint:
        (assignment_map, student_variable_names
        ) = modeling.get_assignment_map_from_checkpoint(
            tvars, init_checkpoint)
        checkpoints.append(init_checkpoint)
        assignment_maps.append(assignment_map)
    if FLAGS.teacher_checkpoint:
        (assignment_map, teacher_variable_names
        ) = modeling.get_assignment_map_from_checkpoint(
            tvars, FLAGS.teacher_checkpoint, teacher=True)
        checkpoints.append(FLAGS.teacher_checkpoint)
        assignment_maps.append(assignment_map)

    if use_tpu:

        # On TPU the checkpoint loading is deferred to the scaffold function.
        def tpu_scaffold():
            for c, a in zip(checkpoints, assignment_maps):
                tf.logging.info("*** Loading vars from Checkpoint %s ***" % c)
                tf.train.init_from_checkpoint(c, a)
            return tf.train.Scaffold()

        scaffold_fn = tpu_scaffold
    else:
        for c, a in zip(checkpoints, assignment_maps):
            tf.logging.info("*** Loading vars from Checkpoint %s ***" % c)
            tf.train.init_from_checkpoint(c, a)

    output_spec = None
    # Only variables under the 'bert/' and 'loss/' scopes are handed to the
    # optimizer.
    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'bert/')
    var_list += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'loss/')
    tf.logging.info("**** Trainable Variables ****")
    for var in var_list:
        tf.logging.info("name = %s, shape = %s", var.name, var.shape)
    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = optimization.create_optimizer(total_loss, learning_rate,
                                                 num_train_steps,
                                                 num_warmup_steps, use_tpu,
                                                 var_list)

        output_spec = tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(masked_lm_example_loss, masked_lm_log_probs,
                      masked_lm_ids, masked_lm_weights,
                      next_sentence_example_loss, next_sentence_log_probs,
                      next_sentence_labels):
            """Computes the loss and accuracy of the model."""
            masked_lm_log_probs = tf.reshape(
                masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]])
            masked_lm_predictions = tf.argmax(masked_lm_log_probs,
                                              axis=-1,
                                              output_type=tf.int32)
            masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
            masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
            masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
            # Both masked-LM metrics are weighted by `masked_lm_weights`.
            masked_lm_accuracy = tf.metrics.accuracy(
                labels=masked_lm_ids,
                predictions=masked_lm_predictions,
                weights=masked_lm_weights)
            masked_lm_mean_loss = tf.metrics.mean(
                values=masked_lm_example_loss, weights=masked_lm_weights)

            next_sentence_log_probs = tf.reshape(
                next_sentence_log_probs,
                [-1, next_sentence_log_probs.shape[-1]])
            next_sentence_predictions = tf.argmax(next_sentence_log_probs,
                                                  axis=-1,
                                                  output_type=tf.int32)
            next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
            next_sentence_accuracy = tf.metrics.accuracy(
                labels=next_sentence_labels,
                predictions=next_sentence_predictions)
            next_sentence_mean_loss = tf.metrics.mean(
                values=next_sentence_example_loss)

            return {
                "masked_lm_accuracy": masked_lm_accuracy,
                "masked_lm_loss": masked_lm_mean_loss,
                "next_sentence_accuracy": next_sentence_accuracy,
                "next_sentence_loss": next_sentence_mean_loss,
            }

        if FLAGS.eval_teacher:
            # NOTE(review): the teacher_* tensors are only bound when both
            # FLAGS.distill and FLAGS.pred_distill are set; otherwise this
            # branch raises NameError.
            eval_metrics = (metric_fn, [
                teacher_masked_lm_example_loss, teacher_masked_lm_log_probs,
                masked_lm_ids, masked_lm_weights,
                teacher_next_sentence_example_loss,
                teacher_next_sentence_log_probs, next_sentence_labels
            ])
        else:
            eval_metrics = (metric_fn, [
                masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                masked_lm_weights, next_sentence_example_loss,
                next_sentence_log_probs, next_sentence_labels
            ])
        output_spec = tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metrics=eval_metrics,
            scaffold_fn=scaffold_fn)
    else:
        raise ValueError("Only TRAIN and EVAL modes are supported: %s" %
                         (mode))

    return output_spec
def create_model(bert_config, is_training, input_ids1, input_mask1,
                 segment_ids1, input_ids2, input_mask2, segment_ids2, labels,
                 num_labels, use_one_hot_embeddings):
    """Creates a sentence-pair classification model (BERT encoders + ESIM matching).

    Each sentence is encoded by its own `modeling.BertModel` call. The two
    token sequences are then soft-aligned against each other, projected back
    to `bert_config.hidden_size`, pooled (masked mean and max) and classified
    with a softmax layer.

    Args:
        bert_config: config object; `hidden_size` and `initializer_range` are
            read here, the rest is forwarded to `modeling.BertModel`.
        is_training: Python bool; enables the dropout layers.
        input_ids1, input_mask1, segment_ids1: inputs of the first sentence.
        input_ids2, input_mask2, segment_ids2: inputs of the second sentence.
        labels: integer class ids, one-hot encoded with depth `num_labels`.
        num_labels: number of output classes.
        use_one_hot_embeddings: forwarded to `modeling.BertModel`.

    Returns:
        Tuple `(loss, per_example_loss, logits, probabilities)`.

    NOTE(review): the scope nesting below was reconstructed from a
    whitespace-collapsed source; confirm it against the variable names of any
    existing checkpoint before relying on it.
    """
    model1 = modeling.BertModel(config=bert_config,
                                is_training=is_training,
                                input_ids=input_ids1,
                                input_mask=input_mask1,
                                token_type_ids=segment_ids1,
                                use_one_hot_embeddings=use_one_hot_embeddings)
    sequence_output1 = model1.get_sequence_output()

    model2 = modeling.BertModel(config=bert_config,
                                is_training=is_training,
                                input_ids=input_ids2,
                                input_mask=input_mask2,
                                token_type_ids=segment_ids2,
                                use_one_hot_embeddings=use_one_hot_embeddings)
    sequence_output2 = model2.get_sequence_output()

    print("sequence_output1:{}".format(sequence_output1.shape))
    print("sequence_output2:{}".format(sequence_output2.shape))

    with tf.variable_scope('ESIM'):
        # Compute the similarity between every token of sentence 1 and every
        # token of sentence 2. The masks are cast to float once, here; they
        # are used as multiplicative weights everywhere below.
        input_mask1 = tf.cast(input_mask1, tf.float32)
        input_mask2 = tf.cast(input_mask2, tf.float32)

        with tf.variable_scope('local_inference'):
            # attention_weight: [batch_size, seq_length1, seq_length2]
            attention_weight = tf.matmul(
                sequence_output1, tf.transpose(sequence_output2, [0, 2, 1]))

            # Masked softmax over sentence 2, written out by hand: subtract
            # the row max for numerical stability, exponentiate, zero the
            # padded positions, then normalise.
            # attention_weight_2: [batch_size, seq_length1, seq_length2]
            attention_weight_2 = tf.exp(
                attention_weight -
                tf.reduce_max(attention_weight, axis=2, keepdims=True))
            attention_weight_2 = attention_weight_2 * tf.expand_dims(
                input_mask2, 1)
            # alpha: [batch_size, seq_length1, seq_length2]
            alpha = attention_weight_2 / (
                tf.reduce_sum(attention_weight_2, -1, keepdims=True) + 1e-8)
            # sequence_output1_dual: [batch_size, seq_length1, hidden_size]
            sequence_output1_dual = tf.reduce_sum(
                tf.expand_dims(sequence_output2, 1) *
                tf.expand_dims(alpha, -1), 2)
            print("sequence_output1_dual:{}".format(
                sequence_output1_dual.shape))

            sequence_output1_match = tf.concat([
                sequence_output1, sequence_output1_dual,
                sequence_output1 * sequence_output1_dual,
                sequence_output1 - sequence_output1_dual
            ], 2)
            print("sequence_output1_match:{}".format(
                sequence_output1_match.shape))

            # Same masked softmax in the other direction (over sentence 1).
            # attention_weight_1: [batch_size, seq_length, seq_length]
            attention_weight_1 = attention_weight - tf.reduce_max(
                attention_weight, axis=1, keepdims=True)
            attention_weight_1 = tf.exp(
                tf.transpose(attention_weight_1, [0, 2, 1]))
            attention_weight_1 = attention_weight_1 * tf.expand_dims(
                input_mask1, 1)
            # beta: [batch_size, seq_length, seq_length]
            beta = attention_weight_1 / (
                tf.reduce_sum(attention_weight_1, -1, keepdims=True) + 1e-8)
            # sequence_output2_dual: [batch_size, seq_length, hidden_size]
            sequence_output2_dual = tf.reduce_sum(
                tf.expand_dims(sequence_output1, 1) *
                tf.expand_dims(beta, -1), 2)
            print("sequence_output2_dual:{}".format(
                sequence_output2_dual.shape))

            sequence_output2_match = tf.concat([
                sequence_output2, sequence_output2_dual,
                sequence_output2 * sequence_output2_dual,
                sequence_output2 - sequence_output2_dual
            ], 2)
            print("sequence_output2_match:{}".format(
                sequence_output2_match.shape))

        # high dimension to low dimension
        with tf.variable_scope("projection", reuse=tf.AUTO_REUSE):
            output_layer1 = tf.layers.dense(
                sequence_output1_match,
                bert_config.hidden_size,
                name='dense',
                activation=tf.nn.tanh,
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            output_layer1 = modeling.layer_norm(output_layer1,
                                                name="layer_norm")
            print("output_layer1:{}".format(output_layer1.shape))

            # Same layer name + reuse=True: both sentences share the
            # projection weights.
            output_layer2 = tf.layers.dense(
                sequence_output2_match,
                bert_config.hidden_size,
                name='dense',
                reuse=True,
                activation=tf.nn.tanh,
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            output_layer2 = modeling.layer_norm(output_layer2,
                                                name="layer_norm")
            print("output_layer2:{}".format(output_layer2.shape))

            if is_training:
                output_layer1 = tf.nn.dropout(output_layer1, keep_prob=0.9)
                output_layer2 = tf.nn.dropout(output_layer2, keep_prob=0.9)

        with tf.variable_scope("composition", reuse=tf.AUTO_REUSE):
            # Masked mean / max pooling over the sequence axis.
            # NOTE(review): the max is taken after multiplying by the mask,
            # so padded positions contribute 0 rather than being excluded,
            # and the mean divides by the raw mask sum (no epsilon). Left
            # unchanged so outputs stay compatible with trained checkpoints.
            logit_x1_sum = tf.reduce_sum(output_layer1 * tf.expand_dims(input_mask1, -1), 1) / \
                tf.expand_dims(tf.reduce_sum(input_mask1, 1), 1)
            logit_x1_max = tf.reduce_max(
                output_layer1 * tf.expand_dims(input_mask1, -1), 1)
            logit_x2_sum = tf.reduce_sum(output_layer2 * tf.expand_dims(input_mask2, -1), 1) / \
                tf.expand_dims(tf.reduce_sum(input_mask2, 1), 1)
            logit_x2_max = tf.reduce_max(
                output_layer2 * tf.expand_dims(input_mask2, -1), 1)

            logit = tf.concat(
                [logit_x1_sum, logit_x1_max, logit_x2_sum, logit_x2_max], 1)
            print("logit:{}".format(logit.shape))

            # An earlier experiment attached two separate sigmoid heads here
            # (scopes "output1" / "output2", one per sentence, influencing
            # each other); it is disabled in favour of the joint head below.
            logit = tf.layers.dense(
                logit,
                bert_config.hidden_size,
                name='dense',
                activation=tf.nn.tanh,
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            logit = modeling.layer_norm(logit, name="layer_norm")
            print("logit:{}".format(logit.shape))
            if is_training:
                logit = tf.nn.dropout(logit, keep_prob=0.9)

    hidden_size = logit.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable("output_bias", [num_labels],
                                  initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        # The original built an extra `tf.nn.dropout(logit, ...)` here and
        # bound it to a name that was never read; the logits were (and are)
        # computed from `logit`, which already received dropout above.
        logits = tf.matmul(logit, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        probabilities = tf.nn.softmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        one_hot_labels = tf.one_hot(labels,
                                    depth=num_labels,
                                    dtype=tf.float32)

        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

        return (loss, per_example_loss, logits, probabilities)
def create_model(
    config,
    is_training,
    input_ids,
    input_mask,
    segment_ids,
    labels,
    num_labels,
    use_one_hot_embeddings,
    task_name,
):
    """Builds a from-scratch embedding + BiLSTM classifier (or regressor).

    Token, position and token-type embeddings feed `modeling.bilstm_fused`;
    the output at the last time step goes through a tanh dense layer and a
    linear output layer. For every task except "sts-b" the head is a softmax
    classifier with cross-entropy loss; for "sts-b" the single logit is
    treated as a regression value with squared-error loss.

    Returns:
        Tuple `(loss, per_example_loss, probabilities, logits, predictions)`.
        For "sts-b", `probabilities` is the un-squeezed logit tensor and
        `predictions` equals the squeezed `logits`.

    NOTE(review): scope nesting was reconstructed from a whitespace-collapsed
    source; verify variable names against existing checkpoints.
    NOTE(review): pooling reads position -1 of the sequence axis regardless
    of the true length — confirm `bilstm_fused` yields a meaningful vector
    there for padded examples.
    """
    # Count of unmasked tokens per example, used as the LSTM sequence length.
    token_counts = tf.reduce_sum(input_mask, axis=-1)
    _true_length = tf.cast(token_counts, dtype=tf.int32)

    with tf.variable_scope("baseline"):
        with tf.variable_scope("embeddings"):
            # Word-id lookup; the returned embedding table is not used here.
            word_embedding_output, _ = modeling.embedding_lookup(
                input_ids=input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.embedding_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=use_one_hot_embeddings)

            # Positional + token-type embeddings are added inside the
            # post-processor, which also receives the dropout probability.
            embedding_output = modeling.embedding_postprocessor(
                input_tensor=word_embedding_output,
                use_token_type=True,
                token_type_ids=segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)

        with tf.variable_scope("bilstm"):
            sequence_output = modeling.bilstm_fused(
                inputs=embedding_output,
                sequence_lengths=_true_length,
                lstm_size=config.lstm_size,
                bilstm_dropout_rate=config.bilstm_dropout_rate,
                is_training=is_training,
                num_layers=config.num_bilstm)

        # Pool by taking the final time step, then a tanh "pooler" layer.
        last_step = sequence_output[:, -1:, :]
        last_token_tensor = tf.squeeze(last_step, axis=1)
        pooler_init = modeling.create_initializer(config.initializer_range)
        output_layer = tf.layers.dense(last_token_tensor,
                                       config.hidden_size,
                                       activation=tf.tanh,
                                       kernel_initializer=pooler_init)

    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable("output_bias", [num_labels],
                                  initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # keep_prob 0.9 == 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        projected = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(projected, output_bias)

        if task_name == "sts-b":
            # Regression: one output unit, squared error against the label.
            probabilities = logits
            logits = tf.squeeze(logits, [-1])
            predictions = logits
            per_example_loss = tf.square(logits - labels)
        else:
            # Classification: softmax cross-entropy against one-hot labels.
            probabilities = tf.nn.softmax(logits, axis=-1)
            predictions = tf.argmax(probabilities,
                                    axis=-1,
                                    output_type=tf.int32)
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(labels,
                                        depth=num_labels,
                                        dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                              axis=-1)

        loss = tf.reduce_mean(per_example_loss)

        return (loss, per_example_loss, probabilities, logits, predictions)
def add_embeddings(self):
    """Build the input embeddings and the T5-style relative-position attention bias.

    Sets `self.embedded_chars_q` (and `self.embedding_W` when the dataset name
    does not contain 'adding_problem'), `self.t5_pos_embedding`,
    `self.single_t5_att_bias` and `self.t5_att_bias`.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source; in particular, whether the position-bias code sits inside the
    "embedding" name scope should be confirmed (it only affects op names).
    """
    with tf.name_scope("embedding"):
        # Word-embedding table: either initialised from `self.embeddings`
        # or created fresh with shape [vocab_size, embedding_size].
        if self.is_Embedding_Needed:
            W = tf.Variable(np.array(self.embeddings),
                            name="word_embed",
                            dtype="float32",
                            trainable=self.trainable)
        else:
            W = tf.get_variable(
                name='word_embed',
                shape=[self.vocab_size, self.embedding_size],
                initializer=modeling.create_initializer(0.02),
                trainable=True)
        if 'adding_problem' not in self.dataset:
            self.embedding_W = W
            self.embedded_chars_q = tf.nn.embedding_lookup(
                self.embedding_W, self.question)
        else:
            # Adding-problem inputs are used directly (no id lookup):
            #mapping 2 dim into high dim
            if self.embedding_size == 2:
                self.embedded_chars_q = self.question
            else:
                self.embedded_chars_q = tf.layers.dense(
                    self.question, self.embedding_size)
        print('embedded_chars_q:', self.embedded_chars_q)
        if 'adding_problem' not in self.dataset:
            # Input dropout followed by layer norm, only for looked-up
            # embeddings.
            self.embedded_chars_q = modeling.layer_norm(
                tf.nn.dropout(self.embedded_chars_q,
                              keep_prob=1.0 - self.input_dropout_prob))

        # Pairwise relative offsets (memory index minus context index) for a
        # max_input_left x max_input_left grid, then bucketed.
        context_position = tf.range(self.max_input_left,
                                    dtype=tf.int32)[:, None]
        memory_postion = tf.range(self.max_input_left,
                                  dtype=tf.int32)[None, :]
        relative_position = memory_postion - context_position
        rp_bucket = relative_position_bucket(
            relative_position,
            num_buckets=self.t5_bucket,
            max_distance=self.t5_max_distance)
        #why this embedding is very sensitive...
        # One learned scalar per (bucket, attention head).
        self.t5_pos_embedding = tf.get_variable(
            't5_pos_mat', [self.t5_bucket, self.config.num_attention_heads],
            initializer=modeling.create_initializer(0.02),
            trainable=True)
        self.single_t5_att_bias = compute_bias(rp_bucket,
                                               self.t5_pos_embedding)
        ## [batch, num_heads, query_length, memory_length]
        # Repeat the single bias once per example in the batch.
        self.t5_att_bias = tf.tile(self.single_t5_att_bias,
                                   [tf.shape(self.question)[0], 1, 1, 1])
        print('t5_bias:', self.t5_att_bias)
        ''' @2020/9/7 we can directly add the head mask during inference '''
        # Per-head 0/1 mask multiplied into the bias. The loop switches on
        # the heads listed in `low2high`; as written it enables all six
        # entries (indices 0..5), so with exactly six heads the mask is all
        # ones. It indexes heads up to 5, so fewer than six heads would
        # raise an IndexError.
        head_mask = np.zeros((self.config.num_attention_heads,
                              self.max_input_left, self.max_input_left))
        #high2low=[3,1,4,0,5,2]
        low2high = [2, 5, 0, 4, 1, 3]
        for tt in range(6):
            print('tt:', tt)
            head_mask[low2high[tt], :, :] = np.ones(
                (self.max_input_left, self.max_input_left))
        self.t5_att_bias = self.t5_att_bias * tf.constant(
            head_mask, tf.float32)
def body(self, features):
    """Build the encoder/decoder graph and return `(logits, losses)`.

    Without a 'targets' feature the method delegates to
    `self.greedy_decode_8steps` and additionally returns the attention
    weights of query position 0. With 'targets' it builds a teacher-forced
    decoder whose first output predicts the theorem and whose remaining
    outputs point at premise positions in the encoder sequence.

    Args:
        features: dict of tensors. Read here: 'targets' (optional),
            'theorem' and 'depth'; the whole dict is also passed to
            `self.build_encoder`.

    Returns:
        `logits`: dict with theorem/premise logits and labels (training
        path) or whatever `greedy_decode_8steps` returns plus
        `attention_weights` (inference path).
        `losses`: dict with keys 'training', 'theorem_loss', 'premise_loss'
        on the training path.

    NOTE(review): scope nesting was reconstructed from a whitespace-collapsed
    source; verify against checkpoint variable names.
    """
    hparams = self.hparams
    if not self.is_training:
        # `hparams` is the same object as `self.hparams`, so this disables
        # dropout on the shared hparams (the assert below relies on it).
        hparams.dropout_prob = 0.0

    with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
        # attention_weights: [batch, n_head, from_len, to_len]
        sequence_output, cls_vector, attention_weights = self.build_encoder(
            features)

    if 'targets' not in features:
        # Inference path: no gold premises, decode greedily.
        assert self.hparams.dropout_prob == 0.0
        logits, losses = self.greedy_decode_8steps(cls_vector,
                                                   sequence_output)
        logits.update(attention_weights=attention_weights[:, :, 0, :])
        return logits, losses

    with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
        with tf.variable_scope('embeddings', reuse=tf.AUTO_REUSE):
            premise = features[
                'targets']  # [batch, premise_len=8] -bad naming:(
            # [batch, premise_len, hid_size]
            premise_vecs = premise_gather_nd(sequence_output, premise)
            batch_size = tf.shape(premise)[0]
            premise_len = premise.shape.as_list()[-1]
            theorem = features['theorem']  # batch, 1
            # [batch, 1, hid_size] and [num_theorems, hid_size]
            theorem_vec, theorem_emb_table = modeling.embedding_lookup(
                input_ids=theorem,  # [batch, 1]
                vocab_size=hparams.num_theorems,
                embedding_size=hparams.hidden_size,
                initializer_range=hparams.initializer_range,
                word_embedding_name='theorem_embedding',
            )
            # Read but not used further in this method.
            depth = features['depth']  # batch, 1

            # Teacher forcing: the decoder input is the gold output shifted
            # right by one, with the CLS vector as the start element.
            decoder_input = tf.concat(
                [
                    cls_vector,  # [batch, 1, hid_size]
                    theorem_vec,  # [batch, 1, hid_size]
                    premise_vecs[:, :
                                 -1, :]  # [batch, premise_len-1, hid_size]
                ],
                axis=1)  # [batch, premise_len + 1, hid_size]
            decode_length = decoder_input.shape.as_list()[1]
            assert decode_length == premise_len + 1

            # [decode_length, hid_size]
            pos_embedding, _ = modeling.embedding_lookup(
                input_ids=tf.range(decode_length),  # [decode_length]
                vocab_size=hparams.max_premise,  # >= premise_len
                embedding_size=hparams.hidden_size,
                initializer_range=hparams.initializer_range,
                word_embedding_name='positional_embedding',
            )
            pos_embedding = tf.reshape(
                pos_embedding, [1, decode_length, hparams.hidden_size])

            decoder_input = modeling.layer_norm_and_dropout(
                decoder_input +  # [batch, decode_length, hid_size]
                pos_embedding,  # [1, decode_length, hid_size]
                hparams.dropout_prob)  # [batch, decode_length, hid_size]

        with tf.variable_scope('transformer', reuse=tf.AUTO_REUSE):
            # Lower-triangular mask so each step only attends to itself and
            # earlier steps.
            causal_attention_mask = t2t_model.common_layers.ones_matrix_band_part(
                rows=decode_length,
                cols=decode_length,
                num_lower=-1,  # attend to everything before
                num_upper=0,  # attend to nothing after
                out_shape=[1, decode_length, decode_length
                           ])  # 1, decode_length, decode_length

            # [batch, decode_length, decode_length]
            causal_attention_mask = tf.tile(causal_attention_mask,
                                            [batch_size, 1, 1])

            all_decoder_layers = modeling.transformer_model(
                input_tensor=decoder_input,
                attention_mask=causal_attention_mask,
                hidden_size=hparams.hidden_size,
                num_hidden_layers=hparams.num_decode_layers,
                num_attention_heads=hparams.num_attention_heads,
                intermediate_size=hparams.intermediate_size,
                intermediate_act_fn=modeling.get_activation(
                    hparams.hidden_act),
                hidden_dropout_prob=hparams.dropout_prob,
                attention_probs_dropout_prob=hparams.dropout_prob,
                initializer_range=hparams.initializer_range,
                do_return_all_layers=True,
                attention_top_k=hparams.attention_top_k)

            # Each layer entry is unpacked as a pair; only the first element
            # of the last layer is used.
            decoder_output, _ = all_decoder_layers[
                -1]  # [batch, dec_len, hid_size]
            theorem_feature = decoder_output[:, 0, :]  # [batch, hid_size]
            premise_feature = decoder_output[:, 1:, :]  # [batch, tar_len, hid_size]

        with tf.variable_scope('prediction', reuse=tf.AUTO_REUSE):
            theorem_logits = tf.keras.layers.Dense(  # [batch, num_theorems]
                name='theorem',
                units=hparams.num_theorems,
                use_bias=True,
                kernel_initializer=modeling.create_initializer(
                    hparams.initializer_range))(theorem_feature)

            # Pointer-style scores: dot product of each decoder step with
            # every encoder position.
            premise_logits = tf.matmul(
                a=premise_feature,  # [batch, premise_len, hid_size]
                b=sequence_output,  # [batch, sequence_len, hid_size]
                transpose_b=True,
            )  # [batch, premise_len, sequence_len]

            # [batch * premise_len, sequence_len]
            seq_len = premise_logits.shape.as_list()[-1]
            premise_logits = tf.reshape(premise_logits, [-1, seq_len])

            # Premise entries equal to 0 get weight 0 in the loss.
            premise_weights = tf.cast(premise > 0,
                                      tf.float32)  # [batch, prem_len]
            premise_weights = tf.reshape(premise_weights,
                                         [-1])  # [batch * prem_len]
            premise = tf.reshape(premise, [-1, 1])  # [batch * prem_len, 1]

    theorem_loss = tf.losses.sparse_softmax_cross_entropy(
        labels=theorem,  # [batch, 1]
        logits=theorem_logits  # [batch, num_theorems]
    )
    premise_loss = tf.losses.sparse_softmax_cross_entropy(
        labels=premise,  # [batch * premise_len, 1]
        logits=premise_logits,  # [batch * premise_len, sequence_len]
        weights=premise_weights  # [batch * premise_len]
    )

    logits = dict(theorem_logits=theorem_logits,
                  theorem_labels=theorem,
                  premise_logits=premise_logits,
                  premise_labels=premise)
    losses = dict(training=theorem_loss + premise_loss,
                  theorem_loss=theorem_loss,
                  premise_loss=premise_loss)
    return logits, losses
tensor_name=None, all_tensors=False) chkp.print_tensors_in_checkpoint_file( "output_dir_dupe5_s4_3class/model_795_seq_direction.ckpt", tensor_name='cls/seq_direction/output_weights', all_tensors=False) chkp.print_tensors_in_checkpoint_file( "output_dir_dupe5_s4/model.ckpt-795000", tensor_name='cls/seq_direction/output_bias2', all_tensors=False) with tf.variable_scope("cls/seq_direction"): output_weights2 = tf.get_variable( "output_weights2", shape=[1, 768], initializer=modeling.create_initializer(.02)) output_bias2 = tf.get_variable("output_bias2", shape=[1], initializer=tf.zeros_initializer()) # output_weights = chkp.print_tensors_in_checkpoint_file("uncased_L-12_H-768_A-12/bert_model.ckpt", # tensor_name='cls/seq_relationship/output_weights', # all_tensors=False) # output_dir_position_pretrain_tf/model.ckpt-34000 bert/embeddings/dependency_embedding # output_bias = chkp.print_tensors_in_checkpoint_file("uncased_L-12_H-768_A-12/bert_model.ckpt", # tensor_name='cls/seq_relationship/output_bias', all_tensors=False) #reader = pywrap_tensorflow.NewCheckpointReader("uncased_L-12_H-768_A-12/bert_model.ckpt") #add new embeddinf # reader.get_tensor('bert/embeddings/token_type_embeddings') ##add task 3
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM.

    Gathers the vectors at `positions`, applies the dense + layer-norm
    "transform", and scores them against `output_weights` plus a
    per-token output bias.

    NOTE(review): as visible here the body stops after flattening
    `label_ids` / `label_weights`; no loss is computed and nothing is
    returned. The definition appears to be truncated — confirm against the
    complete version before use.
    """
    # Keep only the vectors at the masked positions.
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # Flatten to one entry per prediction slot.
        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])
def gec_create_model(bert_config, is_training, input_sequence,
                     input_mask, segment_ids, edit_sequence,
                     use_one_hot_embeddings, mode,
                     copy_weight,
                     use_bert_more,
                     insert_ids,
                     multitoken_insert_ids,
                     subtract_replaced_from_replacement):
    """Creates a per-token edit classification model.

    Every input token is classified into one edit label. With
    `use_bert_more` the logits are the sum of three terms (edit logits,
    in-place word logits, additional append/replace logits); otherwise only
    the edit logits are used.

    Args:
        bert_config: BERT config; `hidden_size`, `hidden_act`,
            `initializer_range` and `vocab_size` are read here.
        is_training: Python bool; enables dropout on the edit representation.
        input_sequence: token ids, rank 2 (batch, sequence).
        input_mask: per-token mask, also used to zero the loss of masked
            tokens.
        segment_ids: token type ids.
        edit_sequence: gold edit label per token.
        use_one_hot_embeddings: forwarded to the BERT model / embedding lookup.
        mode: not referenced in this function body.
        copy_weight: loss weight applied to tokens whose label is copy (3).
        use_bert_more: selects `modified_modeling.BertModel` and the
            factorised logits.
        insert_ids: word ids of unigram inserts (list).
        multitoken_insert_ids: word ids of bigram inserts (list of tuples of
            length 2).
        subtract_replaced_from_replacement: if true, replace logits come from
            `replacement_minus_replaced_logits` instead of a plain matmul.

    Returns:
        Tuple `(loss, per_example_loss, logits, probs)`.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source — in particular whether "output_bias" is created only under
    `use_bert_more`. Verify against checkpoint variable names.
    """
    # insert_ids: word ids of unigram inserts (list)
    # multitoken_insert_ids: word_ids of bigram inserts (list of tuples of length 2)

    # Defining the space of all possible edits:
    # unk, sos and eos are dummy edits mapped to 0, 1 and 2 respectively
    # copy is mapped to 3
    # del is mapped to 4
    num_appends = len(insert_ids) + len(multitoken_insert_ids)
    num_replaces = num_appends  # appends and replacements come from the same set (inserts and multitoken_inserts)
    append_begin = 5  # First append edit (mapped to 5)
    append_end = append_begin + num_appends - 1  #Last append edit
    rep_begin = append_end + 1  # First replace edit
    rep_end = rep_begin + num_replaces - 1  #Last replace edit
    num_suffix_transforms = 58  #num of transformation edits
    num_labels = 5 + num_appends + num_replaces + num_suffix_transforms  # total number of edits
    print("************ num of labels : {} ***************".format(num_labels))

    config = bert_config
    input_sequence_shape = modeling.get_shape_list(input_sequence, 2)
    batch_size = input_sequence_shape[0]
    seq_len = input_sequence_shape[1]

    if not use_bert_more:  #default use of bert (without logit factorisation)
        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_sequence,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        output_layer = model.get_sequence_output()
    else:  # LOGIT FACTORISATION is On!
        model = modified_modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_sequence,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        # The sequence output is sliced into three consecutive seq_len-long
        # segments: token slots, replacement slots, append slots.
        output_layer = model.get_sequence_output()
        replace_layer = output_layer[:, seq_len:2 * seq_len, :]  #representation of replacement slots as described in paper
        append_layer = output_layer[:, 2 * seq_len:3 * seq_len, :]  #representation of append slots as described in paper
        output_layer = output_layer[:, 0:seq_len, :]

    output_layer_shape = modeling.get_shape_list(output_layer, 3)
    hidden_size = output_layer_shape[-1]

    # Flatten (batch, seq) into one axis so every token is one row.
    flattened_output_layer = tf.reshape(output_layer, [-1, hidden_size])

    h_edit = flattened_output_layer

    if use_bert_more:
        h_word = flattened_output_layer
        flattened_replace_layer = tf.reshape(replace_layer, [-1, hidden_size])
        flattened_append_layer = tf.reshape(append_layer, [-1, hidden_size])

        m_replace = flattened_replace_layer
        m_append = flattened_append_layer

        # The same "cls/predictions/transform" dense + layer norm is applied
        # to all three representations; the second and third uses re-enter
        # the scopes with reuse=True so the weights are shared.
        with tf.variable_scope("cls/predictions"):
            with tf.variable_scope("transform"):
                h_word = tf.layers.dense(
                    h_word,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range))
                h_word = modeling.layer_norm(h_word)

        with tf.variable_scope("cls/predictions", reuse=True):
            with tf.variable_scope("transform", reuse=True):
                m_replace = tf.layers.dense(
                    m_replace,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range))
                m_replace = modeling.layer_norm(m_replace)

        with tf.variable_scope("cls/predictions", reuse=True):
            with tf.variable_scope("transform", reuse=True):
                m_append = tf.layers.dense(
                    m_append,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range))
                m_append = modeling.layer_norm(m_append)

        word_embedded_input = model.word_embedded_input
        flattened_word_embedded_input = tf.reshape(word_embedded_input, [-1, hidden_size])

    labels = edit_sequence

    edit_weights = tf.get_variable(
        "edit_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    if is_training:
        h_edit = tf.nn.dropout(h_edit, keep_prob=0.9)

    if use_bert_more:
        # append/replace weight vector for a given append or replace operation
        # correspond to word embedding for its token argument
        # for multitoken append/replace (e.g. has been)
        # weight vector is sum of word embeddings of token arguments

        append_weights = edit_word_embedding_lookup(model.embedding_table, insert_ids,
                                                    use_one_hot_embeddings, config.vocab_size, config.hidden_size)
        replace_weights = append_weights  #tokens in replace and append vocab are same
                                          #(i.e. inserts and multitoken_inserts)

        multitoken_append_weights = wem_utils.edit_embedding_loopkup(model.embedding_table, multitoken_insert_ids,
                                                                     use_one_hot_embeddings, config.vocab_size, config.hidden_size)
        multitoken_replace_weights = multitoken_append_weights  #tokens in replace and append vocab are same
                                                                #(i.e. inserts and multitoken_inserts)

        append_weights = tf.concat([append_weights, multitoken_append_weights], 0)
        replace_weights = tf.concat([replace_weights, multitoken_replace_weights], 0)

    with tf.variable_scope("loss"):
        edit_logits = tf.matmul(h_edit, edit_weights, transpose_b=True)  #first term in eq3 in paper
        logits = edit_logits
        if use_bert_more:

            #=============== inplace_word_logits==============# #2nd term in eq3 in paper
            inplace_logit = tf.reduce_sum(h_word * flattened_word_embedded_input, axis=1, keepdims=True)  #copy
            #inplace_logit = tf.reduce_sum(m_replace * flattened_word_embedded_input, axis=1, keepdims=True) #copy
            inplace_logit_appends = tf.tile(inplace_logit, [1, num_appends])
            inplace_logit_transforms = tf.tile(inplace_logit, [1, num_suffix_transforms])
            zero_3_logits = tf.zeros([batch_size * seq_len, 3])  #unk sos eos
            zero_1_logits = tf.zeros([batch_size * seq_len, 1])  # del
            zero_replace_logits = tf.zeros([batch_size * seq_len, num_replaces])

            # Column layout follows the label space: 3 dummies, copy, del,
            # appends, replaces, suffix transforms.
            concat_list = [zero_3_logits, inplace_logit, zero_1_logits]\
                + [inplace_logit_appends]\
                + [zero_replace_logits]\
                + [inplace_logit_transforms]

            inplace_word_logits = tf.concat(concat_list, 1)

            #======additional (insert,replace) logits ====# #3rd term in eqn3 in paper
            zero_5_logits = tf.zeros([batch_size * seq_len, 5])
            append_logits = tf.matmul(m_append, append_weights, transpose_b=True)

            if subtract_replaced_from_replacement:
                replace_logits = replacement_minus_replaced_logits(m_replace,
                                                                   flattened_word_embedded_input, replace_weights)
            else:
                replace_logits = tf.matmul(m_replace, replace_weights, transpose_b=True)

            suffix_logits = tf.zeros([batch_size * seq_len, num_suffix_transforms])

            concat_list = [zero_5_logits, append_logits, replace_logits, suffix_logits]
            additional_logits = tf.concat(concat_list, 1)
            #====================================================#

            logits = edit_logits + inplace_word_logits + additional_logits
            logits_bias = tf.get_variable("output_bias", shape=[num_labels], initializer=tf.zeros_initializer())
            logits += logits_bias

        # Back to (batch, seq, num_labels).
        logits = tf.reshape(logits, [output_layer_shape[0], output_layer_shape[1], num_labels])
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        probs = tf.nn.softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        # Masked tokens contribute no loss.
        per_token_loss = per_token_loss * tf.to_float(input_mask)
        # Tokens labelled copy (3) are weighted by copy_weight, all others by 1.
        mask = copy_weight * tf.to_float(tf.equal(labels, 3)) + tf.to_float(tf.not_equal(labels, 3))
        masked_per_token_loss = per_token_loss * mask
        per_example_loss = tf.reduce_sum(masked_per_token_loss, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

        return (loss, per_example_loss, logits, probs)
def build_model(self):
    """Build the "inferring_module" graph and return the refined sentence vector.

    Both sentences are re-encoded with one shared BiGRU (overwriting
    `self.sent1` / `self.sent2`), a `DoubleJointCell` is unrolled for
    `self.update_num` steps over copies of `self.mark0`, the step outputs are
    averaged, projected to 768 units with tanh, and added to `self.mark0`.

    Also sets `self.init_state` and `self.double_output`.
    """
    with tf.variable_scope("inferring_module"):
        rdim = 768
        n_updates = self.update_num
        n_batch = tf.shape(self.sent1)[0]
        # Feature size of the incoming sentences, read before they are
        # replaced by the BiGRU outputs below.
        in_dim = self.sent1.get_shape().as_list()[-1]

        encoder = BiGRU(num_layers=1,
                        num_units=rdim,
                        batch_size=n_batch,
                        input_size=in_dim,
                        keep_prob=0.9,
                        is_train=self.is_training,
                        activation=tf.nn.tanh)

        # True lengths = number of non-zero mask entries per row.
        len1 = tf.cast(tf.reduce_sum(self.sent1_mask, axis=1), tf.int32)
        len2 = tf.cast(tf.reduce_sum(self.sent2_mask, axis=1), tf.int32)
        self.sent1 = encoder(self.sent1, len1)
        self.sent2 = encoder(self.sent2, len2)

        inner_cell = GRUCell(num_units=2 * rdim, activation=tf.nn.relu)
        joint_cell = DoubleJointCell(num_units=2 * rdim,
                                     r_cell=inner_cell,
                                     sent1=self.sent1,
                                     sent2=self.sent2,
                                     dim=2 * in_dim,
                                     update_num=n_updates,
                                     use_bias=False,
                                     activation=tf.tanh,
                                     dropout_rate=self.dropout_rate,
                                     sent1_mask=self.sent1_mask,
                                     sent2_mask=self.sent2_mask,
                                     initializer=None,
                                     dtype=tf.float32)

        # The RNN input is just `mark0` repeated once per update step.
        mark_steps = tf.expand_dims(self.mark0, axis=1)
        fake_input = tf.tile(mark_steps, [1, n_updates, 1])
        self.init_state = joint_cell.zero_state(batch_size=n_batch,
                                                dtype=tf.float32)
        self.double_output, _ = dynamic_rnn(cell=joint_cell,
                                            inputs=fake_input,
                                            initial_state=self.init_state)

        pooled = tf.reduce_mean(self.double_output, axis=1)  # (B, dim)
        projected = tf.layers.dense(
            pooled,
            768,
            activation=tf.nn.tanh,
            kernel_initializer=create_initializer(0.02))
        # Residual connection onto the original mark vector.
        return projected + self.mark0
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM.

    Typical call-site mapping: `input_tensor = model.get_sequence_output()`,
    `output_weights = model.get_embedding_table()`,
    `positions = masked_lm_positions`, `label_ids = masked_lm_ids`,
    `label_weights = masked_lm_weights`. For `positions` and `label_ids` see
    `masked_lm_positions` / `masked_lm_labels` in create_pretraining_data.py.

    Args:
        bert_config: config; `hidden_size`, `hidden_act`,
            `initializer_range` and `vocab_size` are read.
        input_tensor: encoder sequence output.
        output_weights: embedding table, reused as the output projection.
        positions: indices of the masked tokens to predict.
        label_ids: gold token ids for those positions.
        label_weights: 1.0 for real predictions, 0.0 for padding slots.

    Returns:
        Tuple `(loss, per_example_loss, log_probs)`; `loss` is the
        weight-normalised mean of `per_example_loss`.
    """
    # Background: the MLM loss is computed only on the selected (about 15%)
    # positions — the whole sentence is encoded first, then the vectors at
    # those positions are gathered. Of the selected tokens, about 80% are
    # replaced by [MASK] (trains understanding from context), about 10% are
    # replaced by another token (trains robustness to wrong words), and
    # about 10% are kept unchanged so the true token also appears as input;
    # the remaining ~85% of tokens only provide context and never enter the
    # loss.
    #
    # A leftover `ipdb.set_trace()` breakpoint used to sit here; it stopped
    # every graph build and required the third-party `ipdb` package.
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        # Example shapes noted by the original author (batch 8, 20
        # predictions, vocab 21128): input_tensor (160, 768),
        # output_weights (21128, 768) -> logits (160, 21128).
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # e.g. (8, 20) -> (160,)
        label_ids = tf.reshape(label_ids, [-1])
        # Per-prediction weights, e.g. (8, 20) -> (160,). The original
        # author notes they were all 1 in their runs.
        label_weights = tf.reshape(label_weights, [-1])

        # One vocab-sized one-hot row per prediction, e.g. (160, 21128),
        # used to pick out the log-probability of the gold token.
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        # The epsilon keeps the division finite when every weight is zero.
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str, config: dict):
    """Write the weights of a PyTorch BERT model as a TensorFlow checkpoint.

    Every entry of `model.state_dict()` is renamed to a `bert/...` variable
    name, optionally reshaped, copied into a freshly created TF variable and
    finally saved under `ckpt_dir`.

    :param model: BertModel PyTorch model instance to be converted
    :param ckpt_dir: TensorFlow model directory (created if it is missing)
    :param model_name: model name; "-" is replaced by "_" and ".ckpt" is
        appended to form the checkpoint file name
    :param config: mapping read for the keys 'type_vocab_size', 'vocab_size'
        and 'initializer_range'
    :return: None

    Currently supported HF models:
        Y BertModel
        N BertForMaskedLM
        N BertForPreTraining
        N BertForMultipleChoice
        N BertForNextSentencePrediction
        N BertForSequenceClassification
        N BertForQuestionAnswering
    """
    # PyTorch parameter names containing any of these substrings are stored
    # transposed on the TensorFlow side.
    transposed_markers = (
        "dense.weight",
        "attention.self.query",
        "attention.self.key",
        "attention.self.value",
    )

    # Ordered (pattern, replacement) rewrites. Order matters: each rule is
    # applied to the result of the previous ones.
    name_rewrites = (
        ('layer.', 'layer_'),
        ('word_embeddings.weight', 'word_embeddings'),
        ('position_embeddings.weight', 'position_embeddings'),
        ('token_type_embeddings.weight', 'token_type_embeddings'),
        ('.', '/'),
        ('LayerNorm/weight', 'LayerNorm/gamma'),
        ('LayerNorm/bias', 'LayerNorm/beta'),
        ('weight', 'kernel'),
    )

    if not os.path.isdir(ckpt_dir):
        os.makedirs(ckpt_dir)

    weights = model.state_dict()

    def tf_name_for(torch_name: str):
        """Translate a PyTorch parameter name into its TF variable name."""
        renamed = torch_name
        for old, new in name_rewrites:
            renamed = renamed.replace(old, new)
        return 'bert/{}'.format(renamed)

    def make_variable(tensor: np.ndarray, name: str, session: tf.Session):
        """Create and initialise a zero-filled TF variable shaped like `tensor`."""
        variable = tf.get_variable(
            dtype=tf.dtypes.as_dtype(tensor.dtype),
            shape=tensor.shape,
            name=name,
            initializer=tf.zeros_initializer())
        session.run(tf.variables_initializer([variable]))
        session.run(variable)
        return variable

    tf.reset_default_graph()
    with tf.Session() as session:
        for torch_name in weights:
            tf_name = tf_name_for(torch_name)
            values = weights[torch_name].numpy()

            if 'token_type_embeddings' in tf_name:
                # Repeat the token-type rows `type_vocab_size` times along axis 0.
                values = np.tile(values, [config['type_vocab_size'], 1])

            if 'word_embeddings' in tf_name:
                # Pad the vocabulary up to config['vocab_size'] with newly
                # initialised rows appended after the existing ones.
                # NOTE(review): `additional_emb` comes from tf.get_variable, so
                # it is presumably part of tf.trainable_variables() and ends up
                # in the saved checkpoint as well - confirm this is intended.
                missing_rows = config['vocab_size'] - values.shape[0]
                extra_rows_var = tf.get_variable(
                    name='additional_emb',
                    shape=[missing_rows, values.shape[1]],
                    initializer=create_initializer(config['initializer_range']))
                extra_rows_var.initializer.run()
                extra_rows = extra_rows_var.eval()
                values = np.concatenate([values, extra_rows], axis=0)

            if any(marker in torch_name for marker in transposed_markers):
                values = values.T

            tf_var = make_variable(tensor=values, name=tf_name, session=session)
            tf.keras.backend.set_value(tf_var, values)
            # Read the variable back to report whether the copy round-trips.
            round_trip = session.run(tf_var)
            print("Successfully created {}: {}".format(tf_name, np.allclose(round_trip, values)))

        saver = tf.train.Saver(tf.trainable_variables())
        checkpoint_file = model_name.replace("-", "_") + ".ckpt"
        saver.save(session, os.path.join(ckpt_dir, checkpoint_file))
def build_model(self):
    """Build the "inferring_module" sub-graph and return its output tensor.

    Reads `self.sent1`, `self.all_sent`, `self.input_mask`, `self.mark0`,
    `self.sent1_mask`, `self.sent2_mask`, `self.is_training` and
    `self.dropout_rate`.

    Three feature vectors are computed from a BiGRU run over `self.all_sent`
    (an attention-pooled vector, a TextCNN vector and an InteractionExtract
    vector), concatenated along axis 1 and projected by a 768-unit tanh
    dense layer, which is the return value.

    NOTE(review): the source this block was recovered from had lost its
    indentation. Where the "att" variable scope ends is a reconstruction and
    affects variable names - confirm against the original formatting.
    """
    # RNNExtract and SingleSentenceExtract are imported but not referenced in
    # this method.
    from layers.ParallelInfo import TextCNN, RNNExtract, InteractionExtract, SingleSentenceExtract
    with tf.variable_scope("inferring_module"), tf.device("/device:GPU:0"):
        rdim = 768
        batch_size = tf.shape(self.sent1)[0]
        sent_length = self.all_sent.get_shape().as_list()[1]
        # Not referenced anywhere else in this method.
        update_num = 3
        dim = self.sent1.get_shape().as_list()[-1]
        gru_layer = BiGRU(num_layers=1, num_units=rdim, batch_size=batch_size, input_size=dim, keep_prob=0.9,
                          is_train=self.is_training, activation=tf.nn.tanh)
        # Per-row sum of the mask is passed to the GRU as the sequence length
        # (presumably input_mask is 1 for real tokens and 0 for padding -
        # TODO confirm).
        seq_len = tf.reduce_sum(self.input_mask, axis=1)
        gru_output = gru_layer(self.all_sent, seq_len=seq_len)
        with tf.variable_scope("att"):
            # Same static dimension as `sent_length` above.
            all_seq_len = self.all_sent.get_shape().as_list()[1]
            # Repeat mark0 along a new axis 1 so it can be concatenated with
            # every GRU time step.
            cls = tf.tile(tf.expand_dims(self.mark0, axis=1), [1, all_seq_len, 1])
            cat_att = tf.concat([cls, gru_output], axis=2)
            # Two dense layers reduce each position to a single score.
            res = tf.layers.dense(cat_att, units=512, activation=tf.nn.relu)
            res = tf.layers.dense(res, units=1, use_bias=False)
            # Subtract 10000 from scores where the mask is 0 so those
            # positions get negligible weight in the softmax over axis 1.
            res_mask = tf.expand_dims(tf.cast(self.input_mask, tf.float32), axis=2)
            res = res - (1 - res_mask) * 10000.0
            alpha = tf.nn.softmax(res, 1)
            # Weighted sum of the GRU outputs over axis 1.
            gru_vec = tf.reduce_sum(alpha * gru_output, axis=1)
        # gru_vec = dropout(gru_vec, self.dropout_rate)
        gru_vec = tf.layers.dense(
            gru_vec, 768, activation=gelu, kernel_initializer=create_initializer(0.02))
        gru_vec = dropout(gru_vec, self.dropout_rate)
        # Residual connection with mark0 followed by layer normalisation.
        gru_vec = layer_norm(gru_vec + self.mark0)
        gru_vec = tf.layers.dense(
            gru_vec, 768, activation=tf.tanh, kernel_initializer=create_initializer(0.02))
        text_cnn = TextCNN(2 * rdim, [1, 2, 3, 4, 5, 7], 128)
        img_ext = InteractionExtract(num_units=256, seq_len=sent_length)
        text_vec = text_cnn(gru_output, mask=self.input_mask)
        # rnn_vec, rnn_att = rnn_ext(self.all_sent, input_mask=self.input_mask, mark0=self.mark0)
        img_vec = img_ext(gru_output, self.sent1_mask, self.sent2_mask, self.dropout_rate)
        temp_res = tf.concat([img_vec, gru_vec, text_vec], axis=1)
        return tf.layers.dense(temp_res, 768, tf.tanh, kernel_initializer=create_initializer(0.02))
def create_model_old(bert_config, is_training, input_ids_1, input_mask_1, segment_ids_1, input_ids_2,
                     input_mask_2, segment_ids_2, labels, keep_prob, num_labels, use_one_hot_embeddings):
    """Creates a classification model over two separately encoded inputs.

    Each input triple (ids, mask, segment ids) is fed through its own
    `modeling.BertModel` built under scope "bert". The two pooled outputs are
    concatenated, passed through a ReLU dense layer of
    `bert_config.hidden_size` units and classified into `num_labels` classes.

    Args:
        bert_config: Model configuration; `hidden_size` and
            `initializer_range` are read here, the rest is consumed by
            `modeling.BertModel`.
        is_training: Forwarded to the encoders; also enables dropout before
            the classifier.
        input_ids_1, input_mask_1, segment_ids_1: Inputs of the first encoder.
        input_ids_2, input_mask_2, segment_ids_2: Inputs of the second encoder.
        labels: Class ids, one-hot encoded with depth `num_labels`.
        keep_prob: Keep probability for the training-time dropout.
        num_labels: Number of output classes.
        use_one_hot_embeddings: Forwarded to the encoders.

    Returns:
        A tuple `(loss, per_example_loss, logits, probabilities)`; `loss` is
        the mean of `per_example_loss`.
    """
    # NOTE(review): both encoders are created under the same scope name
    # "bert"; whether that shares or clashes on variables presumably depends
    # on the reuse setting of the enclosing variable scope - confirm at the
    # call site.
    encoder_inputs = (
        (input_ids_1, input_mask_1, segment_ids_1),
        (input_ids_2, input_mask_2, segment_ids_2),
    )
    encoders = [
        modeling.BertModel(config=bert_config,
                           is_training=is_training,
                           input_ids=ids,
                           input_mask=mask,
                           token_type_ids=segments,
                           use_one_hot_embeddings=use_one_hot_embeddings,
                           scope="bert")
        for ids, mask, segments in encoder_inputs
    ]

    # In the demo, we are doing a simple classification task on the entire
    # segment.
    #
    # If you want to use the token-level output, use model.get_sequence_output()
    # instead.
    pooled_outputs = []
    for encoder in encoders:
        pooled = encoder.get_pooled_output()
        print(pooled.shape)
        pooled_outputs.append(pooled)

    # Join the two sentence representations (further layers could be inserted
    # before this point).
    features = tf.concat(pooled_outputs, axis=-1)

    # Project the joined representation back to the encoder's hidden size.
    features = tf.layers.dense(
        features,
        bert_config.hidden_size,
        activation=tf.nn.relu,
        kernel_initializer=modeling.create_initializer(bert_config.initializer_range))

    feature_dim = features.shape[-1].value
    print(features.shape)

    output_weights = tf.get_variable(
        "output_weights",
        [num_labels, feature_dim],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "output_bias",
        [num_labels],
        initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # Dropout is applied only while training, using the
            # caller-supplied keep probability.
            features = tf.nn.dropout(features, keep_prob=keep_prob)

        logits = tf.nn.bias_add(
            tf.matmul(features, output_weights, transpose_b=True), output_bias)
        probabilities = tf.nn.softmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # Cross-entropy against the one-hot encoded labels.
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

        return (loss, per_example_loss, logits, probabilities)