def get_diff_loss(bert_config, input_tensor, masked_lm_positions,
                  masked_lm_weights, loss_base, loss_target):
    """Regress the per-position probability gap between the base and target LMs."""
    # Convert per-position losses (negative log-likelihoods) into probabilities.
    base_prob = tf.exp(-loss_base)
    target_prob = tf.exp(-loss_target)
    prob_diff = base_prob - target_prob

    input_tensor = bc.gather_indexes(input_tensor, masked_lm_positions)
    with tf.compat.v1.variable_scope("diff_loss"):
        hidden = bc.dense(bert_config.hidden_size,
                          bc.create_initializer(bert_config.initializer_range),
                          bc.get_activation(bert_config.hidden_act))(input_tensor)
        logits = bc.dense(1, bc.create_initializer(
            bert_config.initializer_range))(hidden)
        logits = tf.reshape(logits, prob_diff.shape)
        # L1 regression against the probability difference; padded prediction
        # slots are masked out via masked_lm_weights.
        per_example_loss = tf.abs(prob_diff - logits)
        per_example_loss = tf.cast(masked_lm_weights, tf.float32) * per_example_loss
        losses = tf.reduce_sum(per_example_loss, axis=1)
        loss = tf.reduce_mean(losses)
        return loss, per_example_loss, logits
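# `bc` is the project's bert_common helper module; `bc.dense` is not defined in
# this section. A minimal sketch, assuming it is a thin wrapper around
# tf.keras.layers.Dense taking a width, a kernel initializer, and an optional
# activation. The name `dense_sketch` is hypothetical and for illustration only.
def dense_sketch(units, kernel_initializer, activation=None):
    return tf.keras.layers.Dense(
        units, activation=activation, kernel_initializer=kernel_initializer)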
def get_masked_lm_output_albert(model_config, input_tensor, output_weights,
                                positions, label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = bert_common.gather_indexes(input_tensor, positions)

    with tf.compat.v1.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.compat.v1.variable_scope("transform"):
            input_tensor = tf.keras.layers.Dense(
                model_config.embedding_size,
                activation=bert_common.get_activation(model_config.hidden_act),
                kernel_initializer=bert_common.create_initializer(
                    model_config.initializer_range))(input_tensor)
            input_tensor = bert_common.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.compat.v1.get_variable(
            "output_bias",
            shape=[model_config.vocab_size],
            initializer=tf.compat.v1.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(
            label_ids, depth=model_config.vocab_size, dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
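# `loss_to_prob_pair` is used by the two loss heads below but defined elsewhere
# in the repo. A minimal sketch of what it presumably does, based on the
# exp(-loss) conversion in get_diff_loss above: map a per-position LM loss
# (a negative log-likelihood) to a two-class distribution
# [p(predicted correctly), p(predicted incorrectly)]. The name
# `loss_to_prob_pair_sketch` is hypothetical and for illustration only.
def loss_to_prob_pair_sketch(loss):
    prob = tf.exp(-loss)  # loss = -log p  =>  p = exp(-loss)
    return tf.stack([prob, 1.0 - prob], axis=-1)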
def train_modeling(self, input_tensor, masked_lm_positions, masked_lm_weights,
                   loss_base, loss_target):
    if self.graph_built:
        raise Exception("train_modeling() was already called; the graph is only built once.")

    batch_size, _, hidden_dims = get_shape_list(input_tensor)
    # Select the hidden states at the masked positions and restore the
    # [batch, num_predictions, hidden] layout.
    input_tensor = bc.gather_indexes(input_tensor, masked_lm_positions)
    input_tensor = tf.reshape(input_tensor, [batch_size, -1, hidden_dims])

    with tf.compat.v1.variable_scope("project"):
        hidden = self.layer1(input_tensor)

    def cross_entropy(logits, loss_label):
        # Soft-label cross entropy against the two-class distribution derived
        # from the LM loss; padded prediction slots are masked out.
        gold_prob = loss_to_prob_pair(loss_label)
        logits = tf.reshape(logits, gold_prob.shape)
        per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
            gold_prob, logits, axis=-1, name=None)
        per_example_loss = tf.cast(masked_lm_weights, tf.float32) * per_example_loss
        losses = tf.reduce_sum(per_example_loss, axis=1)
        loss = tf.reduce_mean(losses)
        return loss, per_example_loss

    with tf.compat.v1.variable_scope("cls1"):
        self.logits1 = self.logit_dense1(hidden)
    with tf.compat.v1.variable_scope("cls2"):
        self.logits2 = self.logit_dense2(hidden)

    self.loss1, self.per_example_loss1 = cross_entropy(self.logits1, loss_base)
    self.loss2, self.per_example_loss2 = cross_entropy(self.logits2, loss_target)
    # Probability of the "correct" class for each prediction slot.
    self.prob1 = tf.nn.softmax(self.logits1)[:, :, 0]
    self.prob2 = tf.nn.softmax(self.logits2)[:, :, 0]
    self.total_loss = self.loss1 + self.loss2
    self.graph_built = True
def get_loss_independently(bert_config, input_tensor, masked_lm_positions,
                           masked_lm_weights, loss_base, loss_target):
    input_tensor = bc.gather_indexes(input_tensor, masked_lm_positions)
    hidden = bc.dense(bert_config.hidden_size,
                      bc.create_initializer(bert_config.initializer_range),
                      bc.get_activation(bert_config.hidden_act))(input_tensor)

    def get_regression_and_loss(hidden_vector, loss_label):
        # Two-way classifier trained against the soft target derived from the
        # LM loss; padded prediction slots are masked out.
        logits = bc.dense(2, bc.create_initializer(
            bert_config.initializer_range))(hidden_vector)
        gold_prob = loss_to_prob_pair(loss_label)
        logits = tf.reshape(logits, gold_prob.shape)
        per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
            gold_prob, logits, axis=-1, name=None)
        per_example_loss = tf.cast(masked_lm_weights, tf.float32) * per_example_loss
        losses = tf.reduce_sum(per_example_loss, axis=1)
        loss = tf.reduce_mean(losses)
        return loss, per_example_loss, logits

    loss1, per_example_loss1, logits1 = get_regression_and_loss(hidden, loss_base)
    loss2, per_example_loss2, logits2 = get_regression_and_loss(hidden, loss_target)
    prob1 = tf.nn.softmax(logits1)[:, :, 0]
    prob2 = tf.nn.softmax(logits2)[:, :, 0]
    total_loss = loss1 + loss2
    return total_loss, loss1, loss2, per_example_loss1, per_example_loss2, prob1, prob2
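# A self-contained sanity check (illustrative only, not part of the training
# code) that the soft-label cross entropy used above equals the explicit
# -sum(gold_prob * log_softmax(logits)) form. All shapes and values are made up.
import tensorflow as tf

loss_label = tf.constant([[0.2, 1.5], [0.7, 0.05]])  # fake per-position LM losses
gold_prob = tf.stack([tf.exp(-loss_label), 1.0 - tf.exp(-loss_label)], axis=-1)
logits = tf.random.normal([2, 2, 2])  # [batch, num_predictions, 2]

ce_builtin = tf.nn.softmax_cross_entropy_with_logits(gold_prob, logits, axis=-1)
ce_manual = -tf.reduce_sum(gold_prob * tf.nn.log_softmax(logits, axis=-1), axis=-1)
print(float(tf.reduce_max(tf.abs(ce_builtin - ce_manual))))  # ~0.0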