def get_next_sentence_output(bert_config, input_tensor, labels,
                             label_weights=None):
  """Get loss and log probs for the next sentence prediction."""

  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  num_labels = FLAGS.num_cls_labels
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights_" + str(num_labels),
        shape=[num_labels, bert_config.hidden_size],
        initializer=modeling.create_initializer(bert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias_" + str(num_labels),
        shape=[num_labels],
        initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    labels = tf.reshape(labels, [-1])
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    if label_weights is not None:
      numerator = tf.reduce_sum(label_weights * per_example_loss)
      denominator = tf.reduce_sum(label_weights) + 1e-5
      loss = numerator / denominator
    else:
      loss = tf.reduce_mean(per_example_loss)
    return (loss, per_example_loss, log_probs)
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias_pretrained = tf.get_variable(
        "output_bias",
        shape=[bert_config.pretrained_vocab_size],
        initializer=tf.zeros_initializer())
    if bert_config.vocab_size == bert_config.pretrained_vocab_size:
      output_bias = output_bias_pretrained
    elif bert_config.vocab_size > bert_config.pretrained_vocab_size:
      output_bias_entities = tf.get_variable(
          "output_bias_entities_0",
          shape=[bert_config.vocab_size - bert_config.pretrained_vocab_size],
          initializer=tf.zeros_initializer())
      output_bias = tf.concat(
          [output_bias_pretrained, output_bias_entities], 0)
    else:  # pretrained_vocab_size > vocab_size
      raise Exception("Pretrained vocab cannot be larger than actual vocab!")

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])

    one_hot_labels = tf.one_hot(
        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
    numerator = tf.reduce_sum(label_weights * per_example_loss)
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator

  return (loss, per_example_loss, log_probs)
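

# Minimal usage sketch (illustrative only, not part of the original module):
# how the two loss functions above are typically wired together inside a
# pre-training `model_fn`. It assumes the surrounding module provides
# `modeling` (the standard BERT modeling library) and that the `features`
# dict uses the usual BERT pre-training keys; the helper name
# `_pretraining_losses_sketch` and the exact feature names are assumptions,
# so adapt them to the actual input pipeline if they differ.
def _pretraining_losses_sketch(bert_config, features, is_training):
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=features["input_ids"],
      input_mask=features["input_mask"],
      token_type_ids=features["segment_ids"])

  # Masked LM head: reuses the input embedding table as the output weights.
  (masked_lm_loss, _, _) = get_masked_lm_output(
      bert_config,
      model.get_sequence_output(),
      model.get_embedding_table(),
      features["masked_lm_positions"],
      features["masked_lm_ids"],
      features["masked_lm_weights"])

  # Sentence-level head: operates on the pooled [CLS] representation.
  (next_sentence_loss, _, _) = get_next_sentence_output(
      bert_config, model.get_pooled_output(),
      features["next_sentence_labels"])

  # The total pre-training objective is the sum of the two losses.
  return masked_lm_loss + next_sentence_loss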