import tensorflow as tf

import modeling

# `bf` is assumed to be this project's bfloat16 helper module (providing
# `i_cast` and `softmax`); it is imported elsewhere in the original file.


def get_next_sentence_output(bert_config, input_tensor, labels):
  """Get loss and log probs for the next sentence prediction."""
  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  with tf.compat.v1.variable_scope("cls/seq_relationship"):
    output_weights = tf.compat.v1.get_variable(
        "output_weights",
        shape=[2, bert_config.hidden_size],
        initializer=modeling.create_initializer(bert_config.initializer_range))
    output_bias = tf.compat.v1.get_variable(
        "output_bias", shape=[2], initializer=tf.compat.v1.zeros_initializer())

    input_tensor = bf.i_cast(input_tensor)
    output_weights = bf.i_cast(output_weights)
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    labels = tf.reshape(labels, [-1])
    one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(
        input_tensor=one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(input_tensor=per_example_loss)
    return (loss, per_example_loss, log_probs)
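# Illustrative sketch only (not part of the original file): one way the NSP
# head above could be wired to a BertModel during pre-training. The function
# name and the `next_sentence_labels` tensor are hypothetical; only
# `modeling.BertModel` and `get_next_sentence_output` come from the source.
def _example_next_sentence_head(bert_config, input_ids, input_mask,
                                segment_ids, next_sentence_labels):
  model = modeling.BertModel(
      config=bert_config,
      is_training=True,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids)
  # The NSP head consumes the pooled [CLS] representation.
  return get_next_sentence_output(
      bert_config, model.get_pooled_output(), next_sentence_labels)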
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.compat.v1.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.compat.v1.variable_scope("transform"):
      input_tensor = tf.compat.v1.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.compat.v1.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.compat.v1.zeros_initializer())

    input_tensor = bf.i_cast(input_tensor)
    output_weights = bf.i_cast(output_weights)
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])

    one_hot_labels = tf.one_hot(
        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    per_example_loss = -tf.reduce_sum(
        input_tensor=log_probs * one_hot_labels, axis=-1)
    numerator = tf.reduce_sum(input_tensor=label_weights * per_example_loss)
    denominator = tf.reduce_sum(input_tensor=label_weights) + 1e-5
    loss = numerator / denominator

  return (loss, per_example_loss, log_probs)
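# `gather_indexes` is called above but defined elsewhere in the original file.
# The sketch below shows its assumed behavior (consistent with the upstream
# BERT run_pretraining.py) under a hypothetical name so it does not shadow the
# real helper: it flattens [batch, seq, width] to [batch * seq, width] and
# gathers the hidden states at the masked `positions`.
def _gather_indexes_sketch(sequence_tensor, positions):
  sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3)
  batch_size = sequence_shape[0]
  seq_length = sequence_shape[1]
  width = sequence_shape[2]

  # Offset each example's positions so they index into the flattened tensor.
  flat_offsets = tf.reshape(
      tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
  flat_positions = tf.reshape(positions + flat_offsets, [-1])
  flat_sequence_tensor = tf.reshape(
      sequence_tensor, [batch_size * seq_length, width])
  return tf.gather(flat_sequence_tensor, flat_positions)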
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
  """Creates a classification model."""
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # In the demo, we are doing a simple classification task on the entire
  # segment.
  #
  # If you want to use the token-level output, use model.get_sequence_output()
  # instead.
  output_layer = model.get_pooled_output()

  # hidden_size = output_layer.shape[-1].value  # `.value` does not work in TF2.
  hidden_size = output_layer.shape[-1]

  output_weights = tf.compat.v1.get_variable(
      "cls/classifier/output_weights", [num_labels, hidden_size],
      initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.compat.v1.get_variable(
      "cls/classifier/output_bias", [num_labels],
      initializer=tf.compat.v1.zeros_initializer())

  with tf.compat.v1.variable_scope("loss"):
    if is_training:
      # Apply 10% dropout during training only.
      output_layer = tf.nn.dropout(output_layer, rate=0.1)

    output_layer = bf.i_cast(output_layer)
    output_weights = bf.i_cast(output_weights)
    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    probabilities = bf.softmax(logits, axis=-1)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    per_example_loss = -tf.reduce_sum(
        input_tensor=one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(input_tensor=per_example_loss)

    return (loss, per_example_loss, logits, probabilities)
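# Illustrative sketch only (not part of the original file): calling
# `create_model` from a fine-tuning model_fn. The function name, the
# `features` dictionary keys, and the argmax prediction step are assumptions
# for the example; only `create_model` itself comes from the source.
def _example_classifier(bert_config, features, num_labels, is_training):
  (loss, per_example_loss, logits, probabilities) = create_model(
      bert_config=bert_config,
      is_training=is_training,
      input_ids=features["input_ids"],
      input_mask=features["input_mask"],
      segment_ids=features["segment_ids"],
      labels=features["label_ids"],
      num_labels=num_labels,
      use_one_hot_embeddings=False)
  # Class predictions are the argmax over the logits.
  predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
  return loss, predictions, probabilities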