Example #1
    def get_next_sentence_output(self, bert_config, input_tensor, labels):
        """Get loss and log probs for the next sentence prediction."""

        # Simple binary classification. Note that 0 is "next sentence" and 1 is
        # "random sentence". This weight matrix is not used after pre-training.
        with tf.variable_scope("cls/seq_relationship"):
            output_weights = tf.get_variable(
                "output_weights",
                shape=[2, bert_config.hidden_size],
                initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            output_bias = tf.get_variable("output_bias",
                                          shape=[2],
                                          initializer=tf.zeros_initializer())

            logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            labels = tf.reshape(labels, [-1])
            one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                              axis=-1)
            loss = tf.reduce_mean(per_example_loss)
            return (loss, per_example_loss, log_probs)
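A hedged sketch of how this head is typically wired up during pretraining
(mirroring BERT's run_pretraining.py; `model`, `next_sentence_labels`, and the
variable names below are assumptions, not part of the original code):

    # Hypothetical call site: `model` is a modeling.BertModel and
    # `next_sentence_labels` has shape [batch_size] with 0/1 values.
    (nsp_loss, nsp_example_loss, nsp_log_probs) = self.get_next_sentence_output(
        bert_config, model.get_pooled_output(), next_sentence_labels)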
Example #2
def encode(embedding_output, input_ids, input_mask, token_type_ids, config):
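    """Encode precomputed word embeddings with the (reused) BERT encoder.

    Applies the embedding post-processor and transformer stack under
    reuse=True, then returns either the [CLS] pooler output or a mean-pooled
    layer, depending on FLAGS.use_cls_token.
    """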
    with tf.variable_scope("bert", reuse=True):
        with tf.variable_scope("embeddings", reuse=True):
            embedding_output = modeling.embedding_postprocessor(
                input_tensor=embedding_output,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)

        with tf.variable_scope("encoder", reuse=True):
            attention_mask = modeling.create_attention_mask_from_input_mask(
                input_ids, input_mask)

            all_encoder_layers, _ = modeling.transformer_model(
                input_tensor=embedding_output,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

        # Prepend the embedding output so that index 0 of all_encoder_layers
        # is the pre-encoder embeddings and index i is transformer layer i.
        all_encoder_layers = [embedding_output] + all_encoder_layers
        if FLAGS.use_cls_token:
            with tf.variable_scope("pooler", reuse=True):
                first_token_tensor = tf.squeeze(
                    all_encoder_layers[-1][:, 0:1, :], 1)
                pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))
        else:
            # No [CLS] pooling: mean-pool a (possibly lower) layer over the
            # non-padding tokens given by input_mask.
            sequence_output = all_encoder_layers[FLAGS.low_layer_idx]
            pooled_output = mean_pool(sequence_output, input_mask)
    return pooled_output
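A hedged sketch of a call site, assuming a modeling.BertModel was built first
(so the reuse=True scopes find existing variables) and that the surrounding
script defines FLAGS and mean_pool; every name besides `encode` is an
assumption:

    # Hypothetical call site.
    model = modeling.BertModel(config=config,
                               is_training=False,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=token_type_ids)
    # Raw word embeddings, before position/type embeddings are added
    # (encode applies embedding_postprocessor itself):
    word_embeddings = tf.nn.embedding_lookup(model.get_embedding_table(),
                                             input_ids)
    pooled = encode(word_embeddings, input_ids, input_mask, token_type_ids,
                    config)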
Example #3
    def get_masked_lm_output(self, bert_config, input_tensor, output_weights,
                             positions, label_ids, label_weights):
        """Get loss and log probs for the masked LM."""
        # `gather_indexes` (defined alongside these methods, as in BERT's
        # run_pretraining.py) picks out the hidden vectors at `positions`.
        input_tensor = gather_indexes(input_tensor, positions)

        with tf.variable_scope("cls/predictions"):
            # We apply one more non-linear transformation before the output layer.
            # This matrix is not used after pre-training.
            with tf.variable_scope("transform"):
                input_tensor = tf.layers.dense(
                    input_tensor,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range))
                input_tensor = modeling.layer_norm(input_tensor)

            # The output weights are the same as the input embeddings, but there is
            # an output-only bias for each token.
            output_bias = tf.get_variable("output_bias",
                                          shape=[bert_config.vocab_size],
                                          initializer=tf.zeros_initializer())
            logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            log_probs = tf.nn.log_softmax(logits, axis=-1)

            label_ids = tf.reshape(label_ids, [-1])
            label_weights = tf.reshape(label_weights, [-1])

            one_hot_labels = tf.one_hot(label_ids,
                                        depth=bert_config.vocab_size,
                                        dtype=tf.float32)

            # The `positions` tensor might be zero-padded (if the sequence is too
            # short to have the maximum number of predictions). The `label_weights`
            # tensor has a value of 1.0 for every real prediction and 0.0 for the
            # padding predictions.
            per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                              axis=[-1])
            numerator = tf.reduce_sum(label_weights * per_example_loss)
            denominator = tf.reduce_sum(label_weights) + 1e-5
            loss = numerator / denominator

        return (loss, per_example_loss, log_probs)
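A hedged sketch of how the masked LM head is typically wired up (mirroring
BERT's run_pretraining.py; the `model` and `masked_lm_*` names are assumptions,
taken from the standard pretraining input pipeline):

    # Hypothetical call site: the embedding table doubles as output weights.
    (mlm_loss, mlm_example_loss, mlm_log_probs) = self.get_masked_lm_output(
        bert_config,
        model.get_sequence_output(),   # [batch_size, seq_len, hidden_size]
        model.get_embedding_table(),   # tied input/output embedding weights
        masked_lm_positions,
        masked_lm_ids,
        masked_lm_weights)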