Example #1
def get_diff_loss(bert_config, input_tensor, masked_lm_positions,
                  masked_lm_weights, loss_base, loss_target):
    """Trains a small head to regress the per-token probability gap between
    the base and target models from their masked LM losses."""
    # p = exp(-loss) recovers the probability a model assigned to the correct
    # token from its per-token cross-entropy loss.
    base_prob = tf.exp(-loss_base)
    target_prob = tf.exp(-loss_target)

    prob_diff = base_prob - target_prob

    # Keep only the hidden states at the masked positions.
    input_tensor = bc.gather_indexes(input_tensor, masked_lm_positions)
    with tf.compat.v1.variable_scope("diff_loss"):

        hidden = bc.dense(bert_config.hidden_size,
                          bc.create_initializer(bert_config.initializer_range),
                          bc.get_activation(
                              bert_config.hidden_act))(input_tensor)

        logits = bc.dense(1,
                          bc.create_initializer(
                              bert_config.initializer_range))(hidden)
        logits = tf.reshape(logits, prob_diff.shape)

    # Masked L1 regression loss against the probability gap; padded prediction
    # slots (weight 0.0) are zeroed out before the reduction.
    per_example_loss = tf.abs(prob_diff - logits)
    per_example_loss = tf.cast(masked_lm_weights,
                               tf.float32) * per_example_loss
    losses = tf.reduce_sum(per_example_loss, axis=1)
    loss = tf.reduce_mean(losses)

    return loss, per_example_loss, logits
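
For orientation, the quantity this head regresses and the masked L1 reduction can be reproduced with plain TensorFlow. The standalone sketch below uses toy shapes and made-up loss values (all assumptions, not taken from the original project):

import tensorflow as tf

# Toy setup: 2 sequences, 4 masked prediction slots each (assumed shapes).
loss_base = tf.constant([[0.5, 1.2, 0.1, 0.0], [2.0, 0.3, 0.0, 0.0]])
loss_target = tf.constant([[0.4, 1.5, 0.2, 0.0], [1.0, 0.3, 0.0, 0.0]])
masked_lm_weights = tf.constant([[1.0, 1.0, 1.0, 0.0], [1.0, 1.0, 0.0, 0.0]])

# Regression target: gap between the correct-token probabilities
# p = exp(-loss) of the base and target models.
prob_diff = tf.exp(-loss_base) - tf.exp(-loss_target)

# Stand-in for the head's prediction (zeros here, just for illustration).
predicted = tf.zeros_like(prob_diff)

# Masked L1 loss, as in get_diff_loss: padded slots (weight 0.0) contribute nothing.
per_example_loss = tf.abs(prob_diff - predicted) * masked_lm_weights
loss = tf.reduce_mean(tf.reduce_sum(per_example_loss, axis=1))
print(float(loss))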
Example #2
def get_masked_lm_output_albert(model_config, input_tensor, output_weights,
                                positions, label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = bert_common.gather_indexes(input_tensor, positions)

    with tf.compat.v1.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.compat.v1.variable_scope("transform"):
            input_tensor = tf.keras.layers.Dense(
                model_config.embedding_size,
                activation=bert_common.get_activation(model_config.hidden_act),
                kernel_initializer=bert_common.create_initializer(
                    model_config.initializer_range))(input_tensor)
            input_tensor = bert_common.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.compat.v1.get_variable(
            "output_bias",
            shape=[model_config.vocab_size],
            initializer=tf.compat.v1.zeros_initializer())
        print("output_weights", output_weights.shape)
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=model_config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(
            input_tensor=log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(input_tensor=label_weights *
                                  per_example_loss)
        denominator = tf.reduce_sum(input_tensor=label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
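
As the comment in this example states, the output projection reuses the input embedding table and only learns a per-token bias. The standalone sketch below isolates that weight-tying pattern; the toy vocabulary and embedding sizes are assumptions:

import tensorflow as tf

vocab_size, embedding_size, num_predictions = 10, 4, 3  # toy sizes (assumed)

# Embedding table shared between the input lookup and the output projection.
embedding_table = tf.random.normal([vocab_size, embedding_size])

# Transformed hidden states for the masked positions, already projected to
# embedding_size as in the "transform" scope above.
hidden = tf.random.normal([num_predictions, embedding_size])

# Tied output layer: logits = hidden @ embedding_table^T + bias.
output_bias = tf.zeros([vocab_size])
logits = tf.nn.bias_add(tf.matmul(hidden, embedding_table, transpose_b=True),
                        output_bias)
log_probs = tf.nn.log_softmax(logits, axis=-1)
print(log_probs.shape)  # (3, 10)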
Example #3
    def train_modeling(self, input_tensor, masked_lm_positions,
                       masked_lm_weights, loss_base, loss_target):
        if self.graph_built:
            raise Exception("train_modeling() was already called; the graph is built.")
        batch_size, _, hidden_dims = get_shape_list(input_tensor)
        input_tensor = bc.gather_indexes(input_tensor, masked_lm_positions)
        input_tensor = tf.reshape(input_tensor, [batch_size, -1, hidden_dims])
        with tf.compat.v1.variable_scope("project"):
            hidden = self.layer1(input_tensor)

        def cross_entropy(logits, loss_label):
            # Turn the observed per-token loss into a two-class probability
            # target and score the head's logits against it.
            gold_prob = loss_to_prob_pair(loss_label)
            logits = tf.reshape(logits, gold_prob.shape)

            per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=gold_prob, logits=logits, axis=-1)
            # Mask out padded prediction slots before reducing.
            per_example_loss = tf.cast(masked_lm_weights,
                                       tf.float32) * per_example_loss
            losses = tf.reduce_sum(per_example_loss, axis=1)
            loss = tf.reduce_mean(losses)
            return loss, per_example_loss

        with tf.compat.v1.variable_scope("cls1"):
            self.logits1 = self.logit_dense1(hidden)
        with tf.compat.v1.variable_scope("cls2"):
            self.logits2 = self.logit_dense2(hidden)

        self.loss1, self.per_example_loss1 = cross_entropy(
            self.logits1, loss_base)
        self.loss2, self.per_example_loss2 = cross_entropy(
            self.logits2, loss_target)

        # Class-0 probability from each two-way head.
        self.prob1 = tf.nn.softmax(self.logits1)[:, :, 0]
        self.prob2 = tf.nn.softmax(self.logits2)[:, :, 0]

        self.total_loss = self.loss1 + self.loss2
        self.graph_built = True
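
loss_to_prob_pair is referenced here and in Example #4 but is not defined in this listing. A minimal sketch of what it presumably computes, consistent with the exp(-loss) conversion in Example #1 and the two-class cross-entropy targets used here, might look like the following; the actual definition in the original project may differ:

import tensorflow as tf

def loss_to_prob_pair(loss):
    # Presumed helper: map a per-token cross-entropy loss to a two-class
    # probability target [p, 1 - p], where p = exp(-loss) is the probability
    # the scored model assigned to the correct token.
    prob = tf.exp(-loss)
    return tf.stack([prob, 1.0 - prob], axis=-1)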
Example #4
def get_loss_independently(bert_config, input_tensor, masked_lm_positions,
                           masked_lm_weights, loss_base, loss_target):
    """Fits two independent two-way heads, one per loss signal, on the masked positions."""
    input_tensor = bc.gather_indexes(input_tensor, masked_lm_positions)

    hidden = bc.dense(bert_config.hidden_size,
                      bc.create_initializer(bert_config.initializer_range),
                      bc.get_activation(bert_config.hidden_act))(input_tensor)

    def get_regression_and_loss(hidden_vector, loss_label):
        # Two-way classification head trained toward loss_to_prob_pair(loss_label).
        logits = bc.dense(2,
                          bc.create_initializer(
                              bert_config.initializer_range))(hidden_vector)
        gold_prob = loss_to_prob_pair(loss_label)
        logits = tf.reshape(logits, gold_prob.shape)

        per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
            labels=gold_prob, logits=logits, axis=-1)
        # Mask out padded prediction slots before reducing.
        per_example_loss = tf.cast(masked_lm_weights,
                                   tf.float32) * per_example_loss
        losses = tf.reduce_sum(per_example_loss, axis=1)
        loss = tf.reduce_mean(losses)

        return loss, per_example_loss, logits

    loss1, per_example_loss1, logits1 = get_regression_and_loss(
        hidden, loss_base)
    loss2, per_example_loss2, logits2 = get_regression_and_loss(
        hidden, loss_target)

    # Class-0 probability from each head.
    prob1 = tf.nn.softmax(logits1)[:, :, 0]
    prob2 = tf.nn.softmax(logits2)[:, :, 0]

    total_loss = loss1 + loss2
    return total_loss, loss1, loss2, per_example_loss1, per_example_loss2, prob1, prob2
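
prob1 and prob2 are the class-0 outputs of the two softmax heads; under the loss_to_prob_pair reading sketched after Example #3, they can be interpreted as each model's predicted probability of getting the masked token right. A small standalone illustration with made-up logits (assumed values):

import tensorflow as tf

# Toy two-way logits for 1 sequence with 2 masked positions (assumed values).
logits1 = tf.constant([[[2.0, 0.0], [0.0, 1.0]]])  # base-model head
logits2 = tf.constant([[[1.0, 0.0], [0.0, 2.0]]])  # target-model head

prob1 = tf.nn.softmax(logits1)[:, :, 0]
prob2 = tf.nn.softmax(logits2)[:, :, 0]

# Positions where the target head assigns a higher class-0 probability
# than the base head.
target_better = prob2 > prob1
print(prob1.numpy(), prob2.numpy(), target_better.numpy())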