def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(
        tf.cast(input_tensor, tf.float32), output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])

    one_hot_labels = tf.one_hot(
        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
    numerator = tf.reduce_sum(label_weights * per_example_loss)
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator

  return (loss, per_example_loss, log_probs)
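
A minimal sketch of how this head is typically wired up during pre-training; model, masked_lm_positions, masked_lm_ids, and masked_lm_weights are assumed to come from a modeling.BertModel and the pre-training input pipeline, as in the model_fn further down this page:

(masked_lm_loss, masked_lm_example_loss,
 masked_lm_log_probs) = get_masked_lm_output(
     bert_config, model.get_sequence_output(), model.get_embedding_table(),
     masked_lm_positions, masked_lm_ids, masked_lm_weights)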
Example #2
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])

    one_hot_labels = tf.one_hot(
        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
    numerator = tf.reduce_sum(label_weights * per_example_loss)
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator

  return (loss, per_example_loss, log_probs)
Example #3
def get_classification_loss(model_config, pool_output, class_label, n_class):
    """Get loss and log probs for sentence-level classification."""
    with tf.variable_scope("cls/seq_relationship"):
        output_weights = tf.get_variable(
            "output_weights",
            shape=[n_class, model_config.hidden_size],
            initializer=modeling.create_initializer(
                model_config.initializer_range))
        output_bias = tf.get_variable("output_bias",
                                      shape=[n_class],
                                      initializer=tf.zeros_initializer())

        logits = tf.matmul(pool_output, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        labels = tf.reshape(class_label, [-1])
        one_hot_labels = tf.one_hot(labels, depth=n_class, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return loss, per_example_loss, log_probs
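
A minimal usage sketch, assuming pool_output comes from model.get_pooled_output() and class_label holds integer class ids in [0, n_class):

loss, per_example_loss, log_probs = get_classification_loss(
    model_config, model.get_pooled_output(), class_label, n_class)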
Example #4
    def __call__(self, x, y, sequence_length):
        x = tf.reshape(x, (-1, self.hidden_dim))

        self.logits = tf.layers.dense(x, self.output_shape,
                                      activation=self.activation,
                                      kernel_initializer=modeling.create_initializer(self.initializer_range))
        self.targets = tf.to_float(y)
        self.preds = tf.reshape(self.logits, [-1, self.max_length])
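        # Note: despite its name, "accuracy" below is a masked mean squared
        # error, averaged only over positions whose target value is non-zero.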
        istarget = tf.to_float(tf.not_equal(self.targets, 0))
        self.accuracy = tf.reduce_sum(tf.to_float(tf.square(tf.subtract(self.preds, self.targets))) * istarget) / (
            tf.reduce_sum(istarget))

        istargetv2 = tf.to_float(sequence_length)
        self.accuracy2 = tf.reduce_sum(tf.to_float(tf.square(tf.subtract(self.preds, self.targets))) * istargetv2) / (
            tf.reduce_sum(istargetv2))

        self.loss = self.loss_layer(self.logits, self.targets, sequence_length)
        # self.loss = self.crf_loss_layer(self.logits, self.targets, sequence_length)
        return
Example #5
def get_mlm_output(input_tensor, albert_config, mlm_positions, output_weights,
                   label_ids, label_weights):
    """From run_pretraining.py."""
    input_tensor = gather_indexes(input_tensor, mlm_positions)
    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=albert_config.embedding_size,
                activation=modeling.get_activation(albert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    albert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[albert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=albert_config.vocab_size,
                                    dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

        masked_lm_log_probs = tf.reshape(log_probs, [-1, log_probs.shape[-1]])
        masked_lm_predictions = tf.argmax(masked_lm_log_probs,
                                          axis=-1,
                                          output_type=tf.int32)
        # return masked_lm_predictions
        return loss, per_example_loss
Example #6
def get_next_sentence_output(bert_config, input_tensor, labels):
  """Get loss and log probs for the next sentence prediction."""

  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, bert_config.hidden_size],
        initializer=modeling.create_initializer(bert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    labels = tf.reshape(labels, [-1])
    one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, per_example_loss, log_probs)
Example #7
def get_next_sentence_output(bert_config, input_tensor, labels):
  """Get loss and log probs for the next sentence prediction."""

  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, bert_config.hidden_size],
        initializer=modeling.create_initializer(bert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    labels = tf.reshape(labels, [-1])
    one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, per_example_loss, log_probs)
def get_next_sentence_output(bert_config, input_tensor, labels, num_classes):
    """Get loss and log probs for the next sentence prediction."""
    with tf.variable_scope('cls/seq_relationship'):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[num_classes, bert_config.hidden_size],
            initializer=modeling.create_initializer(
                bert_config.initializer_range))
        output_bias = tf.get_variable('output_bias',
                                      shape=[num_classes],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        labels = tf.reshape(labels, [-1])
        one_hot_labels = tf.one_hot(labels,
                                    depth=num_classes,
                                    dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, per_example_loss, log_probs)
    def __init__(self, config, tf_dtype, input_hidden, embedding_table):
        # Keep variable names the same as BERT
        with tf.variable_scope("cls"):
            with tf.variable_scope("predictions"):
                with tf.variable_scope("transform"):
                    self.transformed_output = tf.layers.dense(
                        input_hidden,
                        config.hidden_size,
                        activation=modeling.get_activation(config.hidden_act),
                        kernel_initializer=modeling.create_initializer(
                            config.initializer_range))
                    self.transformed_output = modeling.layer_norm(
                        self.transformed_output)

                output_bias = tf.Variable(tf.zeros([config.vocab_size],
                                                   dtype=tf_dtype),
                                          name="output_bias",
                                          dtype=tf_dtype)
                self.final_output = tf.add(
                    tf.matmul(self.transformed_output,
                              tf.transpose(embedding_table)), output_bias)
                self.probs = tf.nn.softmax(self.final_output,
                                           name='token_probs')
Example #10
def get_mlm_logits(input_tensor, albert_config, mlm_positions, output_weights):
    """From run_pretraining.py."""
    input_tensor = gather_indexes(input_tensor, mlm_positions)
    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=albert_config.embedding_size,
                activation=modeling.get_activation(albert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    albert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[albert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
    return logits
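
The returned logits have shape [batch_size * num_masked, vocab_size], so token predictions can be read off with an argmax (a sketch; model is assumed to be an AlbertModel exposing get_sequence_output() and get_embedding_table()):

mlm_logits = get_mlm_logits(model.get_sequence_output(), albert_config,
                            mlm_positions, model.get_embedding_table())
mlm_predictions = tf.argmax(mlm_logits, axis=-1, output_type=tf.int32)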
Example #11
def get_shuffle_loss(model_config, seq_output, label_ids, label_weights):
    sequence_shape = modeling.get_shape_list(seq_output, expected_rank=[3])
    seq_length = sequence_shape[1]
    width = sequence_shape[2]

    seq_output = tf.reshape(seq_output, [-1, width])
    with tf.variable_scope("cls/shuffle"):
        with tf.variable_scope("transform"):
            seq_output = tf.layers.dense(
                seq_output,
                units=seq_length,
                activation=modeling.get_activation(model_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    model_config.initializer_range))
            seq_output = modeling.layer_norm(seq_output)

        output_bias = tf.get_variable("output_bias",
                                      shape=[seq_length],
                                      initializer=tf.zeros_initializer())

        logits = tf.nn.bias_add(seq_output, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(tf.cast(label_weights, tf.float32), [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=seq_length,
                                    dtype=tf.float32)

        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return loss, per_example_loss, log_probs
def init(max_sequence_length, bert_config_file, model_path, vocab_file):
    sess = tf.Session()
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    bert_config = modeling.BertConfig.from_json_file(bert_config_file)

    input_ids = tf.placeholder(tf.int32, shape=[None, max_sequence_length], name='input_ids')
    input_mask = tf.placeholder(tf.int32, shape=[None, max_sequence_length], name='input_mask')
    segment_ids = tf.placeholder(tf.int32, shape=[None, max_sequence_length], name='segment_ids')

    with sess.as_default():
        model = modeling.BertModel(
            config=bert_config,
            is_training=False,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=False)

        output_layer = model.get_pooled_output()

        with tf.variable_scope("cls/seq_relationship"):
            output_weights = tf.get_variable(
                "output_weights",
                shape=[2, bert_config.hidden_size],
                initializer=modeling.create_initializer(bert_config.initializer_range))
            output_bias = tf.get_variable(
                "output_bias", shape=[2], initializer=tf.zeros_initializer())

            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            probs = tf.nn.softmax(logits, axis=-1, name='probs')

        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, model_path)

    return sess, tokenizer
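
A sketch of running inference with the returned session. The tensor names assume TF1's default naming for the graph built above, and input_ids_batch, input_mask_batch, and segment_ids_batch stand in for padded numpy arrays produced with the tokenizer; none of these are defined on this page:

sess, tokenizer = init(max_sequence_length, bert_config_file,
                       model_path, vocab_file)
graph = sess.graph
probs = sess.run(
    graph.get_tensor_by_name('cls/seq_relationship/probs:0'),
    feed_dict={
        graph.get_tensor_by_name('input_ids:0'): input_ids_batch,
        graph.get_tensor_by_name('input_mask:0'): input_mask_batch,
        graph.get_tensor_by_name('segment_ids:0'): segment_ids_batch,
    })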
Example #13
    def build_model(self):
        from modeling import transformer_model, create_attention_mask_from_input_mask

        if self.is_training:
            dropout_prob = 0.1
        else:
            dropout_prob = 0.0

        attention1_mask = create_attention_mask_from_input_mask(
            self.sent1, self.sent1_mask)
        attention2_mask = create_attention_mask_from_input_mask(
            self.sent2, self.sent2_mask)

        # sent1 = transformer_model(self.sent1, attention1_mask,
        #                           hidden_size=768, num_hidden_layers=1,
        #                           intermediate_size=3072,
        #                           hidden_dropout_prob=dropout_prob,
        #                           attention_probs_dropout_prob=dropout_prob)
        # sent2 = transformer_model(self.sent2, attention2_mask,
        #                           hidden_size=768, num_hidden_layers=1,
        #                           intermediate_size=3072,
        #                           hidden_dropout_prob=dropout_prob,
        #                           attention_probs_dropout_prob=dropout_prob)
        sent1 = self.sent1
        sent2 = self.sent2

        d_vec = self.DCMN(sent1, sent2, self.sent1_mask, self.sent2_mask)

        gate = tf.layers.dense(tf.concat([d_vec, self.mark0], axis=1),
                               768,
                               activation=tf.sigmoid,
                               kernel_initializer=create_initializer(0.02))

        refer_output = self.mark0 * gate + (1 - gate) * d_vec

        return refer_output
Example #14
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

    return log_probs
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        loss = tf.reshape(per_example_loss, [-1, tf.shape(positions)[1]])
        # TODO: dynamic gather from per_example_loss
    return loss
Example #16
def get_sentence_direction_output(bert_config, input_tensor, labels):
    """Get loss and log probs for the sentence direction prediction."""

    # Simple ternary classification. Note that forward = 1, unrelated = 2,
    # and backward = 0.
    with tf.variable_scope("cls/seq_direction2"):
        output_weights2 = tf.get_variable(
            "output_weights2",
            shape=[3, bert_config.hidden_size],
            initializer=modeling.create_initializer(bert_config.initializer_range))
        output_bias2 = tf.get_variable(
            "output_bias2", shape=[3], initializer=tf.zeros_initializer())

        
        logits = tf.matmul(input_tensor, output_weights2, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias2)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        labels = tf.reshape(labels, [-1])
        one_hot_labels = tf.one_hot(labels, depth=3, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, per_example_loss, log_probs)
Example #17
    def greedy_decode_8steps(
            self,
            cls_vector,  # batch, 1, hid_size
            sequence_output):  # batch, seq_len, hid_size
        hparams = self.hparams

        # When features into self.body() doesn't have 'targets' and 'theorem'
        # then we are in predict/infer mode. Since there is only a small
        # number of unrolling steps for the output, (1 for predicting theorem
        # and another 7 for the theorem premise), we build a static graph
        # to do greedy decode.

        # Here we cache the activations during decoding.
        # for each layer of the decoding transformer, we store
        # a tensor of size [batch, current_length, hidden_dim]
        # at first current_length = 0:
        cached_layers = [
            tf.zeros_like(cls_vector[:, :0, :])  # [batch, 0, hid_size]
            for _ in range(hparams.num_decode_layers)
        ]

        # We also store all the premise prediction into a tensor
        # of shape [batch, current_length]
        premises = tf.zeros_like(
            cls_vector[:, :0, 0],  # [batch, 0]
            dtype=tf.int32)

        # The first token to be processed is the CLS vector.
        decoder_input = cls_vector

        # Now we build the static unrolling of 8-step decoding,
        # each step update a new value for decoder_input
        for count in range(8):
            current_lengths = [
                layer.shape.as_list()[1] for layer in cached_layers
            ]
            assert current_lengths[1:] == current_lengths[:-1]
            current_length = current_lengths[0]
            with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
                # cached_layers will be updated inside this method.
                # Feed this single token into the decoder transformer.
                output_vector = self.one_column_cached_transformer(
                    decoder_input,  # batch, 1, hid_size
                    # list of num_hid_layers tensors, each of shape
                    # [batch, current_length, hidden_size]
                    cached_layers)  # [batch, 1, hid_size]

            # After this step, all tensors in cached_layers
            # increased 1 in length:
            assert cached_layers[0].shape.as_list()[1] == current_length + 1

            # Next the output vector is used to predict theorem
            # if we are at step 0, otherwise predict premise.
            with tf.variable_scope('prediction', reuse=tf.AUTO_REUSE):
                if count == 0:
                    theorem_logits = tf.keras.layers.Dense(  # [batch, 1, num_theorems]
                        name='theorem',
                        units=hparams.num_theorems,
                        use_bias=True,
                        kernel_initializer=modeling.create_initializer(
                            hparams.initializer_range))(output_vector)
                    theorem = tf.argmax(  # [batch, 1]
                        theorem_logits,  # [batch, 1, num_theorems]
                        axis=-1,
                        output_type=tf.int32)
                else:
                    premise_logits = tf.matmul(  # batch, 1, seq_len
                        a=output_vector,  # [batch, 1, hid_size]
                        b=sequence_output,  # [batch, sequence_len, hid_size]
                        transpose_b=True,
                    )  # [batch, 1, sequence_len]
                    premise = tf.argmax(  # [batch, 1]
                        premise_logits,  # [batch, 1, seq_len]
                        axis=-1,
                        output_type=tf.int32)

                    # [batch, current_len + 1]
                    premises = tf.concat([premises, premise], axis=1)

                    # [batch, 1, hid_size]
                    decoder_input = premise_gather_nd(sequence_output, premise)
                    continue

            # For theorem prediction, we need to go back to variable scope
            # decoder/embedding to get the new decoder_input
            with tf.variable_scope('decoder/embeddings', reuse=tf.AUTO_REUSE):
                # [batch, 1, hid_size] and [num_theorems, hid_size]
                # from the theorem_embedding lookup table.
                decoder_input, _ = modeling.embedding_lookup(
                    input_ids=theorem,  # [batch, 1]
                    vocab_size=hparams.num_theorems,
                    embedding_size=hparams.hidden_size,
                    initializer_range=hparams.initializer_range,
                    word_embedding_name='theorem_embedding',
                )

        logits = dict(
            theorem=theorem,  # [batch, 1]
            premises=premises)  # [batch, 7]
        losses = dict(training=tf.constant(0.0))
        return logits, losses
Example #18
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    # input_tensor:[batch_size, seq_length, hidden_size]
    # positions:[batch_size, mask_num]
    # output_weights: [vocab_size, embedding_size]
    # -> input_tensor:[batch_size*mask_num, hidden_size]
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            # Apply one more non-linear transformation before the output;
            # it is only used during pre-training.
            # new input_tensor:[batch_size*mask_num, hidden_size]
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            # new input_tensor:[batch_size*mask_num, hidden_size]
            input_tensor = modeling.layer_norm(input_tensor)

        tf.logging.info("input tensor shape after transform:{}".format(
            input_tensor.shape))

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        # output_bias:[vocab_size,]
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())

        # input_tensor:[batch_size*mask_num, hidden_size]
        # output weights: [vocab_size, embedding_size=hidden_size]
        # logits:[batch_size*mask_num, vocab_size]
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        # output_bias:[vocab_size]
        logits = tf.nn.bias_add(logits, output_bias)
        # log_probs:[batch_size*mask_num, vocab_size]
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # label_ids:[batch_size, mask_num]
        # new label_ids:[batch_size*mask_num]
        label_ids = tf.reshape(label_ids, [-1])
        # new label_weights:[batch_size*mask_num]
        label_weights = tf.reshape(label_weights, [-1])

        # one_hot_labels:[batch_size*mask_num, vocab_size]
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.

        # log_probs:[batch_size*mask_num, vocab_size]
        # one_hot_labels:[batch_size*mask_num, vocab_size]
        # per_example_loss:[batch_size*mask_num]
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])  # cross-entropy loss
        # Weight each prediction by its label weight.
        # label_weights:[batch_size*mask_num]
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        # Loss normalized by the total label weight.
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
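
A tiny standalone numpy check (not part of the original snippet) of the weighted reduction above: with two real predictions and one zero-weighted padding slot, the loss reduces to the mean over the real predictions only.

import numpy as np

per_example_loss = np.array([2.0, 4.0, 7.0])  # last entry is a padding slot
label_weights = np.array([1.0, 1.0, 0.0])     # 0.0 masks out the padding slot
loss = np.sum(label_weights * per_example_loss) / (np.sum(label_weights) + 1e-5)
print(loss)  # ~3.0, i.e. mean(2.0, 4.0)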
Example #19
def get_masked_span_output(bert_config, input_tensor, input_mask,  positions, start_labels, end_labels, label_weights):
    """Get loss and log probs for the recurring span masking."""
    sequence_shape = modeling.get_shape_list(input_tensor, expected_rank=3)
    batch_size = sequence_shape[0]
    seq_length = sequence_shape[1]
    width = sequence_shape[2]
    num_positions = modeling.get_shape_list(positions, expected_rank=2)[1]

    query_tensor = gather_indexes(input_tensor, positions)  # [batch_size * num_positions, width]

    with tf.variable_scope("cls/span_predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("query_start_transform"):
            query_start_tensor = tf.layers.dense(
                query_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            query_start_tensor = modeling.layer_norm(query_start_tensor)

        with tf.variable_scope("query_end_transform"):
            query_end_tensor = tf.layers.dense(
                query_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            query_end_tensor = modeling.layer_norm(query_end_tensor)

        with tf.variable_scope("start_transform"):
            start_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            start_tensor = modeling.layer_norm(start_tensor)

        with tf.variable_scope("end_transform"):
            end_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            end_tensor = modeling.layer_norm(end_tensor)

        start_classifier = tf.get_variable(
            "start_classifier",
            shape=[bert_config.hidden_size, bert_config.hidden_size],
            initializer=modeling.create_initializer(
                    bert_config.initializer_range))

        end_classifier = tf.get_variable(
            "end_classifier",
            shape=[bert_config.hidden_size, bert_config.hidden_size],
            initializer=modeling.create_initializer(
                bert_config.initializer_range))

        input_mask = tf.expand_dims(input_mask, axis=1)  # [batch_size, 1, seq_length]
        adder = (1.0 - tf.cast(input_mask, tf.float32)) * -10000.0
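        # Adding -10000.0 to the logits at padded positions drives their
        # softmax probability to (effectively) zero, so padding tokens cannot
        # be selected as span starts or ends.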

        temp = tf.matmul(query_start_tensor, start_classifier)  # [batch_size * num_positions, width]
        temp = tf.reshape(temp, [batch_size, num_positions, width])  # [batch_size, num_positions, width]
        start_tensor = tf.transpose(start_tensor, perm=[0, 2, 1])  # [batch_size, width, seq_length]
        start_logits = tf.matmul(temp, start_tensor)  # [batch_size, num_positions, seq_length]
        start_logits += adder
        start_logits = tf.reshape(start_logits, [batch_size * num_positions, seq_length])

        temp = tf.matmul(query_end_tensor, end_classifier)  # [batch_size * num_positions, width]
        temp = tf.reshape(temp, [batch_size, num_positions, width])  # [batch_size, num_positions, width]
        end_tensor = tf.transpose(end_tensor, perm=[0, 2, 1])  # [batch_size, width, seq_length]
        end_logits = tf.matmul(temp, end_tensor)  # [batch_size, num_positions, seq_length]
        end_logits += adder
        end_logits = tf.reshape(end_logits, [batch_size * num_positions, seq_length])

        label_weights = tf.reshape(label_weights, [-1])  # [batch_size * num_positions]

        start_log_probs = tf.nn.log_softmax(start_logits, axis=-1) #  [batch_size * num_positions, seq_length]
        start_labels = tf.reshape(start_labels, [-1])  # [batch_size * num_positions]
        start_one_hot_labels = tf.one_hot(
            start_labels, depth=seq_length, dtype=tf.float32)  # [batch_size * num_positions, seq_length]

        start_per_example_loss = -tf.reduce_sum(start_log_probs * start_one_hot_labels, axis=[-1])

        end_log_probs = tf.nn.log_softmax(end_logits, axis=-1)  # [batch_size * num_positions, seq_length]
        end_labels = tf.reshape(end_labels, [-1])  # [batch_size * num_positions]
        end_one_hot_labels = tf.one_hot(
            end_labels, depth=seq_length, dtype=tf.float32)  # [batch_size * num_positions, seq_length]

        end_per_example_loss = -tf.reduce_sum(end_log_probs * end_one_hot_labels, axis=[-1])

        per_example_loss = (start_per_example_loss + end_per_example_loss) / 2
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return loss, per_example_loss
Example #20
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" %
                            (name, features[name].shape))

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        masked_lm_positions = features["masked_lm_positions"]
        masked_lm_ids = features["masked_lm_ids"]
        masked_lm_weights = features["masked_lm_weights"]
        next_sentence_labels = features["next_sentence_labels"]

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        (masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs,
         student_masked_lm_logits) = get_masked_lm_output(
             bert_config, model.get_sequence_output(),
             model.get_embedding_table(), masked_lm_positions, masked_lm_ids,
             masked_lm_weights)

        (next_sentence_loss, next_sentence_example_loss,
         next_sentence_log_probs,
         student_next_sentence_logits) = get_next_sentence_output(
             bert_config, model.get_pooled_output(), next_sentence_labels)

        if FLAGS.distill:
            teacher_config = modeling.BertConfig.from_json_file(
                FLAGS.teacher_config_file)
            with tf.variable_scope("teacher"):
                teacher = modeling.BertModel(
                    config=teacher_config,
                    is_training=False,
                    input_ids=input_ids,
                    input_mask=input_mask,
                    token_type_ids=segment_ids,
                    use_one_hot_embeddings=use_one_hot_embeddings)

            # map every g layers of the teacher model to the student model
            g = int(teacher_config.num_hidden_layers /
                    bert_config.num_hidden_layers)

            # project teacher hidden layers down to student hidden layers dims
            # with tf.variable_scope('loss'):
            teacher_hidden_layers = teacher.get_all_encoder_layers()

            hidden_loss = tf.add_n([
                tf.reduce_sum(
                    tf.squared_difference(
                        tf.layers.dense(
                            teacher_hidden_layers[i * g],
                            units=bert_config.hidden_size,
                            kernel_initializer=modeling.create_initializer(
                                bert_config.initializer_range)),
                        student_hidden)) for i, student_hidden in enumerate(
                            model.get_all_encoder_layers())
            ])

            hidden_loss_same_size = tf.add_n([
                tf.reduce_sum(
                    tf.squared_difference(teacher_hidden_layers[i * g],
                                          student_hidden)) for i,
                student_hidden in enumerate(model.get_all_encoder_layers())
            ])

            embedding_loss = tf.reduce_mean(
                tf.squared_difference(teacher.get_embedding_output(),
                                      model.get_embedding_output()))

            attention_loss = tf.add_n([
                tf.reduce_sum(
                    tf.squared_difference(teacher.attention_scores[i * g],
                                          student_scores))
                for i, student_scores in enumerate(model.attention_scores)
            ])

            if FLAGS.pred_distill:
                with tf.variable_scope('teacher'):
                    (teacher_masked_lm_loss, teacher_masked_lm_example_loss,
                     teacher_masked_lm_log_probs,
                     teacher_masked_lm_logits) = get_masked_lm_output(
                         teacher_config, teacher.get_sequence_output(),
                         teacher.get_embedding_table(), masked_lm_positions,
                         masked_lm_ids, masked_lm_weights)

                    (teacher_next_sentence_loss,
                     teacher_next_sentence_example_loss,
                     teacher_next_sentence_log_probs,
                     teacher_next_sentence_logits) = get_next_sentence_output(
                         teacher_config, teacher.get_pooled_output(),
                         next_sentence_labels)

                    masked_lm_distill_loss = tf.reduce_mean(
                        -tf.nn.softmax(teacher_masked_lm_logits) *
                        tf.nn.log_softmax(student_masked_lm_logits))

                    next_sentence_distill_loss = tf.reduce_mean(
                        tf.squared_difference(teacher_next_sentence_logits,
                                              student_next_sentence_logits))
                    total_loss = masked_lm_distill_loss
            else:
                total_loss = hidden_loss_same_size + embedding_loss + attention_loss + masked_lm_loss
        else:
            total_loss = masked_lm_loss + next_sentence_loss

        tvars = tf.trainable_variables()

        scaffold_fn = None
        checkpoints = []
        assignment_maps = []
        student_variable_names = {}
        teacher_variable_names = {}

        assert FLAGS.teacher_checkpoint or not FLAGS.distill

        if init_checkpoint:
            (assignment_map, student_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(
                 tvars, init_checkpoint)
            checkpoints.append(init_checkpoint)

            assignment_maps.append(assignment_map)
        if FLAGS.teacher_checkpoint:
            (assignment_map, teacher_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(
                 tvars, FLAGS.teacher_checkpoint, teacher=True)
            checkpoints.append(FLAGS.teacher_checkpoint)
            assignment_maps.append(assignment_map)

        if use_tpu:

            def tpu_scaffold():
                for c, a in zip(checkpoints, assignment_maps):
                    tf.logging.info("*** Loading vars from Checkpoint %s ***" %
                                    c)
                    tf.train.init_from_checkpoint(c, a)
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold
        else:
            for c, a in zip(checkpoints, assignment_maps):
                tf.logging.info("*** Loading vars from Checkpoint %s ***" % c)
                tf.train.init_from_checkpoint(c, a)

        output_spec = None
        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'bert/')
        var_list += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      'loss/')
        tf.logging.info("**** Trainable Variables ****")
        for var in var_list:
            tf.logging.info("name = %s, shape = %s", var.name, var.shape)

        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(total_loss, learning_rate,
                                                     num_train_steps,
                                                     num_warmup_steps, use_tpu,
                                                     var_list)

            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                scaffold_fn=scaffold_fn)

        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(masked_lm_example_loss, masked_lm_log_probs,
                          masked_lm_ids, masked_lm_weights,
                          next_sentence_example_loss, next_sentence_log_probs,
                          next_sentence_labels):
                """Computes the loss and accuracy of the model."""
                masked_lm_log_probs = tf.reshape(
                    masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]])
                masked_lm_predictions = tf.argmax(masked_lm_log_probs,
                                                  axis=-1,
                                                  output_type=tf.int32)
                masked_lm_example_loss = tf.reshape(masked_lm_example_loss,
                                                    [-1])
                masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
                masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
                masked_lm_accuracy = tf.metrics.accuracy(
                    labels=masked_lm_ids,
                    predictions=masked_lm_predictions,
                    weights=masked_lm_weights)
                masked_lm_mean_loss = tf.metrics.mean(
                    values=masked_lm_example_loss, weights=masked_lm_weights)

                next_sentence_log_probs = tf.reshape(
                    next_sentence_log_probs,
                    [-1, next_sentence_log_probs.shape[-1]])
                next_sentence_predictions = tf.argmax(next_sentence_log_probs,
                                                      axis=-1,
                                                      output_type=tf.int32)
                next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
                next_sentence_accuracy = tf.metrics.accuracy(
                    labels=next_sentence_labels,
                    predictions=next_sentence_predictions)
                next_sentence_mean_loss = tf.metrics.mean(
                    values=next_sentence_example_loss)

                return {
                    "masked_lm_accuracy": masked_lm_accuracy,
                    "masked_lm_loss": masked_lm_mean_loss,
                    "next_sentence_accuracy": next_sentence_accuracy,
                    "next_sentence_loss": next_sentence_mean_loss,
                }

            if FLAGS.eval_teacher:
                eval_metrics = (metric_fn, [
                    teacher_masked_lm_example_loss,
                    teacher_masked_lm_log_probs, masked_lm_ids,
                    masked_lm_weights, teacher_next_sentence_example_loss,
                    teacher_next_sentence_log_probs, next_sentence_labels
                ])
            else:
                eval_metrics = (metric_fn, [
                    masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                    masked_lm_weights, next_sentence_example_loss,
                    next_sentence_log_probs, next_sentence_labels
                ])
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=eval_metrics,
                scaffold_fn=scaffold_fn)
        else:
            raise ValueError("Only TRAIN and EVAL modes are supported: %s" %
                             (mode))

        return output_spec
def create_model(bert_config, is_training, input_ids1, input_mask1,
                 segment_ids1, input_ids2, input_mask2, segment_ids2, labels,
                 num_labels, use_one_hot_embeddings):
    """Creates a classification model."""
    model1 = modeling.BertModel(config=bert_config,
                                is_training=is_training,
                                input_ids=input_ids1,
                                input_mask=input_mask1,
                                token_type_ids=segment_ids1,
                                use_one_hot_embeddings=use_one_hot_embeddings)
    sequence_output1 = model1.get_sequence_output()

    model2 = modeling.BertModel(config=bert_config,
                                is_training=is_training,
                                input_ids=input_ids2,
                                input_mask=input_mask2,
                                token_type_ids=segment_ids2,
                                use_one_hot_embeddings=use_one_hot_embeddings)
    sequence_output2 = model2.get_sequence_output()

    print("sequence_output1:{}".format(sequence_output1.shape))
    print("sequence_output2:{}".format(sequence_output2.shape))

    with tf.variable_scope('ESIM'):
        # Compute the similarity between every token of a_bar and b_bar.
        input_mask1 = tf.cast(input_mask1, tf.float32)
        input_mask2 = tf.cast(input_mask2, tf.float32)
        with tf.variable_scope('local_inference'):
            # attention_weight: [batch_size, seq_length1, seq_length2]
            attention_weight = tf.matmul(
                sequence_output1, tf.transpose(sequence_output2, [0, 2, 1]))

            # attention_weight_2: [batch_size, seq_length1, seq_length2]
            attention_weight_2 = tf.exp(
                attention_weight -
                tf.reduce_max(attention_weight, axis=2, keepdims=True))
            attention_weight_2 = attention_weight_2 * tf.expand_dims(
                tf.cast(input_mask2, tf.float32), 1)
            # alpha: [batch_size,  seq_length1, seq_length2]
            alpha = attention_weight_2 / (
                tf.reduce_sum(attention_weight_2, -1, keepdims=True) + 1e-8)
            # sequence_output1_dual: [batch_size, seq_length1, hidden_size]
            sequence_output1_dual = tf.reduce_sum(
                tf.expand_dims(sequence_output2, 1) *
                tf.expand_dims(alpha, -1), 2)
            print("sequence_output1_dual:{}".format(
                sequence_output1_dual.shape))

            sequence_output1_match = tf.concat([
                sequence_output1, sequence_output1_dual, sequence_output1 *
                sequence_output1_dual, sequence_output1 - sequence_output1_dual
            ], 2)
            print("sequence_output1_match:{}".format(
                sequence_output1_match.shape))

            # attention_weight_1: [batch_size, seq_length, seq_length]
            attention_weight_1 = attention_weight - tf.reduce_max(
                attention_weight, axis=1, keepdims=True)
            attention_weight_1 = tf.exp(
                tf.transpose(attention_weight_1, [0, 2, 1]))
            attention_weight_1 = attention_weight_1 * tf.expand_dims(
                tf.cast(input_mask1, tf.float32), 1)

            # beta: [batch_size, seq_length, seq_length]
            beta = attention_weight_1 / (
                tf.reduce_sum(attention_weight_1, -1, keepdims=True) + 1e-8)
            # sequence_output2_dual: [batch_size, seq_length, hidden_size]
            sequence_output2_dual = tf.reduce_sum(
                tf.expand_dims(sequence_output1, 1) * tf.expand_dims(beta, -1),
                2)
            print("sequence_output2_dual:{}".format(
                sequence_output2_dual.shape))

            sequence_output2_match = tf.concat([
                sequence_output2, sequence_output2_dual, sequence_output2 *
                sequence_output2_dual, sequence_output2 - sequence_output2_dual
            ], 2)
            print("sequence_output2_match:{}".format(
                sequence_output2_match.shape))

        # high dimension to low dimension
        with tf.variable_scope("projection", reuse=tf.AUTO_REUSE):
            output_layer1 = tf.layers.dense(
                sequence_output1_match,
                bert_config.hidden_size,
                name='dense',
                activation=tf.nn.tanh,
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))

            output_layer1 = modeling.layer_norm(output_layer1,
                                                name="layer_norm")
            print("output_layer1:{}".format(output_layer1.shape))

            output_layer2 = tf.layers.dense(
                sequence_output2_match,
                bert_config.hidden_size,
                name='dense',
                reuse=True,
                activation=tf.nn.tanh,
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))

            output_layer2 = modeling.layer_norm(output_layer2,
                                                name="layer_norm")
            print("output_layer2:{}".format(output_layer2.shape))

        if is_training:
            output_layer1 = tf.nn.dropout(output_layer1, keep_prob=0.9)
            output_layer2 = tf.nn.dropout(output_layer2, keep_prob=0.9)

        with tf.variable_scope("composition", reuse=tf.AUTO_REUSE):
            # output_layer1 = tf.reduce_sum(output_layer1 * tf.expand_dims(tf.cast(input_mask1, tf.float32), -1),
            #                               1) / tf.expand_dims(tf.reduce_sum(tf.cast(input_mask1, tf.float32), 1), 1)
            logit_x1_sum = tf.reduce_sum(output_layer1 * tf.expand_dims(input_mask1, -1), 1) / \
                           tf.expand_dims(tf.reduce_sum(input_mask1, 1), 1)
            logit_x1_max = tf.reduce_max(
                output_layer1 * tf.expand_dims(input_mask1, -1), 1)
            logit_x2_sum = tf.reduce_sum(output_layer2 * tf.expand_dims(input_mask2, -1), 1) / \
                           tf.expand_dims(tf.reduce_sum(input_mask2, 1), 1)
            logit_x2_max = tf.reduce_max(
                output_layer2 * tf.expand_dims(input_mask2, -1), 1)

            logit = tf.concat(
                [logit_x1_sum, logit_x1_max, logit_x2_sum, logit_x2_max], 1)
            print("logit:{}".format(logit.shape))
        """
        一下 接双输出,相互影响
        """
        # with tf.variable_scope("output1"):
        #     output_layer1 = tf.reduce_sum(output_layer1 * tf.expand_dims(tf.cast(input_mask1, tf.float32), -1),
        #                                   1) / tf.expand_dims(tf.reduce_sum(tf.cast(input_mask1, tf.float32), 1), 1)
        #
        #     output_weights1 = tf.get_variable(
        #         "finetune_weights", [bert_config.hidden_size, num_labels],
        #         initializer=tf.truncated_normal_initializer(stddev=0.02))
        #
        #     output_bias1 = tf.get_variable(
        #         "finetune_bias", [num_labels], initializer=tf.zeros_initializer())
        #
        #     logits1 = tf.matmul(output_layer1, output_weights1)
        #     logits1 = tf.nn.bias_add(logits1, output_bias1)
        #     probabilities1 = tf.nn.sigmoid(logits1)
        #
        # with tf.variable_scope("output2"):
        #     output_layer2 = tf.reduce_sum(output_layer2 * tf.expand_dims(tf.cast(input_mask2, tf.float32), -1),
        #                                   1) / tf.expand_dims(tf.reduce_sum(tf.cast(input_mask2, tf.float32), 1), 1)
        #
        #     output_weights2 = tf.get_variable(
        #         "finetune_weights", [bert_config.hidden_size, num_labels],
        #         initializer=tf.truncated_normal_initializer(stddev=0.02))
        #
        #     output_bias2 = tf.get_variable(
        #         "finetune_bias", [num_labels], initializer=tf.zeros_initializer())
        #
        #     logits2 = tf.matmul(output_layer2, output_weights2)
        #     logits2 = tf.nn.bias_add(logits2, output_bias2)
        #     probabilities2 = tf.nn.sigmoid(logits2)
    logit = tf.layers.dense(logit,
                            bert_config.hidden_size,
                            name='dense',
                            activation=tf.nn.tanh,
                            kernel_initializer=modeling.create_initializer(
                                bert_config.initializer_range))
    logit = modeling.layer_norm(logit, name="layer_norm")
    print("logit:{}".format(logit.shape))
    if is_training:
        logit = tf.nn.dropout(logit, keep_prob=0.9)

    hidden_size = logit.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable("output_bias", [num_labels],
                                  initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(logit, keep_prob=0.9)

        logits = tf.matmul(logit, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        probabilities = tf.nn.softmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities)
Exemplo n.º 22
0
def create_model(
    config,
    is_training,
    input_ids,
    input_mask,
    segment_ids,
    labels,
    num_labels,
    use_one_hot_embeddings,
    task_name,
):
    """Creates a classification model from_scratch."""
    _true_length = tf.cast(tf.reduce_sum(input_mask, axis=-1), dtype=tf.int32)

    with tf.variable_scope("baseline"):
        with tf.variable_scope("embeddings"):
            # Perform embedding lookup on the word ids.
            (word_embedding_output,
             output_embedding_table) = modeling.embedding_lookup(
                 input_ids=input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.embedding_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            embedding_output = modeling.embedding_postprocessor(
                input_tensor=word_embedding_output,
                use_token_type=True,
                token_type_ids=segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)
    with tf.variable_scope("bilstm"):
        sequence_output = modeling.bilstm_fused(
            inputs=embedding_output,
            sequence_lengths=_true_length,
            lstm_size=config.lstm_size,
            bilstm_dropout_rate=config.bilstm_dropout_rate,
            is_training=is_training,
            num_layers=config.num_bilstm)

    # first_token_tensor = tf.squeeze(sequence_output[:, -1:, :], axis=1)
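    # Pool the BiLSTM output by taking its final time step, then project it
    # below through a tanh dense layer (a BERT-pooler-style summary vector).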
    last_token_tensor = tf.squeeze(sequence_output[:, -1:, :], axis=1)
    output_layer = tf.layers.dense(
        last_token_tensor,
        config.hidden_size,
        activation=tf.tanh,
        kernel_initializer=modeling.create_initializer(
            config.initializer_range))

    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable("output_bias", [num_labels],
                                  initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        if task_name != "sts-b":
            probabilities = tf.nn.softmax(logits, axis=-1)
            predictions = tf.argmax(probabilities,
                                    axis=-1,
                                    output_type=tf.int32)
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(labels,
                                        depth=num_labels,
                                        dtype=tf.float32)

            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                              axis=-1)
        else:
            probabilities = logits
            logits = tf.squeeze(logits, [-1])
            predictions = logits
            per_example_loss = tf.square(logits - labels)
        loss = tf.reduce_mean(per_example_loss)

        return (loss, per_example_loss, probabilities, logits, predictions)
    def add_embeddings(self):
        with tf.name_scope("embedding"):
            if self.is_Embedding_Needed:
                W = tf.Variable(np.array(self.embeddings),
                                name="word_embed",
                                dtype="float32",
                                trainable=self.trainable)
            else:
                W = tf.get_variable(
                    name='word_embed',
                    shape=[self.vocab_size, self.embedding_size],
                    initializer=modeling.create_initializer(0.02),
                    trainable=True)

            if 'adding_problem' not in self.dataset:
                self.embedding_W = W
                self.embedded_chars_q = tf.nn.embedding_lookup(
                    self.embedding_W, self.question)
            else:
                # map the 2-dim adding-problem input into a higher-dimensional space
                if self.embedding_size == 2:
                    self.embedded_chars_q = self.question
                else:
                    self.embedded_chars_q = tf.layers.dense(
                        self.question, self.embedding_size)
            print('embedded_chars_q:', self.embedded_chars_q)

            if 'adding_problem' not in self.dataset:
                self.embedded_chars_q = modeling.layer_norm(
                    tf.nn.dropout(self.embedded_chars_q,
                                  keep_prob=1.0 - self.input_dropout_prob))

        context_position = tf.range(self.max_input_left, dtype=tf.int32)[:, None]
        memory_position = tf.range(self.max_input_left, dtype=tf.int32)[None, :]
        relative_position = memory_position - context_position
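        # Worked example (illustrative): for max_input_left = 3,
        # relative_position[i][j] = j - i, i.e.
        #   [[ 0,  1,  2],
        #    [-1,  0,  1],
        #    [-2, -1,  0]]
        # relative_position_bucket below then maps each signed offset to one of
        # t5_bucket bucket ids.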

        rp_bucket = relative_position_bucket(relative_position,
                                             num_buckets=self.t5_bucket,
                                             max_distance=self.t5_max_distance)

        #why this embedding is very sensitive...
        self.t5_pos_embedding = tf.get_variable(
            't5_pos_mat', [self.t5_bucket, self.config.num_attention_heads],
            initializer=modeling.create_initializer(0.02),
            trainable=True)

        self.single_t5_att_bias = compute_bias(rp_bucket,
                                               self.t5_pos_embedding)
        ## [batch, num_heads, query_length, memory_length]
        self.t5_att_bias = tf.tile(self.single_t5_att_bias,
                                   [tf.shape(self.question)[0], 1, 1, 1])
        print('t5_bias:', self.t5_att_bias)
        # @2020/9/7: we can directly apply the head mask during inference.

        head_mask = np.zeros((self.config.num_attention_heads,
                              self.max_input_left, self.max_input_left))
        #high2low=[3,1,4,0,5,2]
        low2high = [2, 5, 0, 4, 1, 3]
        for tt in range(6):
            print('tt:', tt)
            head_mask[low2high[tt], :, :] = np.ones(
                (self.max_input_left, self.max_input_left))
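        # As written, the loop covers all six entries of low2high, so every head
        # ends up unmasked; restricting it (e.g. range(k) with k < 6) would keep
        # only the k heads ranked most important under this ordering.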

        self.t5_att_bias = self.t5_att_bias * tf.constant(
            head_mask, tf.float32)
Exemplo n.º 24
0
    def body(self, features):
        hparams = self.hparams
        if not self.is_training:
            hparams.dropout_prob = 0.0

        with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
            # attention_weights: [batch, n_head, from_len, to_len]
            sequence_output, cls_vector, attention_weights = self.build_encoder(
                features)

        if 'targets' not in features:
            assert self.hparams.dropout_prob == 0.0
            logits, losses = self.greedy_decode_8steps(cls_vector,
                                                       sequence_output)
            logits.update(attention_weights=attention_weights[:, :, 0, :])
            return logits, losses

        with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
            with tf.variable_scope('embeddings', reuse=tf.AUTO_REUSE):
                premise = features[
                    'targets']  # [batch, premise_len=8]; note: 'targets' is a misleading name
                # [batch, premise_len, hid_size]
                premise_vecs = premise_gather_nd(sequence_output, premise)

                batch_size = tf.shape(premise)[0]
                premise_len = premise.shape.as_list()[-1]
                theorem = features['theorem']  # batch, 1

                # [batch, 1, hid_size] and [num_theorems, hid_size]
                theorem_vec, theorem_emb_table = modeling.embedding_lookup(
                    input_ids=theorem,  # [batch, 1]
                    vocab_size=hparams.num_theorems,
                    embedding_size=hparams.hidden_size,
                    initializer_range=hparams.initializer_range,
                    word_embedding_name='theorem_embedding',
                )
                depth = features['depth']  # batch, 1

                decoder_input = tf.concat(
                    [
                        cls_vector,  # [batch, 1, hid_size]
                        theorem_vec,  # [batch, 1, hid_size]
                        premise_vecs[:, :
                                     -1, :]  # [batch, premise_len-1, hid_size]
                    ],
                    axis=1)  # [batch, premise_len + 1, hid_size]
                decode_length = decoder_input.shape.as_list()[1]
                assert decode_length == premise_len + 1

                # [decode_length, hid_size]
                pos_embedding, _ = modeling.embedding_lookup(
                    input_ids=tf.range(decode_length),  # [decode_length]
                    vocab_size=hparams.max_premise,  # >= premise_len
                    embedding_size=hparams.hidden_size,
                    initializer_range=hparams.initializer_range,
                    word_embedding_name='positional_embedding',
                )
                pos_embedding = tf.reshape(
                    pos_embedding, [1, decode_length, hparams.hidden_size])

                decoder_input = modeling.layer_norm_and_dropout(
                    decoder_input +  # [batch, decode_length, hid_size]
                    pos_embedding,  # [1,     decode_length, hid_size]
                    hparams.dropout_prob)  # [batch, decode_length, hid_size]

            with tf.variable_scope('transformer', reuse=tf.AUTO_REUSE):
                causal_attention_mask = t2t_model.common_layers.ones_matrix_band_part(
                    rows=decode_length,
                    cols=decode_length,
                    num_lower=-1,  # attend to everything before
                    num_upper=0,  # attend to nothing after
                    out_shape=[1, decode_length, decode_length
                               ])  # 1, decode_length, decode_length
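                # Illustrative: for decode_length = 3 the band-part mask is
                #   [[1, 0, 0],
                #    [1, 1, 0],
                #    [1, 1, 1]]
                # i.e. each position attends to itself and everything before it.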

                # [batch, decode_length, decode_length]
                causal_attention_mask = tf.tile(causal_attention_mask,
                                                [batch_size, 1, 1])

                all_decoder_layers = modeling.transformer_model(
                    input_tensor=decoder_input,
                    attention_mask=causal_attention_mask,
                    hidden_size=hparams.hidden_size,
                    num_hidden_layers=hparams.num_decode_layers,
                    num_attention_heads=hparams.num_attention_heads,
                    intermediate_size=hparams.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        hparams.hidden_act),
                    hidden_dropout_prob=hparams.dropout_prob,
                    attention_probs_dropout_prob=hparams.dropout_prob,
                    initializer_range=hparams.initializer_range,
                    do_return_all_layers=True,
                    attention_top_k=hparams.attention_top_k)

                decoder_output, _ = all_decoder_layers[
                    -1]  # [batch, dec_len, hid_size]
                theorem_feature = decoder_output[:, 0, :]  # [batch, hid_size]
                premise_feature = decoder_output[:,
                                                 1:, :]  # [batch, tar_len, hid_size]

        with tf.variable_scope('prediction', reuse=tf.AUTO_REUSE):
            theorem_logits = tf.keras.layers.Dense(  # [batch, num_theorems]
                name='theorem',
                units=hparams.num_theorems,
                use_bias=True,
                kernel_initializer=modeling.create_initializer(
                    hparams.initializer_range))(theorem_feature)

            premise_logits = tf.matmul(
                a=premise_feature,  # [batch, premise_len, hid_size]
                b=sequence_output,  # [batch, sequence_len, hid_size]
                transpose_b=True,
            )  # [batch, premise_len, sequence_len]

            # [batch * premise_len, sequence_len]
            seq_len = premise_logits.shape.as_list()[-1]
            premise_logits = tf.reshape(premise_logits, [-1, seq_len])

            premise_weights = tf.cast(premise > 0,
                                      tf.float32)  # [batch, prem_len]
            premise_weights = tf.reshape(premise_weights,
                                         [-1])  # [batch * prem_len]
            premise = tf.reshape(premise, [-1, 1])  # [batch * prem_len, 1]
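            # premise id 0 is treated as padding here: cast(premise > 0) zeroes
            # out those positions in the premise loss below.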

            theorem_loss = tf.losses.sparse_softmax_cross_entropy(
                labels=theorem,  # [batch, 1]
                logits=theorem_logits  # [batch, num_theorems]
            )
            premise_loss = tf.losses.sparse_softmax_cross_entropy(
                labels=premise,  # [batch * premise_len, 1]
                logits=premise_logits,  # [batch * premise_len, sequence_len]
                weights=premise_weights  # [batch * premise_len]
            )

            logits = dict(theorem_logits=theorem_logits,
                          theorem_labels=theorem,
                          premise_logits=premise_logits,
                          premise_labels=premise)

            losses = dict(training=theorem_loss + premise_loss,
                          theorem_loss=theorem_loss,
                          premise_loss=premise_loss)

        return logits, losses
Exemplo n.º 25
0
    tensor_name=None,
    all_tensors=False)

chkp.print_tensors_in_checkpoint_file(
    "output_dir_dupe5_s4_3class/model_795_seq_direction.ckpt",
    tensor_name='cls/seq_direction/output_weights',
    all_tensors=False)
chkp.print_tensors_in_checkpoint_file(
    "output_dir_dupe5_s4/model.ckpt-795000",
    tensor_name='cls/seq_direction/output_bias2',
    all_tensors=False)
with tf.variable_scope("cls/seq_direction"):
    output_weights2 = tf.get_variable(
        "output_weights2",
        shape=[1, 768],
        initializer=modeling.create_initializer(.02))
    output_bias2 = tf.get_variable("output_bias2",
                                   shape=[1],
                                   initializer=tf.zeros_initializer())

# output_weights = chkp.print_tensors_in_checkpoint_file("uncased_L-12_H-768_A-12/bert_model.ckpt",
#                                                        tensor_name='cls/seq_relationship/output_weights',
#                                                        all_tensors=False)

# output_dir_position_pretrain_tf/model.ckpt-34000 bert/embeddings/dependency_embedding
# output_bias = chkp.print_tensors_in_checkpoint_file("uncased_L-12_H-768_A-12/bert_model.ckpt",
#                                                     tensor_name='cls/seq_relationship/output_bias', all_tensors=False)
#reader = pywrap_tensorflow.NewCheckpointReader("uncased_L-12_H-768_A-12/bert_model.ckpt")
#add new embedding
# reader.get_tensor('bert/embeddings/token_type_embeddings')
##add task 3
Exemplo n.º 26
0

def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])
Exemplo n.º 27
0
def gec_create_model(bert_config, is_training, input_sequence, 
  input_mask, segment_ids, edit_sequence, 
  use_one_hot_embeddings, mode, 
  copy_weight, 
  use_bert_more, 
  insert_ids,
  multitoken_insert_ids,
  subtract_replaced_from_replacement):
  """Creates a classification model."""
  # insert_ids: word ids of unigram inserts (list)
  # multitoken_insert_ids: word_ids of bigram inserts (list of tuples of length 2)
  # Defining the space of all possible edits: 
  # unk, sos and eos are dummy edits mapped to 0, 1 and 2 respectively
  # copy is mapped to 3
  # del is mapped to 4
  num_appends = len(insert_ids) + len(multitoken_insert_ids)
  num_replaces = num_appends # appends and replacements come from the same set (inserts and multitoken_inserts)
  append_begin = 5 # First append edit (mapped to 5)
  append_end = append_begin + num_appends - 1 #Last append edit
  rep_begin = append_end + 1 # First replace edit
  rep_end = rep_begin + num_replaces - 1 #Last replace edit  
  num_suffix_transforms = 58 #num of transformation edits
  num_labels = 5 + num_appends + num_replaces + num_suffix_transforms # total number of edits
  print("************ num of labels : {} ***************".format(num_labels))

  config = bert_config
  input_sequence_shape = modeling.get_shape_list(input_sequence,2)
  batch_size = input_sequence_shape[0]
  seq_len = input_sequence_shape[1]

  if not use_bert_more:  #default use of bert (without logit factorisation)
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_sequence,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    output_layer = model.get_sequence_output()
  else:                 # LOGIT FACTORISATION is On!
    model = modified_modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_sequence,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    output_layer = model.get_sequence_output()
    replace_layer = output_layer[:,seq_len:2*seq_len,:]  #representation of replacement slots as described in paper
    append_layer = output_layer[:,2*seq_len:3*seq_len,:] #representation of append slots as described in paper
    output_layer = output_layer[:,0:seq_len,:]

  output_layer_shape = modeling.get_shape_list(output_layer,3)
  hidden_size = output_layer_shape[-1]

  flattened_output_layer = tf.reshape(output_layer,[-1, hidden_size])

  h_edit = flattened_output_layer

  if use_bert_more:
    h_word = flattened_output_layer
    flattened_replace_layer = tf.reshape(replace_layer,[-1, hidden_size])
    flattened_append_layer = tf.reshape(append_layer, [-1, hidden_size])

    m_replace = flattened_replace_layer    
    m_append = flattened_append_layer

    
    with tf.variable_scope("cls/predictions"):
      with tf.variable_scope("transform"):
        h_word = tf.layers.dense(
            h_word,
            units=bert_config.hidden_size,
            activation=modeling.get_activation(bert_config.hidden_act),
            kernel_initializer=modeling.create_initializer(
                bert_config.initializer_range))
        h_word = modeling.layer_norm(h_word)

    with tf.variable_scope("cls/predictions",reuse=True):
      with tf.variable_scope("transform",reuse=True):
        m_replace = tf.layers.dense(
            m_replace,
            units=bert_config.hidden_size,
            activation=modeling.get_activation(bert_config.hidden_act),
            kernel_initializer=modeling.create_initializer(
                bert_config.initializer_range))
        m_replace = modeling.layer_norm(m_replace)

    with tf.variable_scope("cls/predictions",reuse=True):
      with tf.variable_scope("transform",reuse=True):
        m_append = tf.layers.dense(
            m_append,
            units=bert_config.hidden_size,
            activation=modeling.get_activation(bert_config.hidden_act),
            kernel_initializer=modeling.create_initializer(
                bert_config.initializer_range))
        m_append = modeling.layer_norm(m_append)
    
    word_embedded_input = model.word_embedded_input
    flattened_word_embedded_input = tf.reshape(word_embedded_input, [-1, hidden_size])    

  labels = edit_sequence
  
  edit_weights = tf.get_variable(
      "edit_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  if is_training:
    h_edit = tf.nn.dropout(h_edit, keep_prob=0.9) 

  if use_bert_more:
      # append/replace weight vector for a given append or replace operation
      # correspond to word embedding for its token argument
      # for multitoken append/replace (e.g. has been)
      # weight vector is sum of word embeddings of token arguments

      append_weights = edit_word_embedding_lookup(model.embedding_table, insert_ids,
       use_one_hot_embeddings, config.vocab_size, config.hidden_size)
      replace_weights = append_weights #tokens in replace and append vocab are same 
                                       #(i.e. inserts and multitoken_inserts)

      multitoken_append_weights = wem_utils.edit_embedding_loopkup(model.embedding_table, multitoken_insert_ids,
                        use_one_hot_embeddings, config.vocab_size, config.hidden_size)
      multitoken_replace_weights = multitoken_append_weights #tokens in replace and append vocab are same 
                                                             #(i.e. inserts and multitoken_inserts)

      append_weights = tf.concat([append_weights, multitoken_append_weights],0)
      replace_weights = tf.concat([replace_weights, multitoken_replace_weights],0)

  with tf.variable_scope("loss"):
    edit_logits = tf.matmul(h_edit, edit_weights, transpose_b=True) #first term in eq3 in paper
    logits = edit_logits
    if use_bert_more:

      #=============== inplace_word_logits==============# #2nd term in eq3 in paper
      inplace_logit = tf.reduce_sum(h_word * flattened_word_embedded_input, axis=1, keepdims=True) #copy
      #inplace_logit = tf.reduce_sum(m_replace * flattened_word_embedded_input, axis=1, keepdims=True) #copy
      inplace_logit_appends = tf.tile(inplace_logit,[1,num_appends])
      inplace_logit_transforms = tf.tile(inplace_logit,[1,num_suffix_transforms])
      zero_3_logits = tf.zeros([batch_size*seq_len,3]) #unk sos eos 
      zero_1_logits = tf.zeros([batch_size*seq_len,1]) # del
      zero_replace_logits = tf.zeros([batch_size*seq_len,num_replaces])

      concat_list = [zero_3_logits, inplace_logit, zero_1_logits]\
                  + [inplace_logit_appends]\
                  + [zero_replace_logits]\
                  + [inplace_logit_transforms]

      inplace_word_logits = tf.concat(concat_list,1)
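      # Sanity note: the concatenated width is 3 + 1 + 1 + num_appends +
      # num_replaces + num_suffix_transforms = num_labels, matching edit_logits.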

      #======additional (insert,replace) logits ====# #3rd term in eqn3 in paper
      zero_5_logits = tf.zeros([batch_size*seq_len,5])      
      append_logits = tf.matmul(m_append, append_weights, transpose_b=True)

      if subtract_replaced_from_replacement:
        replace_logits = replacement_minus_replaced_logits(m_replace, 
          flattened_word_embedded_input, replace_weights)
      else:
        replace_logits = tf.matmul(m_replace, replace_weights, transpose_b=True)
      
      suffix_logits  = tf.zeros([batch_size*seq_len,num_suffix_transforms])
      
      concat_list = [zero_5_logits, append_logits, replace_logits, suffix_logits]
      additional_logits = tf.concat(concat_list,1)
      #====================================================#

      logits = edit_logits + inplace_word_logits + additional_logits
      logits_bias = tf.get_variable("output_bias", shape=[num_labels], initializer=tf.zeros_initializer())
      logits += logits_bias
    
    logits = tf.reshape(logits, [output_layer_shape[0], output_layer_shape[1], num_labels])
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    probs = tf.nn.softmax(logits,axis=-1)
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
    per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    per_token_loss = per_token_loss * tf.to_float(input_mask)
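    # Label 3 is the copy edit; copy_weight rescales its contribution to the
    # loss while every other edit keeps weight 1.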
    mask = copy_weight*tf.to_float(tf.equal(labels,3)) +  tf.to_float(tf.not_equal(labels,3))
    masked_per_token_loss = per_token_loss * mask
    per_example_loss = tf.reduce_sum(masked_per_token_loss, axis=-1)
    loss = tf.reduce_mean(per_example_loss)            

    return (loss, per_example_loss, logits, probs)
Exemplo n.º 28
0
    def build_model(self):
        with tf.variable_scope("inferring_module"):
            rdim = 768
            update_num = self.update_num
            batch_size = tf.shape(self.sent1)[0]
            dim = self.sent1.get_shape().as_list()[-1]

            gru_layer = BiGRU(num_layers=1,
                              num_units=rdim,
                              batch_size=batch_size,
                              input_size=dim,
                              keep_prob=0.9,
                              is_train=self.is_training,
                              activation=tf.nn.tanh)
            sent1_len = tf.cast(tf.reduce_sum(self.sent1_mask, axis=1),
                                tf.int32)
            sent2_len = tf.cast(tf.reduce_sum(self.sent2_mask, axis=1),
                                tf.int32)
            self.sent1 = gru_layer(self.sent1, sent1_len)
            self.sent2 = gru_layer(self.sent2, sent2_len)

            sr_cell = GRUCell(num_units=2 * rdim, activation=tf.nn.relu)

            r_cell = sr_cell

            tri_cell = DoubleJointCell(num_units=2 * rdim,
                                       r_cell=r_cell,
                                       sent1=self.sent1,
                                       sent2=self.sent2,
                                       dim=2 * dim,
                                       update_num=update_num,
                                       use_bias=False,
                                       activation=tf.tanh,
                                       dropout_rate=self.dropout_rate,
                                       sent1_mask=self.sent1_mask,
                                       sent2_mask=self.sent2_mask,
                                       initializer=None,
                                       dtype=tf.float32)

            fake_input = tf.tile(tf.expand_dims(self.mark0, axis=1),
                                 [1, update_num, 1])
            self.init_state = tri_cell.zero_state(batch_size=batch_size,
                                                  dtype=tf.float32)

            self.double_output, last_state = dynamic_rnn(
                cell=tri_cell,
                inputs=fake_input,
                initial_state=self.init_state)
            refer_output = tf.reduce_mean(self.double_output,
                                          axis=1)  # (B, dim)
        # temp = tf.concat([refer_output, self.mark0], axis=1)
        #
        # temp = dropout(temp, self.dropout_rate)
        refer_output = tf.layers.dense(
            refer_output,
            768,
            activation=tf.nn.tanh,
            kernel_initializer=create_initializer(0.02))

        # return refer_output * (1 - gate) + gate * self.mark0
        return refer_output + self.mark0
Exemplo n.º 29
0
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    #bert_config = bert_config, input_tensor = model.get_sequence_output(), output_weights = model.get_embedding_table(), positions = masked_lm_positions, label_ids = masked_lm_ids, label_weights = masked_lm_weights
    # positions: see masked_lm_positions in create_pretraining_data.py
    # label_ids: see masked_lm_labels in create_pretraining_data.py
    import ipdb
    ipdb.set_trace()
    # When computing the MLM loss, we first obtain vectors for the whole sentence, then select the vectors at the 15% masked positions and compute the loss only on those. This is why 10% of the selected tokens are left unchanged: otherwise the correct token would never appear at a masked position, since the other 85% of tokens only provide context and never enter the loss.
    # Replacing another 10% with random tokens presumably improves the encoder's error-correction ability, because even normal sentences can contain wrong words.
    # Masking the remaining 80% with [MASK] mainly trains comprehension: the model must infer the current token from its context.
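    # Rough sketch of the 80/10/10 rule described above (as in
    # create_pretraining_data.py; simplified, not the exact code):
    #   if rng.random() < 0.8:
    #       masked_token = "[MASK]"            # 80%: replace with [MASK]
    #   elif rng.random() < 0.5:
    #       masked_token = original_token      # 10%: keep the original token
    #   else:
    #       masked_token = random_vocab_token  # 10%: replace with a random token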
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        #input_tensor.shape=(160,768), output_weights.shape=(21128(vocab_size),768)
        logits = tf.matmul(input_tensor, output_weights,
                           transpose_b=True)  #logits.shape=(160,21128)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        #label_ids.shape = (8,20)
        label_ids = tf.reshape(label_ids, [-1])
        #label_ids.shape = (160,)
        #label_weights.shape = (8,20)
        label_weights = tf.reshape(label_weights,
                                   [-1])  #label_weights are the mask weights;
        #in this program they are all 1
        #label_weights.shape=(160,)

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)
        #one_hot_labels.shape=(160,21128): 160 tokens in total, each represented
        #as a vocab_size one-hot vector, in preparation for the loss below.

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
Exemplo n.º 30
0
def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str, config: dict):
    """
    :param model:BertModel Pytorch model instance to be converted
    :param ckpt_dir: Tensorflow model directory
    :param model_name: model name
    :return:
    Currently supported HF models:
        Y BertModel
        N BertForMaskedLM
        N BertForPreTraining
        N BertForMultipleChoice
        N BertForNextSentencePrediction
        N BertForSequenceClassification
        N BertForQuestionAnswering
    """

    tensors_to_transpose = (
        "dense.weight",
        "attention.self.query",
        "attention.self.key",
        "attention.self.value"
    )

    var_map = (
        ('layer.', 'layer_'),
        ('word_embeddings.weight', 'word_embeddings'),
        ('position_embeddings.weight', 'position_embeddings'),
        ('token_type_embeddings.weight', 'token_type_embeddings'),
        ('.', '/'),
        ('LayerNorm/weight', 'LayerNorm/gamma'),
        ('LayerNorm/bias', 'LayerNorm/beta'),
        ('weight', 'kernel')
    )

    if not os.path.isdir(ckpt_dir):
        os.makedirs(ckpt_dir)

    state_dict = model.state_dict()

    def to_tf_var_name(name: str):
        for patt, repl in iter(var_map):
            name = name.replace(patt, repl)
        return 'bert/{}'.format(name)
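    # Illustrative mapping (a representative BertModel parameter name):
    #   'encoder.layer.0.attention.self.query.weight'
    #   -> 'bert/encoder/layer_0/attention/self/query/kernel'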

    def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session):
        tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
        tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer())
        session.run(tf.variables_initializer([tf_var]))
        session.run(tf_var)
        return tf_var

    tf.reset_default_graph()
    with tf.Session() as session:
        for var_name in state_dict:
            tf_name = to_tf_var_name(var_name)
            torch_tensor = state_dict[var_name].numpy()
            if 'token_type_embeddings' in tf_name:
                torch_tensor = np.tile(torch_tensor, [config['type_vocab_size'], 1])
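            # If the target TF config uses a larger vocabulary than the PyTorch
            # checkpoint, pad the word-embedding table with freshly initialized rows.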
            if 'word_embeddings' in tf_name:
                add_emb_shape = config['vocab_size'] - torch_tensor.shape[0]
                embedding_table = tf.get_variable(name='additional_emb',
                                                  shape=[add_emb_shape, torch_tensor.shape[1]],
                                                  initializer=create_initializer(config['initializer_range']))
                embedding_table.initializer.run()
                additional_emb = embedding_table.eval()
                torch_tensor = np.concatenate([torch_tensor, additional_emb], axis=0)

            if any([x in var_name for x in tensors_to_transpose]):
                torch_tensor = torch_tensor.T
            tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session)
            tf.keras.backend.set_value(tf_var, torch_tensor)
            tf_weight = session.run(tf_var)
            print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor)))

        saver = tf.train.Saver(tf.trainable_variables())
        saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt"))
Exemplo n.º 31
0
    def build_model(self):
        from layers.ParallelInfo import TextCNN, RNNExtract, InteractionExtract, SingleSentenceExtract

        with tf.variable_scope("inferring_module"), tf.device("/device:GPU:0"):
            rdim = 768

            batch_size = tf.shape(self.sent1)[0]
            sent_length = self.all_sent.get_shape().as_list()[1]
            update_num = 3
            dim = self.sent1.get_shape().as_list()[-1]

            gru_layer = BiGRU(num_layers=1,
                              num_units=rdim,
                              batch_size=batch_size,
                              input_size=dim,
                              keep_prob=0.9,
                              is_train=self.is_training,
                              activation=tf.nn.tanh)
            seq_len = tf.reduce_sum(self.input_mask, axis=1)
            gru_output = gru_layer(self.all_sent, seq_len=seq_len)

            with tf.variable_scope("att"):
                all_seq_len = self.all_sent.get_shape().as_list()[1]
                cls = tf.tile(tf.expand_dims(self.mark0, axis=1),
                              [1, all_seq_len, 1])
                cat_att = tf.concat([cls, gru_output], axis=2)

                res = tf.layers.dense(cat_att,
                                      units=512,
                                      activation=tf.nn.relu)
                res = tf.layers.dense(res, units=1, use_bias=False)
                res_mask = tf.expand_dims(tf.cast(self.input_mask, tf.float32),
                                          axis=2)
                res = res - (1 - res_mask) * 10000.0

                alpha = tf.nn.softmax(res, 1)
                gru_vec = tf.reduce_sum(alpha * gru_output, axis=1)
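                # gru_vec is an attention-pooled summary of the GRU outputs,
                # using mark0 (the [CLS]-style vector) as the query.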

            # gru_vec = dropout(gru_vec, self.dropout_rate)
            gru_vec = tf.layers.dense(
                gru_vec,
                768,
                activation=gelu,
                kernel_initializer=create_initializer(0.02))
            gru_vec = dropout(gru_vec, self.dropout_rate)
            gru_vec = layer_norm(gru_vec + self.mark0)
            gru_vec = tf.layers.dense(
                gru_vec,
                768,
                activation=tf.tanh,
                kernel_initializer=create_initializer(0.02))

            text_cnn = TextCNN(2 * rdim, [1, 2, 3, 4, 5, 7], 128)
            img_ext = InteractionExtract(num_units=256, seq_len=sent_length)

            text_vec = text_cnn(gru_output, mask=self.input_mask)
            # rnn_vec, rnn_att = rnn_ext(self.all_sent, input_mask=self.input_mask, mark0=self.mark0)
            img_vec = img_ext(gru_output, self.sent1_mask, self.sent2_mask,
                              self.dropout_rate)

            temp_res = tf.concat([img_vec, gru_vec, text_vec], axis=1)

            return tf.layers.dense(temp_res,
                                   768,
                                   tf.tanh,
                                   kernel_initializer=create_initializer(0.02))
def create_model_old(bert_config, is_training, input_ids_1, input_mask_1,
                     segment_ids_1, input_ids_2, input_mask_2, segment_ids_2,
                     labels, keep_prob, num_labels, use_one_hot_embeddings):
    """Creates a classification model."""
    model_1 = modeling.BertModel(config=bert_config,
                                 is_training=is_training,
                                 input_ids=input_ids_1,
                                 input_mask=input_mask_1,
                                 token_type_ids=segment_ids_1,
                                 use_one_hot_embeddings=use_one_hot_embeddings,
                                 scope="bert")

    model_2 = modeling.BertModel(config=bert_config,
                                 is_training=is_training,
                                 input_ids=input_ids_2,
                                 input_mask=input_mask_2,
                                 token_type_ids=segment_ids_2,
                                 use_one_hot_embeddings=use_one_hot_embeddings,
                                 scope="bert")

    # In the demo, we are doing a simple classification task on the entire
    # segment.
    #
    # If you want to use the token-level output, use model.get_sequence_output()
    # instead.
    output_layer_1 = model_1.get_pooled_output()
    print(output_layer_1.shape)
    output_layer_2 = model_2.get_pooled_output()
    print(output_layer_2.shape)

    # Concatenate the two pooled outputs (additional layers could be added before this).
    output_layer = tf.concat([output_layer_1, output_layer_2], axis=-1)

    # Project the concatenated vector back to hidden_size.
    output_layer = tf.layers.dense(
        output_layer,
        bert_config.hidden_size,
        activation=tf.nn.relu,
        kernel_initializer=modeling.create_initializer(
            bert_config.initializer_range))

    hidden_size = output_layer.shape[-1].value
    print(output_layer.shape)

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable("output_bias", [num_labels],
                                  initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=keep_prob)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        probabilities = tf.nn.softmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

        return (loss, per_example_loss, logits, probabilities)