Example #1
def get_masked_lm_output(albert_config, input_tensor, output_weights,
                         positions, label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=albert_config.embedding_size,
                activation=modeling.get_activation(albert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    albert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[albert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=albert_config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
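For context, a hedged sketch (not part of this example) of how this head is typically wired into an ALBERT-style pre-training model_fn; the variable names below are assumptions:

# Sketch (assumed): typical call site inside a run_pretraining-style model_fn.
(masked_lm_loss, masked_lm_example_loss,
 masked_lm_log_probs) = get_masked_lm_output(
     albert_config,
     model.get_sequence_output(),   # [batch_size, seq_length, hidden_size]
     model.get_embedding_table(),   # tied output weights, [vocab_size, embedding_size]
     masked_lm_positions, masked_lm_ids, masked_lm_weights)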
Example #2
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])

    one_hot_labels = tf.one_hot(
        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
    numerator = tf.reduce_sum(label_weights * per_example_loss)
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator

  return (loss, per_example_loss, log_probs)
Example #3
    def __init__(self, config, input_embedding, attention_mask):
        # Keep variable names the same as BERT
        with tf.variable_scope("bert"):
            with tf.variable_scope("encoder"):
                all_encoder_layers = modeling.transformer_model(
                    input_tensor=input_embedding,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

                self.sequence_output = all_encoder_layers[-1]
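A minimal usage sketch (assumed, not from the original); BertEncoder is a hypothetical name for the class this __init__ belongs to, and the mask helper is BERT's standard one:

# Sketch: build the 3D attention mask transformer_model expects, then run the encoder.
attention_mask = modeling.create_attention_mask_from_input_mask(input_ids, input_mask)
encoder = BertEncoder(config, input_embedding, attention_mask)
sequence_output = encoder.sequence_output  # [batch_size, seq_length, hidden_size]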
Example #4
def get_mlm_output(input_tensor, albert_config, mlm_positions, output_weights,
                   label_ids, label_weights):
    """From run_pretraining.py."""
    input_tensor = gather_indexes(input_tensor, mlm_positions)
    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=albert_config.embedding_size,
                activation=modeling.get_activation(albert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    albert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[albert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [1, -1])
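        # Note: the [1, -1] reshape (vs. [-1] in Example #1) still broadcasts
        # against per_example_loss below, so the weighted loss is unchanged.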
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=albert_config.vocab_size,
                                    dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

        masked_lm_log_probs = tf.reshape(log_probs, [-1, log_probs.shape[-1]])
        masked_lm_predictions = tf.argmax(masked_lm_log_probs,
                                          axis=-1,
                                          output_type=tf.int32)
        # return masked_lm_predictions
        return loss, per_example_loss
Example #5
def get_solubility_output(bert_config, input_tensor, positions,
                         label_solubilities, label_weights, k=3, log=False):
  """Get loss and log probs for the solubility prediction."""
  input_tensor = gather_indexes(input_tensor, positions)
  solubility_range = 100*k + 1
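  # With the default k=3 this gives 301 classes, i.e. solubility discretized into a
  # 0..300 integer label (an assumption about the upstream label encoding).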

  with tf.variable_scope("cls/solubility"):
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)


    output_weights = tf.get_variable(
        "output_weights",
        shape=[solubility_range, bert_config.hidden_size],
        initializer=modeling.create_initializer(bert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias",
        shape=[solubility_range],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_solubilities = tf.reshape(label_solubilities, [-1])
    label_weights = tf.reshape(label_weights, [-1])

    one_hot_labels = tf.one_hot(
        label_solubilities, depth=solubility_range, dtype=tf.float32)

    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
    numerator = tf.reduce_sum(label_weights * per_example_loss)
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator

  return (loss, per_example_loss, log_probs)
Example #6
    def __init__(self, config, tf_dtype, input_hidden, embedding_table):
        # Keep variable names the same as BERT
        with tf.variable_scope("cls"):
            with tf.variable_scope("predictions"):
                with tf.variable_scope("transform"):
                    self.transformed_output = tf.layers.dense(
                        input_hidden,
                        config.hidden_size,
                        activation=modeling.get_activation(config.hidden_act),
                        kernel_initializer=modeling.create_initializer(
                            config.initializer_range))
                    self.transformed_output = modeling.layer_norm(
                        self.transformed_output)

                output_bias = tf.Variable(tf.zeros([config.vocab_size],
                                                   dtype=tf_dtype),
                                          name="output_bias",
                                          dtype=tf_dtype)
                self.final_output = tf.add(
                    tf.matmul(self.transformed_output,
                              tf.transpose(embedding_table)), output_bias)
                self.probs = tf.nn.softmax(self.final_output,
                                           name='token_probs')
Example #7
def get_mlm_logits(input_tensor, albert_config, mlm_positions, output_weights):
    """From run_pretraining.py."""
    input_tensor = gather_indexes(input_tensor, mlm_positions)
    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=albert_config.embedding_size,
                activation=modeling.get_activation(albert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    albert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[albert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
    return logits
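A short sketch (assumed, mirroring Example #4 above) of turning the returned logits into greedy token predictions; sequence_output and embedding_table are placeholder names:

# Sketch: greedy MLM predictions from the logits returned above.
logits = get_mlm_logits(sequence_output, albert_config, mlm_positions, embedding_table)
mlm_log_probs = tf.nn.log_softmax(logits, axis=-1)
mlm_predictions = tf.argmax(mlm_log_probs, axis=-1, output_type=tf.int32)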
Example #8
def get_shuffle_loss(model_config, seq_output, label_ids, label_weights):
    sequence_shape = modeling.get_shape_list(seq_output, expected_rank=[3])
    seq_length = sequence_shape[1]
    width = sequence_shape[2]

    seq_output = tf.reshape(seq_output, [-1, width])
    with tf.variable_scope("cls/shuffle"):
        with tf.variable_scope("transform"):
            seq_output = tf.layers.dense(
                seq_output,
                units=seq_length,
                activation=modeling.get_activation(model_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    model_config.initializer_range))
            seq_output = modeling.layer_norm(seq_output)

        output_bias = tf.get_variable("output_bias",
                                      shape=[seq_length],
                                      initializer=tf.zeros_initializer())

        logits = tf.nn.bias_add(seq_output, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(tf.cast(label_weights, tf.float32), [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=seq_length,
                                    dtype=tf.float32)

        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return loss, per_example_loss, log_probs
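Here the dense layer projects each token to seq_length logits, so every position is classified against its original index; a hedged sketch of reading off the predicted positions from the returned log-probs:

# Sketch (assumed): predicted original index for each token position.
predicted_positions = tf.argmax(log_probs, axis=-1, output_type=tf.int32)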
Example #9
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        loss = tf.reshape(per_example_loss, [-1, tf.shape(positions)[1]])
        # TODO: dynamic gather from per_example_loss
    return loss
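A hedged sketch (not from the original) of collapsing the per-position losses returned above into a scalar, assuming zero-padded entries in `positions` mark padding:

# Sketch: masked mean over the [batch_size, num_positions] loss returned above.
position_weights = tf.cast(tf.not_equal(positions, 0), tf.float32)
mean_loss = tf.reduce_sum(loss * position_weights) / (
    tf.reduce_sum(position_weights) + 1e-5)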
Example #10
def get_logits(bert_config, input_tensor, output_weights, positions):
  """Get logits for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
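
    # When the factorized embedding size is smaller than the hidden size (as in
    # ALBERT-style configs), the tied output weights are padded below with extra
    # trainable columns so the matmul matches the transformed hidden states.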

    if bert_config.hidden_size != bert_config.embedding_size:
      extra_output_weights = tf.get_variable(
          name="extra_output_weights",
          shape=[
              bert_config.vocab_size,
              bert_config.hidden_size - bert_config.embedding_size],
          initializer=modeling.create_initializer(
              bert_config.initializer_range))
      output_weights = tf.concat(
          [output_weights, extra_output_weights], axis=1)
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    return logits
Example #11
File: main.py  Project: eastonYi/LM_EVAL
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

    return log_probs
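Since this variant only returns log_probs, an evaluation script can score reference tokens with them directly; a minimal sketch, assuming label_ids holds the original token ids at the gathered positions:

# Sketch (assumed): per-position log-likelihood of the reference tokens.
log_probs = get_masked_lm_output(bert_config, sequence_output, embedding_table, positions)
one_hot = tf.one_hot(label_ids, depth=bert_config.vocab_size, dtype=tf.float32)
token_log_likelihood = tf.reduce_sum(log_probs * one_hot, axis=-1)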
Example #12
    def feed_neural_work(self):
        '''
        input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False'''
        # `sequence_output` shape = [batch_size, seq_length, hidden_size].
        self.all_encoder_layers, self.context_bias = modeling.transformer_model(
            self.embedded_chars_q,
            attention_mask=self.attention_mask,
            hidden_size=self.config.hidden_size,
            num_hidden_layers=self.config.num_hidden_layers,
            num_attention_heads=self.config.num_attention_heads,
            intermediate_size=self.config.intermediate_size,
            intermediate_act_fn=modeling.get_activation(
                self.config.hidden_act),
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            initializer_range=self.config.initializer_range,
            do_return_all_layers=True,
            t5_relative_bias=self.t5_att_bias)
        self.sequence_output = self.all_encoder_layers[-1]
        # The "pooler" converts the encoded sequence tensor of shape
        # [batch_size, seq_length, hidden_size] to a tensor of shape
        # [batch_size, hidden_size]. This is necessary for segment-level
        # (or segment-pair-level) classification tasks where we need a fixed
        # dimensional representation of the segment.
        with tf.variable_scope("pooler"):
            # We "pool" the model by simply taking the hidden state corresponding
            # to the first token. We assume that this has been pre-trained

            if self.transformer_ret_pooling == "mean":
                print('self.seq_lent:', self.seq_lent)
                print('tf.reduce_sum(self.sequence_output,axis=1):',
                      tf.reduce_sum(self.sequence_output, axis=1))

                self.pooled_output = tf.reduce_sum(self.sequence_output,
                                                   axis=1) * self.seq_lent
            elif self.transformer_ret_pooling == "last":
                self.pooled_output = self.sequence_output[:, -1, :]
            elif self.transformer_ret_pooling == "max":
                self.pooled_output = tf.reduce_max(self.sequence_output,
                                                   axis=1)
            else:
                print('wrong transformer_ret_pooling:',
                      self.transformer_ret_pooling)
                exit(0)

            if 'adding_problem' not in self.dataset:
                #we add dropout for pooled_output
                self.pooled_output = modeling.layer_norm(
                    tf.nn.dropout(self.pooled_output,
                                  keep_prob=1.0 - self.input_dropout_prob))

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[self.config.hidden_size, self.max_input_right],
                initializer=initializer())
            b = tf.Variable(tf.constant(0.1, shape=[self.max_input_right]),
                            name="b")
            l2_loss = tf.constant(0.0)
            l2_loss += tf.nn.l2_loss(W)
            self.scores = tf.nn.xw_plus_b(self.pooled_output,
                                          W,
                                          b,
                                          name="scores")
            print(self.scores)

            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        if 'adding_problem' not in self.dataset:
            # Calculate mean cross-entropy loss
            with tf.name_scope("loss"):
                losses = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.scores, labels=self.input_y)
                self.l2_loss = l2_loss * self.l2_reg_lambda
                self.loss = tf.reduce_mean(losses) + self.l2_loss
            # Accuracy
            with tf.name_scope("accuracy"):
                correct_predictions = tf.equal(self.predictions,
                                               tf.argmax(self.input_y, 1))
                self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                       "float"),
                                               name="accuracy")
        else:
            with tf.name_scope("loss"):
                losses = tf.nn.l2_loss(self.scores -
                                       tf.expand_dims(self.input_y, -1))
                print('losses:', losses)

                self.l2_loss = self.l2_reg_lambda * l2_loss
                self.loss = tf.reduce_mean(losses) + self.l2_loss * 1e-3

            with tf.name_scope("accuracy"):
                correct_predictions = tf.less_equal(
                    tf.abs(self.scores[:, 0] - self.input_y),
                    tf.constant([0.04]))
                self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                       "float"),
                                               name="accuracy")
Example #13
def gec_create_model(bert_config, is_training, input_sequence, 
  input_mask, segment_ids, edit_sequence, 
  use_one_hot_embeddings, mode, 
  copy_weight, 
  use_bert_more, 
  insert_ids,
  multitoken_insert_ids,
  subtract_replaced_from_replacement):
  """Creates a classification model."""
  # insert_ids: word ids of unigram inserts (list)
  # multitoken_insert_ids: word_ids of bigram inserts (list of tuples of length 2)
  # Defining the space of all possible edits: 
  # unk, sos and eos are dummy edits mapped to 0, 1 and 2 respectively
  # copy is mapped to 3
  # del is mapped to 4
  num_appends = len(insert_ids) + len(multitoken_insert_ids)
  num_replaces = num_appends # appends and replacements come from the same set (inserts and multitoken_inserts)
  append_begin = 5 # First append edit (mapped to 5)
  append_end = append_begin + num_appends - 1 #Last append edit
  rep_begin = append_end + 1 # First replace edit
  rep_end = rep_begin + num_replaces - 1 #Last replace edit  
  num_suffix_transforms = 58 #num of transformation edits
  num_labels = 5 + num_appends + num_replaces + num_suffix_transforms # total number of edits
  print("************ num of labels : {} ***************".format(num_labels))

  config = bert_config
  input_sequence_shape = modeling.get_shape_list(input_sequence,2)
  batch_size = input_sequence_shape[0]
  seq_len = input_sequence_shape[1]

  if not use_bert_more:  #default use of bert (without logit factorisation)
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_sequence,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    output_layer = model.get_sequence_output()
  else:                 # LOGIT FACTORISATION is On!
    model = modified_modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_sequence,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    output_layer = model.get_sequence_output()
    replace_layer = output_layer[:,seq_len:2*seq_len,:]  #representation of replacement slots as described in paper
    append_layer = output_layer[:,2*seq_len:3*seq_len,:] #representation of append slots as described in paper
    output_layer = output_layer[:,0:seq_len,:]

  output_layer_shape = modeling.get_shape_list(output_layer,3)
  hidden_size = output_layer_shape[-1]

  flattened_output_layer = tf.reshape(output_layer,[-1, hidden_size])

  h_edit = flattened_output_layer

  if use_bert_more:
    h_word = flattened_output_layer
    flattened_replace_layer = tf.reshape(replace_layer,[-1, hidden_size])
    flattened_append_layer = tf.reshape(append_layer, [-1, hidden_size])

    m_replace = flattened_replace_layer    
    m_append = flattened_append_layer

    
    with tf.variable_scope("cls/predictions"):
      with tf.variable_scope("transform"):
        h_word = tf.layers.dense(
            h_word,
            units=bert_config.hidden_size,
            activation=modeling.get_activation(bert_config.hidden_act),
            kernel_initializer=modeling.create_initializer(
                bert_config.initializer_range))
        h_word = modeling.layer_norm(h_word)

    with tf.variable_scope("cls/predictions",reuse=True):
      with tf.variable_scope("transform",reuse=True):
        m_replace = tf.layers.dense(
            m_replace,
            units=bert_config.hidden_size,
            activation=modeling.get_activation(bert_config.hidden_act),
            kernel_initializer=modeling.create_initializer(
                bert_config.initializer_range))
        m_replace = modeling.layer_norm(m_replace)

    with tf.variable_scope("cls/predictions",reuse=True):
      with tf.variable_scope("transform",reuse=True):
        m_append = tf.layers.dense(
            m_append,
            units=bert_config.hidden_size,
            activation=modeling.get_activation(bert_config.hidden_act),
            kernel_initializer=modeling.create_initializer(
                bert_config.initializer_range))
        m_append = modeling.layer_norm(m_append)
    
    word_embedded_input = model.word_embedded_input
    flattened_word_embedded_input = tf.reshape(word_embedded_input, [-1, hidden_size])    

  labels = edit_sequence
  
  edit_weights = tf.get_variable(
      "edit_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  if is_training:
    h_edit = tf.nn.dropout(h_edit, keep_prob=0.9) 

  if use_bert_more:
      # append/replace weight vector for a given append or replace operation
      # correspond to word embedding for its token argument
      # for multitoken append/replace (e.g. has been)
      # weight vector is sum of word embeddings of token arguments

      append_weights = edit_word_embedding_lookup(model.embedding_table, insert_ids,
       use_one_hot_embeddings, config.vocab_size, config.hidden_size)
      replace_weights = append_weights #tokens in replace and append vocab are same 
                                       #(i.e. inserts and multitoken_inserts)

      multitoken_append_weights = wem_utils.edit_embedding_loopkup(model.embedding_table, multitoken_insert_ids,
                        use_one_hot_embeddings, config.vocab_size, config.hidden_size)
      multitoken_replace_weights = multitoken_append_weights #tokens in replace and append vocab are same 
                                                             #(i.e. inserts and multitoken_inserts)

      append_weights = tf.concat([append_weights, multitoken_append_weights],0)
      replace_weights = tf.concat([replace_weights, multitoken_replace_weights],0)

  with tf.variable_scope("loss"):
    edit_logits = tf.matmul(h_edit, edit_weights, transpose_b=True) #first term in eq3 in paper
    logits = edit_logits
    if use_bert_more:

      #=============== inplace_word_logits==============# #2nd term in eq3 in paper
      inplace_logit = tf.reduce_sum(h_word * flattened_word_embedded_input, axis=1, keepdims=True) #copy
      #inplace_logit = tf.reduce_sum(m_replace * flattened_word_embedded_input, axis=1, keepdims=True) #copy
      inplace_logit_appends = tf.tile(inplace_logit,[1,num_appends])
      inplace_logit_transforms = tf.tile(inplace_logit,[1,num_suffix_transforms])
      zero_3_logits = tf.zeros([batch_size*seq_len,3]) #unk sos eos 
      zero_1_logits = tf.zeros([batch_size*seq_len,1]) # del
      zero_replace_logits = tf.zeros([batch_size*seq_len,num_replaces])

      concat_list = [zero_3_logits, inplace_logit, zero_1_logits]\
                  + [inplace_logit_appends]\
                  + [zero_replace_logits]\
                  + [inplace_logit_transforms]

      inplace_word_logits = tf.concat(concat_list,1)

      #======additional (insert,replace) logits ====# #3rd term in eqn3 in paper
      zero_5_logits = tf.zeros([batch_size*seq_len,5])      
      append_logits = tf.matmul(m_append, append_weights, transpose_b=True)

      if subtract_replaced_from_replacement:
        replace_logits = replacement_minus_replaced_logits(m_replace, 
          flattened_word_embedded_input, replace_weights)
      else:
        replace_logits = tf.matmul(m_replace, replace_weights, transpose_b=True)
      
      suffix_logits  = tf.zeros([batch_size*seq_len,num_suffix_transforms])
      
      concat_list = [zero_5_logits, append_logits, replace_logits, suffix_logits]
      additional_logits = tf.concat(concat_list,1)
      #====================================================#

      logits = edit_logits + inplace_word_logits + additional_logits
      logits_bias = tf.get_variable("output_bias", shape=[num_labels], initializer=tf.zeros_initializer())
      logits += logits_bias
    
    logits = tf.reshape(logits, [output_layer_shape[0], output_layer_shape[1], num_labels])
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    probs = tf.nn.softmax(logits,axis=-1)
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
    per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    per_token_loss = per_token_loss * tf.to_float(input_mask)
    mask = copy_weight*tf.to_float(tf.equal(labels,3)) +  tf.to_float(tf.not_equal(labels,3))
    masked_per_token_loss = per_token_loss * mask
    per_example_loss = tf.reduce_sum(masked_per_token_loss, axis=-1)
    loss = tf.reduce_mean(per_example_loss)            

    return (loss, per_example_loss, logits, probs)
Example #14
    def body(self, features):
        hparams = self.hparams
        if not self.is_training:
            hparams.dropout_prob = 0.0

        with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
            # attention_weights: [batch, n_head, from_len, to_len]
            sequence_output, cls_vector, attention_weights = self.build_encoder(
                features)

        if 'targets' not in features:
            assert self.hparams.dropout_prob == 0.0
            logits, losses = self.greedy_decode_8steps(cls_vector,
                                                       sequence_output)
            logits.update(attention_weights=attention_weights[:, :, 0, :])
            return logits, losses

        with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
            with tf.variable_scope('embeddings', reuse=tf.AUTO_REUSE):
                premise = features[
                    'targets']  # [batch, premise_len=8] -bad naming:(
                # [batch, premise_len, hid_size]
                premise_vecs = premise_gather_nd(sequence_output, premise)

                batch_size = tf.shape(premise)[0]
                premise_len = premise.shape.as_list()[-1]
                theorem = features['theorem']  # batch, 1

                # [batch, 1, hid_size] and [num_theorems, hid_size]
                theorem_vec, theorem_emb_table = modeling.embedding_lookup(
                    input_ids=theorem,  # [batch, 1]
                    vocab_size=hparams.num_theorems,
                    embedding_size=hparams.hidden_size,
                    initializer_range=hparams.initializer_range,
                    word_embedding_name='theorem_embedding',
                )
                depth = features['depth']  # batch, 1

                decoder_input = tf.concat(
                    [
                        cls_vector,  # [batch, 1, hid_size]
                        theorem_vec,  # [batch, 1, hid_size]
                        premise_vecs[:, :
                                     -1, :]  # [batch, premise_len-1, hid_size]
                    ],
                    axis=1)  # [batch, premise_len + 1, hid_size]
                decode_length = decoder_input.shape.as_list()[1]
                assert decode_length == premise_len + 1

                # [decode_length, hid_size]
                pos_embedding, _ = modeling.embedding_lookup(
                    input_ids=tf.range(decode_length),  # [decode_length]
                    vocab_size=hparams.max_premise,  # >= premise_len
                    embedding_size=hparams.hidden_size,
                    initializer_range=hparams.initializer_range,
                    word_embedding_name='positional_embedding',
                )
                pos_embedding = tf.reshape(
                    pos_embedding, [1, decode_length, hparams.hidden_size])

                decoder_input = modeling.layer_norm_and_dropout(
                    decoder_input +  # [batch, decode_length, hid_size]
                    pos_embedding,  # [1,     decode_length, hid_size]
                    hparams.dropout_prob)  # [batch, decode_length, hid_size]

            with tf.variable_scope('transformer', reuse=tf.AUTO_REUSE):
                causal_attention_mask = t2t_model.common_layers.ones_matrix_band_part(
                    rows=decode_length,
                    cols=decode_length,
                    num_lower=-1,  # attend to everything before
                    num_upper=0,  # attend to nothing after
                    out_shape=[1, decode_length, decode_length
                               ])  # 1, decode_length, decode_length

                # [batch, decode_length, decode_length]
                causal_attention_mask = tf.tile(causal_attention_mask,
                                                [batch_size, 1, 1])

                all_decoder_layers = modeling.transformer_model(
                    input_tensor=decoder_input,
                    attention_mask=causal_attention_mask,
                    hidden_size=hparams.hidden_size,
                    num_hidden_layers=hparams.num_decode_layers,
                    num_attention_heads=hparams.num_attention_heads,
                    intermediate_size=hparams.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        hparams.hidden_act),
                    hidden_dropout_prob=hparams.dropout_prob,
                    attention_probs_dropout_prob=hparams.dropout_prob,
                    initializer_range=hparams.initializer_range,
                    do_return_all_layers=True,
                    attention_top_k=hparams.attention_top_k)

                decoder_output, _ = all_decoder_layers[
                    -1]  # [batch, dec_len, hid_size]
                theorem_feature = decoder_output[:, 0, :]  # [batch, hid_size]
                premise_feature = decoder_output[:,
                                                 1:, :]  # [batch, tar_len, hid_size]

        with tf.variable_scope('prediction', reuse=tf.AUTO_REUSE):
            theorem_logits = tf.keras.layers.Dense(  # [batch, num_theorems]
                name='theorem',
                units=hparams.num_theorems,
                use_bias=True,
                kernel_initializer=modeling.create_initializer(
                    hparams.initializer_range))(theorem_feature)

            premise_logits = tf.matmul(
                a=premise_feature,  # [batch, premise_len, hid_size]
                b=sequence_output,  # [batch, sequence_len, hid_size]
                transpose_b=True,
            )  # [batch, premise_len, sequence_len]

            # [batch * premise_len, sequence_len]
            seq_len = premise_logits.shape.as_list()[-1]
            premise_logits = tf.reshape(premise_logits, [-1, seq_len])

            premise_weights = tf.cast(premise > 0,
                                      tf.float32)  # [batch, prem_len]
            premise_weights = tf.reshape(premise_weights,
                                         [-1])  # [batch * prem_len]
            premise = tf.reshape(premise, [-1, 1])  # [batch * prem_len, 1]

            theorem_loss = tf.losses.sparse_softmax_cross_entropy(
                labels=theorem,  # [batch, 1]
                logits=theorem_logits  # [batch, num_theorems]
            )
            premise_loss = tf.losses.sparse_softmax_cross_entropy(
                labels=premise,  # [batch * premise_len, 1]
                logits=premise_logits,  # [batch * premise_len, sequence_len]
                weights=premise_weights  # [batch * premise_len]
            )

            logits = dict(theorem_logits=theorem_logits,
                          theorem_labels=theorem,
                          premise_logits=premise_logits,
                          premise_labels=premise)

            losses = dict(training=theorem_loss + premise_loss,
                          theorem_loss=theorem_loss,
                          premise_loss=premise_loss)

        return logits, losses
Example #15
    def feed_neural_work(self):
        '''
        input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False'''
        # `sequence_output` shape = [batch_size, seq_length, hidden_size].
        self.all_encoder_layers, self.context_bias = modeling.transformer_model(
            self.embedded_chars_q,
            attention_mask=self.attention_mask,
            hidden_size=self.config.hidden_size,
            num_hidden_layers=self.config.num_hidden_layers,
            num_attention_heads=self.config.num_attention_heads,
            intermediate_size=self.config.intermediate_size,
            intermediate_act_fn=modeling.get_activation(
                self.config.hidden_act),
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            initializer_range=self.config.initializer_range,
            do_return_all_layers=True,
            t5_relative_bias=self.t5_att_bias)
        self.sequence_output = self.all_encoder_layers[-1]

        with tf.variable_scope("pooler"):
            if self.transformer_ret_pooling == "mean":
                print('self.seq_lent:', self.seq_lent)
                print('tf.reduce_sum(self.sequence_output,axis=1):',
                      tf.reduce_sum(self.sequence_output, axis=1))

                self.pooled_output = tf.reduce_sum(self.sequence_output,
                                                   axis=1) * self.seq_lent
            elif self.transformer_ret_pooling == "last":
                self.pooled_output = self.sequence_output[:, -1, :]
            elif self.transformer_ret_pooling == "max":
                self.pooled_output = tf.reduce_max(self.sequence_output,
                                                   axis=1)
            else:
                print('wrong transformer_ret_pooling:',
                      self.transformer_ret_pooling)
                exit(0)

            #we add dropout for pooled_output
            if 'adding_problem' not in self.dataset:
                self.pooled_output = modeling.layer_norm(
                    tf.nn.dropout(self.pooled_output,
                                  keep_prob=1.0 - self.input_dropout_prob))

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[self.config.hidden_size, self.max_input_right],
                initializer=initializer(),
            )
            b = tf.Variable(tf.constant(0.1, shape=[self.max_input_right]),
                            name="b")
            l2_loss = tf.constant(0.0)
            l2_loss += tf.nn.l2_loss(W)

            self.scores = tf.nn.xw_plus_b(self.pooled_output,
                                          W,
                                          b,
                                          name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        if 'adding_problem' not in self.dataset:
            # Calculate mean cross-entropy loss
            with tf.name_scope("loss"):
                self.l2_loss = self.l2_reg_lambda * l2_loss
                losses = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.scores, labels=self.input_y)

                self.loss = tf.reduce_mean(losses)  #+ self.l2_loss
            # Accuracy
            with tf.name_scope("accuracy"):
                correct_predictions = tf.equal(self.predictions,
                                               tf.argmax(self.input_y, 1))
                self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                       "float"),
                                               name="accuracy")
        else:
            with tf.name_scope("loss"):
                self.l2_loss = self.l2_reg_lambda * l2_loss
                losses = tf.nn.l2_loss(self.scores -
                                       tf.expand_dims(self.input_y, -1))
                print('losses:', losses)
                self.loss = tf.reduce_mean(losses)  #+ self.l2_loss

            with tf.name_scope("accuracy"):
                correct_predictions = tf.less_equal(
                    tf.abs(self.scores[:, 0] - self.input_y),
                    tf.constant([0.04]))
                self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                       "float"),
                                               name="accuracy")
Example #16
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    # input_tensor:[batch_size, seq_length, hidden_size]
    # positions:[batch_size, mask_num]
    # output_weights: [vocab_size, embedding_size]
    # -> input_tensor:[batch_size*mask_num, hidden_size]
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            # Apply one more non-linear transformation before the output;
            # it only takes effect during pre-training.
            # new input_tensor:[batch_size*mask_num, hidden_size]
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            # new input_tensor:[batch_size*mask, hidden_size]
            input_tensor = modeling.layer_norm(input_tensor)

        tf.logging.info("input tensor shape after transform:{}".format(
            input_tensor.shape))

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        # output_bias:[vocab_size,]
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())

        # input_tensor:[batch_size*mask_num, hidden_size]
        # output weights: [vocab_size, embedding_size=hidden_size]
        # logits:[batch_size*mask_num, vocab_size]
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        # output_bias:[vocab_size]
        logits = tf.nn.bias_add(logits, output_bias)
        # log_probs:[batch_size*mask_num, vocab_size]
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        #label_ids:[batch_size, mask_num]
        #new label_ids:[batch_size*mask_num, 1]
        label_ids = tf.reshape(label_ids, [-1])
        #new label_weights:[batch_size*mask_num, 1]
        label_weights = tf.reshape(label_weights, [-1])

        # one_hot_labels:[batch_size*mask_num, vocab_size]
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.

        # log_probs:[batch_size*mask_num, vocab_size]
        # one_hot_labels:[batch_size*mask_num, vocab_size]
        # per_example_loss:[batch_size*mask,]
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])  # cross-entropy loss
        # Multiply by the per-example label weights
        #label_weights:[batch_size*mask, 1]
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        # Loss normalized by the sum of the label weights
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
Example #17
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    # Call site: bert_config=bert_config, input_tensor=model.get_sequence_output(),
    # output_weights=model.get_embedding_table(), positions=masked_lm_positions,
    # label_ids=masked_lm_ids, label_weights=masked_lm_weights
    # positions: see masked_lm_positions in create_pretraining_data.py
    # label_ids: see masked_lm_labels in create_pretraining_data.py
    # When computing the MLM loss, we first obtain vectors for the whole sentence,
    # then select the vectors at the 15% masked positions and compute the loss on
    # those alone. This is why 10% of the masked tokens are kept unchanged:
    # otherwise the loss would never include the correct original words, since the
    # other 85% of tokens only contribute context and do not enter the loss.
    # Replacing 10% of the masks with random tokens likely improves the encoder's
    # robustness to errors, since real sentences may also contain wrong words.
    # Masking out 80% mainly trains comprehension: predicting the current token
    # from its surrounding context.
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        # input_tensor.shape = (160, 768), output_weights.shape = (21128 (vocab_size), 768)
        logits = tf.matmul(input_tensor, output_weights,
                           transpose_b=True)  #logits.shape=(160,21128)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        #label_ids.shape = (8,20)
        label_ids = tf.reshape(label_ids, [-1])
        #label_ids.shape = (160)
        #label_weights.shape=(8,20)
        label_weights = tf.reshape(label_weights,
                                   [-1])  # label_weights are the mask weights;
        # in this program they are all 1
        # label_weights.shape = (160,)

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)
        # one_hot_labels.shape = (160, 21128): 160 tokens in total, each encoded as a
        # vocab_size one-hot vector, in preparation for the loss computation below.

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
Example #18
def main(args):
    bert_config = modeling.BertConfig.from_json_file(args.config)
    bert_config.hidden_dropout_prob = 0.0
    bert_config.attention_probs_dropout_prob = 0.0

    batch_size = args.batch_size
    avg_seq_len = args.avg_seq_length
    max_seq_len = args.max_seq_length
    tf_dtype = tf.float16 if args.precision == 'fp16' else tf.float32

    # fake input array length
    input_len = np.random.randint(low=2 * avg_seq_len - max_seq_len,
                                  high=max_seq_len + 1,
                                  size=(batch_size),
                                  dtype=np.int32)
    valid_word_num = sum(input_len)

    # fake input id and mask
    input_ids = np.random.randint(low=0,
                                  high=bert_config.vocab_size,
                                  size=(batch_size, max_seq_len),
                                  dtype=np.int32)
    input_mask = np.zeros((batch_size, max_seq_len), dtype=np.int32)
    for b_idx, s_len in enumerate(input_len):
        input_mask[b_idx][:s_len] = 1

    input_ids_tensor = tf.convert_to_tensor(input_ids, dtype=tf.int32)
    input_mask_tensor = tf.convert_to_tensor(input_mask, dtype=tf.int32)

    # fake embedding output
    embed_output = np.random.randn(batch_size, max_seq_len,
                                   bert_config.hidden_size)
    input_tensor = tf.convert_to_tensor(embed_output, dtype=tf_dtype)

    # keep attention_mask for compatible reason
    att_mask = np.tile(input_mask, max_seq_len)
    att_mask = att_mask.reshape(batch_size, max_seq_len, max_seq_len)
    attention_mask = tf.convert_to_tensor(att_mask, dtype=tf_dtype)

    # input info
    valid_word_num = sum(input_len)
    print("Valid word num : {}/{}, avg sequence length : {:.6} ".format(
        valid_word_num, batch_size * max_seq_len, valid_word_num / batch_size))

    # bert with standard transformer
    std_bert = modeling.transformer_model(
        input_tensor=input_tensor,
        attention_mask=attention_mask,
        hidden_size=bert_config.hidden_size,
        num_hidden_layers=bert_config.num_hidden_layers,
        num_attention_heads=bert_config.num_attention_heads,
        intermediate_size=bert_config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(bert_config.hidden_act),
        hidden_dropout_prob=bert_config.hidden_dropout_prob,
        attention_probs_dropout_prob=bert_config.attention_probs_dropout_prob,
        initializer_range=bert_config.initializer_range,
        do_return_all_layers=False)

    config = tf.ConfigProto()
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    with tf.Session(config=config) as sess:
        # init weights
        sess.run(tf.global_variables_initializer())

        # get transformer weights
        all_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        transformer_vars = [v for v in all_vars if v.name.startswith('layer')]
        weights_value = sess.run(transformer_vars)

        # bert with effective transformer
        et_bert = effective_transformer.get_sequence_output(
            max_batch_size=batch_size,
            max_seq_length=max_seq_len,
            config=bert_config,
            attention_mask=attention_mask,
            input_mask=input_mask_tensor,
            from_tensor=input_tensor,
            weights_value=weights_value,
        )

        # diff
        val1 = sess.run(std_bert).reshape(-1, 768)
        val2 = sess.run(et_bert).reshape(-1, 768)
        diff = []
        for b_idx, s_len in enumerate(input_len):
            for w_idx in range(s_len):
                idx = b_idx * args.max_seq_length + w_idx
                diff.append(np.fabs(val1[idx] - val2[idx]).max())
        print("max diff : {:.6}, avg diff : {:.6}.".format(
            max(diff),
            sum(diff) / len(diff)))

        def time_inference(output_tensor):
            iter_num = 128
            # warm up
            for i in range(10):
                sess.run(output_tensor)

            beg = datetime.now()
            for i in range(iter_num):
                sess.run(output_tensor)
            end = datetime.now()
            return (end - beg).total_seconds() * 1000 / iter_num  # ms

        print("xla cost : {:.6} ms".format(time_inference(std_bert)))
        print("et  cost : {:.6} ms".format(time_inference(et_bert)))
Example #19
  return model_fn


def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_ids = tf.reshape(label_ids, [-1])
Example #20
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     mix_number=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     groups=None,
                     expansion=None,
                     drop_rate=None,
                     gating_reduction=None,
                     **unused_params):
        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)

        config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
        config = copy.deepcopy(config)

        config.num_hidden_layers = FLAGS.bert_hidden_layer
        config.num_attention_heads = FLAGS.bert_attention_heads
        config.hidden_dropout_prob = FLAGS.bert_dropout_prob
        config.attention_probs_dropout_prob = FLAGS.bert_dropout_prob

        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        with tf.variable_scope("encoder"):
            self.all_encoder_layers = modeling.transformer_model(
                input_tensor=model_input,
                attention_mask=None,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

        model_input = self.all_encoder_layers[-1]

        if FLAGS.sample_random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   FLAGS.iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     FLAGS.iterations)

        cluster_size = cluster_size or FLAGS.nextvlad_cluster_size
        hidden1_size = hidden_size or FLAGS.nextvlad_hidden_size
        gating_reduction = gating_reduction or FLAGS.gating_reduction
        groups = groups or FLAGS.groups
        drop_rate = drop_rate or FLAGS.drop_rate
        mix_number = mix_number or FLAGS.mix_number
        expansion = expansion or FLAGS.expansion

        max_frames = model_input.get_shape().as_list()[1]
        mask = tf.sequence_mask(FLAGS.iterations, max_frames, dtype=tf.float32)

        # per-frame mean activation, used below to predict the per-branch mixture weights
        ftr_mean = tf.reduce_mean(model_input, axis=-1)
        ftr_mean = slim.batch_norm(ftr_mean,
                                   center=True,
                                   scale=True,
                                   fused=True,
                                   is_training=is_training,
                                   scope="mix_weights_bn")
        mix_weights = slim.fully_connected(
            ftr_mean,
            mix_number,
            activation_fn=None,
            weights_initializer=slim.variance_scaling_initializer(),
            scope="mix_weights")
        mix_weights = tf.nn.softmax(mix_weights, axis=-1)
        tf.summary.histogram("mix_weights", mix_weights)

        results = []
        for n in range(mix_number):
            with tf.variable_scope("branch_%d" % n):
                res = self.nextvlad_model(video_ftr=model_input[:, :, 0:1024],
                                          audio_ftr=model_input[:, :, 1024:],
                                          vocab_size=vocab_size,
                                          max_frames=max_frames,
                                          cluster_size=cluster_size,
                                          groups=groups,
                                          expansion=expansion,
                                          drop_rate=drop_rate,
                                          hidden1_size=hidden1_size,
                                          is_training=is_training,
                                          gating_reduction=gating_reduction,
                                          mask=mask,
                                          **unused_params)
                results.append(res)

        aux_preds = [res["predictions"] for res in results]
        logits = [res["logits"] for res in results]
        logits = tf.stack(logits, axis=1)

        mix_logit = tf.reduce_sum(tf.multiply(tf.expand_dims(mix_weights, -1),
                                              logits),
                                  axis=1)

        pred = tf.nn.sigmoid(mix_logit)

        if is_training:
            rank_pred = tf.expand_dims(tf.nn.softmax(tf.div(
                mix_logit, FLAGS.cl_temperature),
                                                     axis=-1),
                                       axis=1)
            aux_rank_preds = tf.nn.softmax(tf.div(logits,
                                                  FLAGS.cl_temperature),
                                           axis=-1)
            epsilon = 1e-8
            kl_loss = tf.reduce_sum(rank_pred *
                                    (tf.log(rank_pred + epsilon) -
                                     tf.log(aux_rank_preds + epsilon)),
                                    axis=-1)

            regularization_loss = FLAGS.cl_lambda * tf.reduce_mean(
                tf.reduce_sum(kl_loss, axis=-1), axis=-1)

            return {
                "predictions": pred,
                "regularization_loss": regularization_loss,
                "aux_predictions": aux_preds
            }
        else:
            return {"predictions": pred}
Example #21
def get_masked_span_output(bert_config, input_tensor, input_mask, positions,
                           start_labels, end_labels, label_weights):
    """Get loss and log probs for the recurring span masking."""
    sequence_shape = modeling.get_shape_list(input_tensor, expected_rank=3)
    batch_size = sequence_shape[0]
    seq_length = sequence_shape[1]
    width = sequence_shape[2]
    num_positions = modeling.get_shape_list(positions, expected_rank=2)[1]

    query_tensor = gather_indexes(input_tensor, positions)  # [batch_size * num_positions, width]

    with tf.variable_scope("cls/span_predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("query_start_transform"):
            query_start_tensor = tf.layers.dense(
                query_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            query_start_tensor = modeling.layer_norm(query_start_tensor)

        with tf.variable_scope("query_end_transform"):
            query_end_tensor = tf.layers.dense(
                query_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            query_end_tensor = modeling.layer_norm(query_end_tensor)

        with tf.variable_scope("start_transform"):
            start_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            start_tensor = modeling.layer_norm(start_tensor)

        with tf.variable_scope("end_transform"):
            end_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            end_tensor = modeling.layer_norm(end_tensor)

        start_classifier = tf.get_variable(
            "start_classifier",
            shape=[bert_config.hidden_size, bert_config.hidden_size],
            initializer=modeling.create_initializer(
                    bert_config.initializer_range))

        end_classifier = tf.get_variable(
            "end_classifier",
            shape=[bert_config.hidden_size, bert_config.hidden_size],
            initializer=modeling.create_initializer(
                bert_config.initializer_range))

        input_mask = tf.expand_dims(input_mask, axis=1)  # [batch_size, 1, seq_length]
        adder = (1.0 - tf.cast(input_mask, tf.float32)) * -10000.0

        temp = tf.matmul(query_start_tensor, start_classifier)  # [batch_size * num_positions, width]
        temp = tf.reshape(temp, [batch_size, num_positions, width])  # [batch_size, num_positions, width]
        start_tensor = tf.transpose(start_tensor, perm=[0, 2, 1])  # [batch_size, width, seq_length]
        start_logits = tf.matmul(temp, start_tensor)  # [batch_size, num_positions, seq_length]
        start_logits += adder
        start_logits = tf.reshape(start_logits, [batch_size * num_positions, seq_length])

        temp = tf.matmul(query_end_tensor, end_classifier)  # [batch_size * num_positions, width]
        temp = tf.reshape(temp, [batch_size, num_positions, width])  # [batch_size, num_positions, width]
        end_tensor = tf.transpose(end_tensor, perm=[0, 2, 1])  # [batch_size, width, seq_length]
        end_logits = tf.matmul(temp, end_tensor)  # [batch_size, num_positions, seq_length]
        end_logits += adder
        end_logits = tf.reshape(end_logits, [batch_size * num_positions, seq_length])

        label_weights = tf.reshape(label_weights, [-1])  # [batch_size * num_positions]

        start_log_probs = tf.nn.log_softmax(start_logits, axis=-1)  # [batch_size * num_positions, seq_length]
        start_labels = tf.reshape(start_labels, [-1])  # [batch_size * num_positions]
        start_one_hot_labels = tf.one_hot(
            start_labels, depth=seq_length, dtype=tf.float32)  # [batch_size * num_positions, seq_length]

        start_per_example_loss = -tf.reduce_sum(start_log_probs * start_one_hot_labels, axis=[-1])

        end_log_probs = tf.nn.log_softmax(end_logits, axis=-1)  # [batch_size * num_positions, seq_length]
        end_labels = tf.reshape(end_labels, [-1])  # [batch_size * num_positions]
        end_one_hot_labels = tf.one_hot(
            end_labels, depth=seq_length, dtype=tf.float32)  # [batch_size * num_positions, seq_length]

        end_per_example_loss = -tf.reduce_sum(end_log_probs * end_one_hot_labels, axis=[-1])

        per_example_loss = (start_per_example_loss + end_per_example_loss) / 2
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return loss, per_example_loss
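The additive masking used above can be illustrated with a small NumPy sketch (toy values, not from the original code): padded positions get a large negative offset, so they receive essentially zero probability after the softmax without the computation ever leaving the log domain.

import numpy as np

def log_softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    return x - np.log(np.exp(x).sum(axis=axis, keepdims=True))

input_mask = np.array([1, 1, 1, 0], dtype=np.float32)  # last position is padding
logits = np.array([2.0, 0.5, -1.0, 3.0], dtype=np.float32)

adder = (1.0 - input_mask) * -10000.0  # 0 for real tokens, -10000 for padding
probs = np.exp(log_softmax(logits + adder))
print(probs)  # probability mass on the padded position is effectively zero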
Example #22
def get_masked_lm_output(bert_config, input_tensor, output_weights,
                         output_type_weights, positions, label_ids,
                         masked_type_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)
    with tf.variable_scope("transform"):
        input_tensor = tf.layers.dense(
            input_tensor,
            units=bert_config.hidden_size,
            activation=modeling.get_activation(bert_config.hidden_act),
            kernel_initializer=modeling.create_initializer(
                bert_config.initializer_range))
        input_tensor = modeling.layer_norm(input_tensor)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.

        output_bias_type = tf.get_variable("output_bias_type",
                                           shape=[bert_config.vocab_type_size],
                                           initializer=tf.zeros_initializer())
        logits_type = tf.matmul(input_tensor,
                                output_type_weights,
                                transpose_b=True)
        logits_type = tf.nn.bias_add(logits_type, output_bias_type)
        log_probs_type = tf.nn.log_softmax(logits_type, axis=-1)

        type_label_ids = tf.reshape(masked_type_ids, [-1])
        type_label_weights = tf.reshape(label_weights, [-1])

        type_pre = tf.reshape(tf.argmax(log_probs_type, -1), [-1, 1])

        one_hot_type_labels = tf.one_hot(type_label_ids,
                                         depth=bert_config.vocab_type_size,
                                         dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        type_per_example_loss = -tf.reduce_sum(
            log_probs_type * one_hot_type_labels, axis=[-1])
        type_numerator = tf.reduce_sum(type_label_weights *
                                       type_per_example_loss)
        type_denominator = tf.reduce_sum(type_label_weights) + 1e-5
        type_loss = type_numerator / type_denominator

    (type_pre_embedding_output, _) = modeling.embedding_lookup(
        input_ids=type_pre,
        vocab_size=bert_config.vocab_type_size,
        embedding_size=bert_config.hidden_size,
        initializer_range=bert_config.initializer_range,
        word_embedding_name="type_word_embeddings",
        use_one_hot_embeddings=FLAGS.use_tpu,
        scope="bert/embeddings",
        reuse=True)

    with tf.variable_scope("cls/predictions/addtype"):
        # input_tensor = input_tensor + type_pre_embedding_output
        concat_input_tensor = tf.layers.dense(
            tf.concat([input_tensor,
                       tf.squeeze(type_pre_embedding_output)], -1),
            units=bert_config.hidden_size,
            activation=modeling.get_activation(bert_config.hidden_act),
            kernel_initializer=modeling.create_initializer(
                bert_config.initializer_range))
        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(concat_input_tensor,
                           output_weights,
                           transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, type_loss, per_example_loss, log_probs)
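The final loss above follows the same masked-average pattern as the other masked-LM examples; a tiny NumPy sketch (illustrative values only) shows how zero-weight padding slots drop out of the average and why the 1e-5 term is added to the denominator.

import numpy as np

vocab_size = 6
log_probs = np.log(np.full((3, vocab_size), 1.0 / vocab_size))  # uniform predictions for 3 slots
label_ids = np.array([2, 4, 0])
label_weights = np.array([1.0, 1.0, 0.0])  # third slot is zero-padded, so its weight is 0.0

one_hot_labels = np.eye(vocab_size)[label_ids]
per_example_loss = -(log_probs * one_hot_labels).sum(axis=-1)

numerator = (label_weights * per_example_loss).sum()
denominator = label_weights.sum() + 1e-5  # guards against dividing by zero when nothing is masked
loss = numerator / denominator
print(loss)  # mean loss over the two real predictions only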