Пример #1
0
    def __init__(self, config, input_embedding, input_mask=None):
        input_shape = modeling.get_shape_list(input_embedding, expected_rank=3)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length],
                                 dtype=tf.int32)

        # Keep variable names the same as BERT
        with tf.variable_scope("bert"):
            with tf.variable_scope("encoder"):
                attention_mask = modeling.create_attention_mask_from_input_mask(
                    input_embedding, input_mask)

                all_encoder_layers = modeling.transformer_model(
                    input_tensor=input_embedding,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.
                    attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

                self.sequence_output = all_encoder_layers[-1]
Пример #2
0
def get_dense_mask(needle_pos, segment_idx, dense_shape):
    # segment_idx is basically row idx.
    from_tensor = tf.ones(dense_shape)
    indices = tf.to_int64(tf.stack([segment_idx, needle_pos], axis=1))
    values = tf.fill(tf.shape(needle_pos), True)
    sp_tensor = tf.SparseTensor(indices, values, tf.to_int64(dense_shape))
    to_mask = sparse_tensor_to_dense(sp_tensor, False)
    return modeling.create_attention_mask_from_input_mask(from_tensor, to_mask)
Пример #3
0
    def build_bert_model(self, input_ids, input_mask, token_type_ids):
        with tf.variable_scope('bert'):
            with tf.variable_scope("embeddings"):
                # Perform embedding lookup on the word ids.
                (embedding_output, _) = modeling.embedding_lookup(
                    input_ids=input_ids,
                    vocab_size=self.bert_config.vocab_size,
                    embedding_size=self.bert_config.hidden_size,
                    initializer_range=self.bert_config.initializer_range,
                    word_embedding_name="word_embeddings",
                    use_one_hot_embeddings=False)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                embedding_output = modeling.embedding_postprocessor(
                    input_tensor=embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=self.bert_config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=self.bert_config.initializer_range,
                    max_position_embeddings=self.bert_config.
                    max_position_embeddings,
                    dropout_prob=self.bert_config.hidden_dropout_prob)

            with tf.variable_scope("encoder"):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = modeling.create_attention_mask_from_input_mask(
                    input_ids, input_mask)

                # Run the stacked transformer, only fetching the final lyaer
                # `final_layer` shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = modeling.transformer_model(
                    input_tensor=embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=self.bert_config.hidden_size,
                    num_hidden_layers=self.bert_config.num_hidden_layers,
                    num_attention_heads=self.bert_config.num_attention_heads,
                    intermediate_size=self.bert_config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                      self.bert_config.hidden_act),
                    hidden_dropout_prob=self.bert_config.hidden_dropout_prob,
                    attention_probs_dropout_prob=\
                      self.bert_config.attention_probs_dropout_prob,
                    initializer_range=self.bert_config.initializer_range,
                    do_return_all_layers=True
                )

            self.sequence_output = self.all_encoder_layers[-1]
Пример #4
0
    def __init__(self,
                 config,
                 is_training,
                 input_ids,
                 image_embeddings,
                 input_mask=None,
                 token_type_ids=None,
                 use_one_hot_embeddings=False,
                 scope=None):
        """Constructor for a visually grounded BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      image_embeddings: float32 Tensor of shape [batch_size, seq_length, depth].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings.
      scope: (optional) variable scope. Defaults to "bert".
    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
        config = copy.deepcopy(config)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        text_input_shape = modeling.get_shape_list(input_ids, expected_rank=2)
        batch_size = text_input_shape[0]
        text_seq_length = text_input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, text_seq_length],
                                 dtype=tf.int32)

        if token_type_ids is None:
            token_type_ids = tf.zeros(shape=[batch_size, text_seq_length],
                                      dtype=tf.int32)

        with tf.variable_scope(scope, default_name="bert"):
            with tf.variable_scope("embeddings"):
                # Perform embedding lookup on the word ids.
                (self.embedding_output,
                 self.embedding_table) = modeling.embedding_lookup(
                     input_ids=input_ids,
                     vocab_size=config.vocab_size,
                     embedding_size=config.hidden_size,
                     initializer_range=config.initializer_range,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=use_one_hot_embeddings)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = modeling.embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

                # Add image embeddings the rest of the input embeddings.
                self.embedding_output += tf.layers.dense(
                    image_embeddings,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))

            with tf.variable_scope("encoder"):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = modeling.create_attention_mask_from_input_mask(
                    self.embedding_output, input_mask)

                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = modeling.transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.
                    attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

            self.sequence_output = self.all_encoder_layers[-1]
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.variable_scope("pooler"):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained
                first_token_tensor = tf.squeeze(self.sequence_output[:,
                                                                     0:1, :],
                                                axis=1)
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))
def main(args):
    checkpoint_path = os.path.join(args.model_dir, "bert_model.ckpt")

    bert_config = BertConfig.from_json_file(
        os.path.join(args.model_dir, "bert_config.json"))
    bert_config.hidden_dropout_prob = 0.0
    bert_config.attention_probs_dropout_prob = 0.0

    batch_size = args.batch_size
    max_seq_len = args.max_seq_length
    tf_dtype = tf.float16 if args.precision == 'fp16' else tf.float32

    if args.effective_mode:
        # load transformer weights *before* building the computation graph
        weights_value = load_transformer_weights(checkpoint_path, bert_config,
                                                 batch_size, max_seq_len,
                                                 tf_dtype)

    # build model
    input_ids_placeholder = tf.placeholder(shape=[batch_size, max_seq_len],
                                           dtype=tf.int32,
                                           name="input_ids")
    input_mask_placeholder = tf.placeholder(shape=[batch_size, max_seq_len],
                                            dtype=tf.int32,
                                            name="input_mask")
    attention_mask_placeholder = tf.placeholder(
        shape=[batch_size, max_seq_len, max_seq_len],
        dtype=tf_dtype,
        name="attention_mask")
    input_embedding_placeholder = tf.placeholder(
        shape=[batch_size, max_seq_len, bert_config.hidden_size],
        dtype=tf_dtype,
        name="input_embedding")
    embedding_table_placeholder = tf.placeholder(
        shape=[bert_config.vocab_size, bert_config.hidden_size],
        dtype=tf_dtype,
        name="embedding_table")
    transformer_output_placeholder = tf.placeholder(
        shape=[batch_size, max_seq_len, bert_config.hidden_size],
        dtype=tf_dtype,
        name="transformer_output")

    embedding_layer = EmbeddingLayer(bert_config, input_ids_placeholder)
    if args.effective_mode:
        effective_transformer_layer = EffectiveTransformerLayer(
            batch_size, max_seq_len, bert_config, attention_mask_placeholder,
            input_mask_placeholder, input_embedding_placeholder, weights_value)
    else:
        standard_transformer_layer = TransformerLayer(
            bert_config, input_embedding_placeholder, input_mask_placeholder)
    output_layer = LanguageModelOutputLayer(bert_config,
                                            transformer_output_placeholder,
                                            embedding_table_placeholder)

    # model saver
    variables_to_restore = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    saver = tf.train.Saver(variables_to_restore)

    config = tf.ConfigProto()
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    with tf.Session(config=config) as sess:
        # restore embedding layer and output layer
        saver.restore(sess, checkpoint_path)

        # process input data
        tokenizer = FullTokenizer(
            vocab_file=os.path.join(args.model_dir, 'vocab.txt'))
        input_ids, input_mask, input_text, to_predict = process_data(
            batch_size, max_seq_len, tokenizer)
        input_ids_tensor = tf.convert_to_tensor(input_ids, dtype=tf.int32)
        input_mask_tensor = tf.convert_to_tensor(input_mask, dtype=tf.int32)

        # predict
        begin = datetime.now()
        input_embedding, embedding_table = sess.run(
            [
                embedding_layer.get_embedding_output(),
                embedding_layer.get_embedding_table()
            ],
            feed_dict={input_ids_placeholder: input_ids})
        attention_mask = sess.run(
            create_attention_mask_from_input_mask(input_ids_tensor,
                                                  input_mask_tensor))
        if args.effective_mode:
            transformer_output = sess.run(
                effective_transformer_layer.get_transformer_output(),
                feed_dict={
                    input_embedding_placeholder: input_embedding,
                    attention_mask_placeholder: attention_mask,
                    input_mask_placeholder: input_mask
                })
        else:
            transformer_output = sess.run(
                standard_transformer_layer.get_transformer_output(),
                feed_dict={
                    input_embedding_placeholder: input_embedding,
                    attention_mask_placeholder: attention_mask,
                    input_mask_placeholder: input_mask
                })
        probs = sess.run(output_layer.get_predict_probs(),
                         feed_dict={
                             transformer_output_placeholder:
                             transformer_output,
                             embedding_table_placeholder: embedding_table
                         })
        end = datetime.now()
        print("time cost: ", (end - begin).total_seconds(), "s")

        # choose top k answers
        k = 5
        top_ids = np.argsort(-probs, axis=2)[:, :, :k]

        batch_results = []
        for sid, blank_ids in enumerate(to_predict):
            sentence_results = []
            for cid in blank_ids:
                result = []
                for idx in top_ids[sid][cid]:
                    token = tokenizer.convert_ids_to_tokens([idx])[0]
                    result.append((token, probs[sid][cid][idx]))
                sentence_results.append(result)
            batch_results.append(sentence_results)

    for text, blank_ids, sentence_results in zip(input_text, to_predict,
                                                 batch_results):
        print("Q:", text)
        for cid, result in zip(blank_ids, sentence_results):
            print("A:", result)
Пример #6
0
    def __init__(self, config, is_training, input_tensor, input_mask,
                 token_type_ids):
        """Constructor for BertFlexEmbeddingModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_tensor: float32 Tensor of shape [batch_size, seq_length,
        hidden_size].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
        config = copy.deepcopy(config)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        with tf.variable_scope("bert", reuse=tf.compat.v1.AUTO_REUSE):
            with tf.variable_scope("embeddings"):
                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = modeling.embedding_postprocessor(
                    input_tensor=input_tensor,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

            with tf.variable_scope("encoder"):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = modeling.create_attention_mask_from_input_mask(
                    input_tensor, input_mask)

                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = modeling.transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.
                    attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

            self.sequence_output = self.all_encoder_layers[-1]
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.variable_scope("pooler"):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained
                first_token_tensor = tf.squeeze(self.sequence_output[:,
                                                                     0:1, :],
                                                axis=1)
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))
Пример #7
0
    def __init__(self,
                 config,
                 features,
                 _rank_size,
                 trainable=True,
                 scope="train",
                 batch_size=None,
                 training=True):
        hidden_size = 128

        list_vec = features['features']  # [B, seq_len, dense_dim]
        reshape_list_vec = tf.reshape(list_vec, [-1, 700])

        seq_len = tf.string_to_number(features['seq_len'], out_type=tf.int32)
        seq_mask = tf.sequence_mask(seq_len,
                                    utils.seq_max_len(config, 'features'))
        attention_mask = modeling.create_attention_mask_from_input_mask(
            list_vec, seq_mask)

        def build_net(_reshape_list_vec, need_pos=True):
            pos_embedding = tf.get_variable(
                shape=[_rank_size, hidden_size],
                dtype=tf.float32,
                initializer=tf.initializers.truncated_normal(mean=0.0,
                                                             stddev=0.01),
                trainable=True,
                name="pos_embedding")

            hidden_reshape_list_vec = tf.layers.dense(reshape_list_vec,
                                                      hidden_size)

            hidden_list_vec = tf.reshape(hidden_reshape_list_vec,
                                         [-1, 20, hidden_size])
            if need_pos:
                hidden_list_vec = hidden_list_vec + pos_embedding

            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            outputs = modeling.transformer_model(
                input_tensor=hidden_list_vec,
                hidden_size=hidden_size,
                num_hidden_layers=2,
                num_attention_heads=1,
                intermediate_size=hidden_size * 4,
                do_return_all_layers=False)

            outputs = tf.squeeze(tf.layers.dense(outputs, 1, activation=None),
                                 axis=-1)

            return outputs

        with tf.variable_scope('ctr'):
            ctr_outputs = build_net(reshape_list_vec, need_pos=False)

        with tf.variable_scope('pbr'):
            pbr_outputs = build_net(reshape_list_vec, need_pos=True)

        self._ctr_prediction = tf.nn.sigmoid(ctr_outputs)
        self._pbr_prediction = tf.nn.sigmoid(pbr_outputs)

        if training:
            with tf.name_scope('auc'):
                self._auc = {}

                if 'clicks' in features:
                    label = tf.string_to_number(features['clicks'],
                                                out_type=tf.float32)
                    bounce = tf.string_to_number(features['bounces'],
                                                 out_type=tf.float32)
                    seq_len = tf.string_to_number(features['seq_len'],
                                                  out_type=tf.int32)
                    weights = tf.sequence_mask(seq_len,
                                               utils.seq_max_len(
                                                   config, 'features'),
                                               dtype=tf.int32)
                    self._auc['ctr'] = tf.metrics.auc(label,
                                                      self._ctr_prediction,
                                                      weights=weights)
                    self._auc['pbr'] = tf.metrics.auc(bounce,
                                                      self._pbr_prediction,
                                                      weights=weights)

            with tf.name_scope('loss'):
                clicks = tf.string_to_number(features['clicks'],
                                             out_type=tf.float32)
                bounce = tf.string_to_number(features['bounces'],
                                             out_type=tf.float32)
                seq_len = tf.string_to_number(features['seq_len'],
                                              out_type=tf.int32)
                weights = tf.sequence_mask(seq_len,
                                           utils.seq_max_len(
                                               config, 'features'),
                                           dtype=tf.int32)
                ctr_loss = tf.losses.log_loss(clicks, self._ctr_prediction,
                                              weights)
                pbr_loss = tf.losses.log_loss(bounce, self._pbr_prediction,
                                              weights)
                self._loss = ctr_loss + pbr_loss
Пример #8
0
  def build_attn_layer(self,
                       input_tensor,
                       attn_mask_concat,
                       layer_attn_type,
                       num_attention_heads=1,
                       size_per_head=512,
                       attention_probs_dropout_prob=0.1,
                       initializer_range=0.02,
                       do_return_2d_tensor=False):
    # TODO (May 5): To capture each softmax output, will need a modified
    #  `attention_layer`
    input_tensor_shape = modeling.get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_tensor_shape[0]
    total_seq_length = input_tensor_shape[1]
    arg_seq_length = int(total_seq_length / 2)

    attention_head = None
    if layer_attn_type == "self":
      attn_mask = modeling.create_attention_mask_from_input_mask(
        input_tensor, attn_mask_concat)
      attention_head = modeling.attention_layer(
        from_tensor=input_tensor,
        to_tensor=input_tensor,
        attention_mask=attn_mask,
        num_attention_heads=num_attention_heads,
        size_per_head=size_per_head,
        attention_probs_dropout_prob=attention_probs_dropout_prob,
        initializer_range=initializer_range,
        do_return_2d_tensor=do_return_2d_tensor,
        batch_size=batch_size,
        from_seq_length=total_seq_length,
        to_seq_length=total_seq_length
      )

    else:
      arg1 = input_tensor[:, :arg_seq_length, :]
      arg2 = input_tensor[:, arg_seq_length:, :]

      arg1_attn_mask = attn_mask_concat[:, :arg_seq_length]
      arg1_attn_mask = modeling.create_attention_mask_from_input_mask(
        arg1, arg1_attn_mask)

      arg2_attn_mask = attn_mask_concat[:, arg_seq_length:]
      arg2_attn_mask = modeling.create_attention_mask_from_input_mask(
        arg2, arg2_attn_mask)

      if layer_attn_type == "inter":
        with tf.variable_scope("arg1_arg2"):
          arg1_to_arg2 = modeling.attention_layer(
            from_tensor=arg1,
            to_tensor=arg2,
            attention_mask=arg2_attn_mask,
            num_attention_heads=num_attention_heads,
            size_per_head=size_per_head,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            initializer_range=initializer_range,
            do_return_2d_tensor=do_return_2d_tensor,
            batch_size=batch_size,
            from_seq_length=arg_seq_length,
            to_seq_length=arg_seq_length
          )

        with tf.variable_scope("arg2_arg1"):
          arg2_to_arg1 = modeling.attention_layer(
            from_tensor=arg2,
            to_tensor=arg1,
            attention_mask=arg1_attn_mask,
            num_attention_heads=num_attention_heads,
            size_per_head=size_per_head,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            initializer_range=initializer_range,
            do_return_2d_tensor=do_return_2d_tensor,
            batch_size=batch_size,
            from_seq_length=arg_seq_length,
            to_seq_length=arg_seq_length
          )

        attention_head = tf.concat([arg1_to_arg2, arg2_to_arg1], axis=1)
      else:
        with tf.variable_scope("arg1_arg1"):
          arg1_to_arg1 = modeling.attention_layer(
            from_tensor=arg1,
            to_tensor=arg1,
            attention_mask=arg1_attn_mask,
            num_attention_heads=num_attention_heads,
            size_per_head=size_per_head,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            initializer_range=initializer_range,
            do_return_2d_tensor=do_return_2d_tensor,
            batch_size=batch_size,
            from_seq_length=arg_seq_length,
            to_seq_length=arg_seq_length
          )

        with tf.variable_scope("arg2_arg2"):
          arg2_to_arg2 = modeling.attention_layer(
            from_tensor=arg2,
            to_tensor=arg2,
            attention_mask=arg2_attn_mask,
            num_attention_heads=num_attention_heads,
            size_per_head=size_per_head,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            initializer_range=initializer_range,
            do_return_2d_tensor=do_return_2d_tensor,
            batch_size=batch_size,
            from_seq_length=arg_seq_length,
            to_seq_length=arg_seq_length
          )

        attention_head = tf.concat([arg1_to_arg1, arg2_to_arg2], axis=1)

    return attention_head
Пример #9
0
def create_mask_model(bert_config, is_training, input_ids, input_mask,
                      segment_ids, mask_positions, use_one_hot_embeddings):
    """Creates a classification model."""

    #print("create mask model ----------------------------------------------")
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)

    # Get the logits for the start and end predictions.
    final_hidden = model.get_sequence_output()

    final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
    batch_size = final_hidden_shape[0]
    seq_length = final_hidden_shape[1]
    hidden_size = final_hidden_shape[2]

    output_weights = tf.get_variable(
        "cls/nq/output_weights", [2, hidden_size + 12],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable("cls/nq/output_bias", [2],
                                  initializer=tf.zeros_initializer())

    final_hidden_matrix = tf.reshape(final_hidden,
                                     [batch_size * seq_length, hidden_size])
    mask_positions_matrix = tf.cast(tf.reshape(mask_positions,
                                               [batch_size * seq_length, 1]),
                                    dtype=tf.float32)
    padding = tf.zeros([batch_size * seq_length, 11], dtype=tf.float32)
    mask_positions_matrix = tf.concat([mask_positions_matrix, padding],
                                      axis=-1)
    final_hidden_matrix = tf.concat(
        [final_hidden_matrix, mask_positions_matrix], axis=-1)
    final_hidden_matrix = tf.reshape(
        final_hidden_matrix, [batch_size, seq_length, hidden_size + 12])
    attention_mask = modeling.create_attention_mask_from_input_mask(
        input_ids, input_mask)
    config = bert_config
    all_encoder_layers = modeling.transformer_model(
        input_tensor=final_hidden_matrix,
        attention_mask=attention_mask,
        hidden_size=config.hidden_size + 12,  # input hidden size
        num_hidden_layers=1,  #config.num_hidden_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(config.hidden_act),
        hidden_dropout_prob=config.hidden_dropout_prob,
        attention_probs_dropout_prob=config.attention_probs_dropout_prob,
        initializer_range=config.initializer_range,
        do_return_all_layers=True)
    #print(all_encoder_layers.shape)
    transformer_output_matrix = all_encoder_layers[-1]

    transformer_output_matrix = tf.reshape(
        transformer_output_matrix, [batch_size * seq_length, hidden_size + 12])
    logits = tf.matmul(transformer_output_matrix,
                       output_weights,
                       transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    logits = tf.reshape(logits, [batch_size, seq_length, 2])
    logits = tf.transpose(logits, [2, 0, 1])

    unstacked_logits = tf.unstack(logits, axis=0)

    (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1])

    # Get the logits for the answer type prediction.
    answer_type_output_layer = model.get_pooled_output()
    answer_type_hidden_size = answer_type_output_layer.shape[-1].value

    num_answer_types = 5  # YES, NO, UNKNOWN, SHORT, LONG
    answer_type_output_weights = tf.get_variable(
        "answer_type_output_weights",
        [num_answer_types, answer_type_hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    answer_type_output_bias = tf.get_variable(
        "answer_type_output_bias", [num_answer_types],
        initializer=tf.zeros_initializer())

    answer_type_logits = tf.matmul(answer_type_output_layer,
                                   answer_type_output_weights,
                                   transpose_b=True)
    answer_type_logits = tf.nn.bias_add(answer_type_logits,
                                        answer_type_output_bias)

    return (start_logits, end_logits, answer_type_logits)
Пример #10
0
    def __init__(self,
                 config,
                 use_one_hot_embeddings=True,
                 num_labels=2,
                 max_seq_length=128):
        """Constructor for BertModel.

        Args:
          config: `BertConfig` instance.
          is_training: bool. true for training model, false for eval model. Controls
            whether dropout will be applied.
          input_ids: int32 Tensor of shape [batch_size, seq_length].
          input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
          token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
          use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
            embeddings or tf.embedding_lookup() for the word embeddings. On the TPU,
            it is much faster if this is True, on the CPU or GPU, it is faster if
            this is False.
          scope: (optional) variable scope. Defaults to "bert".

        Raises:
          ValueError: The config is invalid or one of the input tensor shapes
            is invalid.
        """
        self.input_ids = tf.placeholder(dtype=tf.int32,
                                        shape=(None, max_seq_length))
        self.input_mask = tf.placeholder(dtype=tf.int8,
                                         shape=(None, max_seq_length))

        config = copy.deepcopy(config)

        input_shape = modeling.get_shape_list(self.input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        token_type_ids = tf.zeros(shape=[batch_size, seq_length],
                                  dtype=tf.int32)

        with tf.variable_scope("bert", reuse=tf.AUTO_REUSE):
            with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
                # Perform embedding lookup on the word ids.
                (self.embedding_output,
                 self.embedding_table) = modeling.embedding_lookup(
                     input_ids=self.input_ids,
                     vocab_size=config.vocab_size,
                     embedding_size=config.hidden_size,
                     initializer_range=config.initializer_range,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=use_one_hot_embeddings)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = modeling.embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

            with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = modeling.create_attention_mask_from_input_mask(
                    self.input_ids, self.input_mask)

                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = modeling.transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.
                    attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

            self.sequence_output = self.all_encoder_layers[-1]
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.variable_scope("pooler", reuse=tf.AUTO_REUSE):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained
                first_token_tensor = tf.squeeze(self.sequence_output[:,
                                                                     0:1, :],
                                                axis=1)
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))

        # define output_weights and output_bias
        hidden_size = self.pooled_output.shape[-1].value
        with tf.variable_scope("", reuse=tf.AUTO_REUSE):
            self.output_weights = tf.get_variable(
                "output_weights", [num_labels, hidden_size],
                initializer=tf.truncated_normal_initializer(stddev=0.02))
            self.output_bias = tf.get_variable(
                "output_bias", [num_labels],
                initializer=tf.zeros_initializer())
Пример #11
0
def create_bilstm_classification_model(bert_config,
                                       is_training,
                                       response_input_ids,
                                       response_input_mask,
                                       response_segment_ids,
                                       response_text_len,
                                       response_labels,
                                       random_forward_input_ids,
                                       random_forward_input_mask,
                                       random_forward_segment_ids,
                                       random_forward_text_len,
                                       random_backward_input_ids,
                                       random_backward_input_mask,
                                       random_backward_segment_ids,
                                       random_backward_text_len,
                                       random_labels,
                                       swap_forward_input_ids,
                                       swap_forward_input_mask,
                                       swap_forward_segment_ids,
                                       swap_forward_text_len,
                                       swap_backward_input_ids,
                                       swap_backward_input_mask,
                                       swap_backward_segment_ids,
                                       swap_backward_text_len,
                                       swap_labels,
                                       nli_forward_input_ids,
                                       nli_forward_input_mask,
                                       nli_forward_segment_ids,
                                       nli_forward_text_len,
                                       nli_backward_input_ids,
                                       nli_backward_input_mask,
                                       nli_backward_segment_ids,
                                       nli_backward_text_len,
                                       nli_labels,
                                       num_nli_labels,
                                       use_one_hot_embeddings,
                                       l2_reg_lambda=0.1,
                                       dropout_rate=1.0,
                                       lstm_size=None,
                                       num_layers=1):

    config = copy.deepcopy(bert_config)

    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    with tf.variable_scope("bert", reuse=tf.AUTO_REUSE):

        with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
            (response_embedding_output,
             response_embedding_table) = modeling.embedding_lookup(
                 input_ids=response_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            response_embedding_output = modeling.embedding_postprocessor(
                input_tensor=response_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=response_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # random detection
            # Perform embedding lookup on the word ids.
            (random_foward_embedding_output,
             random_forward_embedding_table) = modeling.embedding_lookup(
                 input_ids=random_forward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            # Perform embedding lookup on the word ids.
            (random_backward_embedding_output,
             random_backward_embedding_table) = modeling.embedding_lookup(
                 input_ids=random_backward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            random_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=random_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=random_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            random_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=random_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=random_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # swap detection
            (swap_foward_embedding_output,
             swap_forward_embedding_table) = modeling.embedding_lookup(
                 input_ids=swap_forward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            (swap_backward_embedding_output,
             swap_backward_embedding_table) = modeling.embedding_lookup(
                 input_ids=swap_backward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)
            swap_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=swap_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=swap_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)
            swap_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=swap_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=swap_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # # generic detection
            # (generic_foward_embedding_output, generic_forward_embedding_table) = modeling.embedding_lookup(
            #     input_ids=generic_forward_input_ids,
            #     vocab_size=config.vocab_size,
            #     embedding_size=config.hidden_size,
            #     initializer_range=config.initializer_range,
            #     word_embedding_name="word_embeddings",
            #     use_one_hot_embeddings=use_one_hot_embeddings)
            # (generic_backward_embedding_output, generic_backward_embedding_table) = modeling.embedding_lookup(
            #     input_ids=generic_backward_input_ids,
            #     vocab_size=config.vocab_size,
            #     embedding_size=config.hidden_size,
            #     initializer_range=config.initializer_range,
            #     word_embedding_name="word_embeddings",
            #     use_one_hot_embeddings=use_one_hot_embeddings)
            # generic_foward_embedding_output = modeling.embedding_postprocessor(
            #     input_tensor=generic_foward_embedding_output,
            #     use_token_type=not config.roberta,
            #     token_type_ids=generic_forward_segment_ids,
            #     token_type_vocab_size=config.type_vocab_size,
            #     token_type_embedding_name="token_type_embeddings",
            #     use_position_embeddings=True,
            #     position_embedding_name="position_embeddings",
            #     initializer_range=config.initializer_range,
            #     max_position_embeddings=config.max_position_embeddings,
            #     dropout_prob=config.hidden_dropout_prob,
            #     roberta=config.roberta)
            # generic_backward_embedding_output = modeling.embedding_postprocessor(
            #     input_tensor=generic_backward_embedding_output,
            #     use_token_type=not config.roberta,
            #     token_type_ids=generic_backward_segment_ids,
            #     token_type_vocab_size=config.type_vocab_size,
            #     token_type_embedding_name="token_type_embeddings",
            #     use_position_embeddings=True,
            #     position_embedding_name="position_embeddings",
            #     initializer_range=config.initializer_range,
            #     max_position_embeddings=config.max_position_embeddings,
            #     dropout_prob=config.hidden_dropout_prob,
            #     roberta=config.roberta)

            # nli detection
            (nli_foward_embedding_output,
             nli_forward_embedding_table) = modeling.embedding_lookup(
                 input_ids=nli_forward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)
            (nli_backward_embedding_output,
             nli_backward_embedding_table) = modeling.embedding_lookup(
                 input_ids=nli_backward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)
            nli_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=nli_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=nli_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)
            nli_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=nli_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=nli_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            response_attention_mask = modeling.create_attention_mask_from_input_mask(
                response_input_ids, response_input_mask)
            # [batch_size, from_seq_length, to_seq_length]
            # mask future tokens
            diag_vals = tf.ones_like(response_attention_mask[0, :, :])
            tril = tf.linalg.LinearOperatorLowerTriangular(
                diag_vals).to_dense()
            future_masks = tf.tile(tf.expand_dims(
                tril, 0), [tf.shape(response_attention_mask)[0], 1, 1])
            response_attention_mask = tf.math.multiply(response_attention_mask,
                                                       future_masks)
            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            response_all_encoder_layers = modeling.transformer_model(
                input_tensor=response_embedding_output,
                attention_mask=response_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.
                attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # random detection
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            random_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                random_forward_input_ids, random_forward_input_mask)
            random_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                random_backward_input_ids, random_backward_input_mask)
            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            random_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=random_foward_embedding_output,
                attention_mask=random_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.
                attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            random_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=random_backward_embedding_output,
                attention_mask=random_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.
                attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # swap detection
            swap_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                swap_forward_input_ids, swap_forward_input_mask)
            swap_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                swap_backward_input_ids, swap_backward_input_mask)
            swap_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=swap_foward_embedding_output,
                attention_mask=swap_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.
                attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            swap_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=swap_backward_embedding_output,
                attention_mask=swap_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.
                attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # # generic detection
            # generic_forward_attention_mask = modeling.create_attention_mask_from_input_mask(generic_forward_input_ids,
            #                                                                                 generic_forward_input_mask)
            # generic_backward_attention_mask = modeling.create_attention_mask_from_input_mask(generic_backward_input_ids,
            #                                                                                  generic_backward_input_mask)
            # generic_forward_all_encoder_layers = modeling.transformer_model(
            #     input_tensor=generic_foward_embedding_output,
            #     attention_mask=generic_forward_attention_mask,
            #     hidden_size=config.hidden_size,
            #     num_hidden_layers=config.num_hidden_layers,
            #     num_attention_heads=config.num_attention_heads,
            #     intermediate_size=config.intermediate_size,
            #     intermediate_act_fn=modeling.get_activation(config.hidden_act),
            #     hidden_dropout_prob=config.hidden_dropout_prob,
            #     attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            #     initializer_range=config.initializer_range,
            #     do_return_all_layers=True)
            # generic_backward_all_encoder_layers = modeling.transformer_model(
            #     input_tensor=generic_backward_embedding_output,
            #     attention_mask=generic_backward_attention_mask,
            #     hidden_size=config.hidden_size,
            #     num_hidden_layers=config.num_hidden_layers,
            #     num_attention_heads=config.num_attention_heads,
            #     intermediate_size=config.intermediate_size,
            #     intermediate_act_fn=modeling.get_activation(config.hidden_act),
            #     hidden_dropout_prob=config.hidden_dropout_prob,
            #     attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            #     initializer_range=config.initializer_range,
            #     do_return_all_layers=True)

            # nli detection
            nli_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                nli_forward_input_ids, nli_forward_input_mask)
            nli_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                nli_backward_input_ids, nli_backward_input_mask)
            nli_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=nli_foward_embedding_output,
                attention_mask=nli_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.
                attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            nli_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=nli_backward_embedding_output,
                attention_mask=nli_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.
                attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

        random_forward_embedding = random_forward_all_encoder_layers[-2]
        random_backward_embedding = random_backward_all_encoder_layers[-2]
        swap_forward_embedding = swap_forward_all_encoder_layers[-2]
        swap_backward_embedding = swap_backward_all_encoder_layers[-2]
        # generic_forward_embedding = generic_forward_all_encoder_layers[-2]
        # generic_backward_embedding = generic_backward_all_encoder_layers[-2]
        nli_forward_embedding = nli_forward_all_encoder_layers[-2]
        nli_backward_embedding = nli_backward_all_encoder_layers[-2]
        response_embedding = response_all_encoder_layers[-2]

    response_embedding_shape = modeling.get_shape_list(response_embedding,
                                                       expected_rank=3)
    with tf.variable_scope("lm_head", reuse=tf.AUTO_REUSE):

        response_logits = tf.layers.dense(response_embedding,
                                          config.hidden_size,
                                          activation=None)
        response_logits = modeling.gelu(response_logits)
        response_logits = modeling.layer_norm(response_logits)
        response_outputs = tf.layers.dense(
            response_logits,
            config.vocab_size,
            activation=None,
            use_bias=True,
            bias_initializer=tf.zeros_initializer())

        response_one_hot = tf.one_hot(response_labels,
                                      depth=config.vocab_size,
                                      dtype=tf.float32)

        lm_cost = tf.nn.softmax_cross_entropy_with_logits(
            labels=response_one_hot, logits=response_outputs)

        sequence_mask = tf.sequence_mask(response_text_len,
                                         maxlen=response_embedding_shape[1],
                                         dtype=tf.float32)

        masked_lm_cost = tf.math.multiply(lm_cost, sequence_mask)

        final_lm_loss = tf.reduce_mean(
            tf.math.divide(tf.reduce_sum(masked_lm_cost, axis=1),
                           tf.cast(response_text_len, dtype=tf.float32)))

        perplexity = tf.exp(
            tf.math.divide(tf.reduce_sum(masked_lm_cost, axis=1),
                           tf.cast(response_text_len, dtype=tf.float32)))

    random_forward_embedding_shape = modeling.get_shape_list(
        random_forward_embedding, expected_rank=3)
    random_backward_embedding_shape = modeling.get_shape_list(
        random_backward_embedding, expected_rank=3)
    assert random_forward_embedding_shape[
        2] == random_backward_embedding_shape[2]
    random_forward_embedding = tf.transpose(random_forward_embedding,
                                            [1, 0, 2])
    random_backward_embedding = tf.transpose(random_backward_embedding,
                                             [1, 0, 2])
    random_forward_input_mask = tf.cast(
        tf.transpose(random_forward_input_mask, [1, 0]), tf.float32)
    random_backward_input_mask = tf.cast(
        tf.transpose(random_backward_input_mask, [1, 0]), tf.float32)

    swap_forward_embedding_shape = modeling.get_shape_list(
        swap_forward_embedding, expected_rank=3)
    swap_backward_embedding_shape = modeling.get_shape_list(
        swap_backward_embedding, expected_rank=3)
    assert swap_forward_embedding_shape[2] == swap_backward_embedding_shape[2]
    swap_forward_embedding = tf.transpose(swap_forward_embedding, [1, 0, 2])
    swap_backward_embedding = tf.transpose(swap_backward_embedding, [1, 0, 2])
    swap_forward_input_mask = tf.cast(
        tf.transpose(swap_forward_input_mask, [1, 0]), tf.float32)
    swap_backward_input_mask = tf.cast(
        tf.transpose(swap_backward_input_mask, [1, 0]), tf.float32)

    # generic_forward_embedding_shape = modeling.get_shape_list(generic_forward_embedding, expected_rank=3)
    # generic_backward_embedding_shape = modeling.get_shape_list(generic_backward_embedding, expected_rank=3)
    # assert generic_forward_embedding_shape[2] == generic_backward_embedding_shape[2]
    # generic_forward_embedding = tf.transpose(generic_forward_embedding, [1, 0, 2])
    # generic_backward_embedding = tf.transpose(generic_backward_embedding, [1, 0, 2])
    # generic_forward_input_mask = tf.cast(tf.transpose(generic_forward_input_mask, [1, 0]), tf.float32)
    # generic_backward_input_mask = tf.cast(tf.transpose(generic_backward_input_mask, [1, 0]), tf.float32)

    nli_forward_embedding_shape = modeling.get_shape_list(
        nli_forward_embedding, expected_rank=3)
    nli_backward_embedding_shape = modeling.get_shape_list(
        nli_backward_embedding, expected_rank=3)
    assert nli_forward_embedding_shape[2] == nli_backward_embedding_shape[2]
    nli_forward_embedding = tf.transpose(nli_forward_embedding, [1, 0, 2])
    nli_backward_embedding = tf.transpose(nli_backward_embedding, [1, 0, 2])
    nli_forward_input_mask = tf.cast(
        tf.transpose(nli_forward_input_mask, [1, 0]), tf.float32)
    nli_backward_input_mask = tf.cast(
        tf.transpose(nli_backward_input_mask, [1, 0]), tf.float32)

    model = HadeModel(
        x_random_forward=random_forward_embedding,
        x_random_mask_forward=random_forward_input_mask,
        x_random_length_forward=random_forward_text_len,
        x_random_backward=random_backward_embedding,
        x_random_mask_backward=random_backward_input_mask,
        x_random_length_backward=random_backward_text_len,
        y_random=random_labels,
        x_swap_forward=swap_forward_embedding,
        x_swap_mask_forward=swap_forward_input_mask,
        x_swap_length_forward=swap_forward_text_len,
        x_swap_backward=swap_backward_embedding,
        x_swap_mask_backward=swap_backward_input_mask,
        x_swap_length_backward=swap_backward_text_len,
        y_swap=swap_labels,
        # x_generic_forward=generic_forward_embedding,
        # x_generic_mask_forward=generic_forward_input_mask,
        # x_generic_length_forward=generic_forward_text_len,
        # x_generic_backward=generic_backward_embedding,
        # x_generic_mask_backward=generic_backward_input_mask,
        # x_generic_length_backward=generic_backward_text_len, y_generic=generic_labels,
        x_nli_forward=nli_forward_embedding,
        x_nli_mask_forward=nli_forward_input_mask,
        x_nli_length_forward=nli_forward_text_len,
        x_nli_backward=nli_backward_embedding,
        x_nli_mask_backward=nli_backward_input_mask,
        x_nli_length_backward=nli_backward_text_len,
        y_nli=nli_labels,
        embedding_dim=random_forward_embedding_shape[2],
        num_nli_labels=num_nli_labels,
        hidden_size=lstm_size,
        l2_reg_lambda=l2_reg_lambda,
        num_layers=num_layers,
        dropout_rate=dropout_rate,
        is_training=is_training)

    random_prob, swap_prob, nli_prob, total_cost = model.create_model()

    return random_prob, swap_prob, nli_prob, total_cost, final_lm_loss, perplexity