示例#1
0
 def classify_domain(self, inputs):
     """Map `inputs` to domain logits with a two-layer dense head.

     The first layer ('domain_layer_prev') has `self.hidden_size` units and
     a ReLU activation; the second ('domain_layer_final') is linear with
     `self.num_domains` units. Each kernel uses an initializer built by
     `_mh.create_initializer` from `self.initializer_range`.

     Args:
         inputs: Tensor fed to the first dense layer.

     Returns:
         The un-activated output of the final dense layer.
     """
     hidden_init = _mh.create_initializer(
         initializer_range=self.initializer_range)
     hidden = tf.layers.dense(inputs,
                              self.hidden_size,
                              activation=tf.nn.relu,
                              kernel_initializer=hidden_init,
                              name='domain_layer_prev')

     logits_init = _mh.create_initializer(
         initializer_range=self.initializer_range)
     return tf.layers.dense(hidden,
                            self.num_domains,
                            activation=None,
                            kernel_initializer=logits_init,
                            name='domain_layer_final')
示例#2
0
def textCNN(embedding,
            seq_length,
            window_size,
            pool_size,
            filter_number,
            hidden_size,
            dropout_prob,
            initializer_range,
            scope=None):
    """Extract text features with parallel convolution + max-pool branches.

    Adapted from:
      https://github.com/dennybritz/cnn-text-classification-tf/blob/master/text_cnn.py

    Args:
        embedding: Tensor whose shape list (via `_mh.get_shape_list`) holds
            the embedding width at index 2; presumably
            [batch_size, seq_length, embedding_size] - TODO confirm.
        seq_length: Overwritten from the shape of `embedding` and not read
            afterwards.
        window_size: Sequence of convolution window heights, one per branch.
        pool_size: Indexable of max-pool heights; `pool_size[i]` is paired
            with `window_size[i]`.
        filter_number: Number of convolution filters per branch.
        hidden_size: Number of units of the final dense layer.
        dropout_prob: Drop probability; `1 - dropout_prob` is the keep_prob.
        initializer_range: Forwarded to `_mh.create_initializer` for the
            kernel of the final dense layer.
        scope: Optional variable-scope name used for every branch; when None
            each branch falls back to the default name 'conv_<index>'.

    Returns:
        The output of the final ReLU dense layer ('layer_output').
    """
    shape = _mh.get_shape_list(embedding)
    seq_length = shape[1]
    embedding_size = shape[2]
    # conv2d wants a channel axis, so append one of size 1.
    conv_input = tf.expand_dims(embedding, -1)

    def _conv_pool_branch(branch_idx, window):
        """Build one conv -> bias -> ReLU -> max-pool branch."""
        with tf.variable_scope(scope,
                               default_name='conv_{}'.format(branch_idx)):
            kernel_shape = [window, embedding_size, 1, filter_number]
            kernel = tf.Variable(
                tf.truncated_normal(kernel_shape, stddev=0.1), name="W")
            bias = tf.Variable(
                tf.constant(0.1, shape=[filter_number]), name="b")

            feature_map = tf.nn.conv2d(conv_input,
                                       kernel,
                                       strides=[1, 1, 1, 1],
                                       padding="VALID",
                                       name="conv")
            biased = tf.nn.bias_add(feature_map, bias)
            activated = tf.nn.relu(biased, name="relu")

            return tf.nn.max_pool(activated,
                                  ksize=[1, pool_size[branch_idx], 1, 1],
                                  strides=[1, 1, 1, 1],
                                  padding='VALID',
                                  name="pool")

    pooled_outputs = [
        _conv_pool_branch(idx, window)
        for idx, window in enumerate(window_size)
    ]

    # Merge the branches along the filter axis and flatten.
    # NOTE(review): the reshape keeps one row per example only if every pooled
    # map has height 1 - confirm the pool_size values supplied by callers.
    total_filters = filter_number * len(window_size)
    merged = tf.concat(pooled_outputs, 3)
    flattened = tf.reshape(merged, [-1, total_filters])

    with tf.name_scope("dropout"):
        dropped = tf.nn.dropout(flattened, keep_prob=(1 - dropout_prob))

    with tf.variable_scope('textCNN_output'):
        output_init = _mh.create_initializer(
            initializer_range=initializer_range)
        return tf.layers.dense(dropped,
                               hidden_size,
                               activation=tf.nn.relu,
                               kernel_initializer=output_init,
                               name='layer_output')
示例#3
0
 def classify_layer(self, inputs):
     """Project `inputs` to `self.num_classes` un-activated class logits.

     Args:
         inputs: Tensor fed to the dense layer named 'label_layer'.

     Returns:
         The linear (activation=None) output of that dense layer.
     """
     label_init = _mh.create_initializer(
         initializer_range=self.initializer_range)
     return tf.layers.dense(inputs,
                            self.num_classes,
                            activation=None,
                            kernel_initializer=label_init,
                            name='label_layer')
示例#4
0
    def build(self, input_text, input_image, scope=None):
        """Build the whole graph.

        Text goes through an embedding lookup and `textCNN`; the image goes
        through `self.vgg` and a dense projection. Both feature vectors are
        concatenated and fed to the label classifier and, through
        `flip_gradient`, to the domain classifier.

        Args:
            input_text: Token ids passed to `_mh.embedding_lookup`.
            input_image: Image input passed to `self.vgg`.
            scope: Optional variable-scope name; defaults to 'EANN'.

        Returns:
            A tuple (label_output, domain_output, batch_size), where
            batch_size is the first entry of the shape list of the VGG output.
        """
        with tf.variable_scope(scope, default_name='EANN'):
            # --- text branch -------------------------------------------
            with tf.variable_scope('embeddings'):
                lookup = _mh.embedding_lookup(
                    input_ids=input_text,
                    vocab_size=self.vocab_size,
                    embedding_size=self.embedding_size,
                    initializer_range=self.initializer_range,
                    word_embedding_name='word_embeddings')
                embedded_text, self.embedding_table = lookup

            with tf.variable_scope('textCNN'):
                text_features = textCNN(embedded_text,
                                        self.seq_length,
                                        self.window_size,
                                        self.pool_size,
                                        self.filter_number_text,
                                        self.hidden_size,
                                        self.dropout,
                                        self.initializer_range)

            # --- image branch ------------------------------------------
            with tf.variable_scope('vgg_19'):
                vgg_features = self.vgg(input_image)
                batch_size = _mh.get_shape_list(vgg_features)[0]
                # The dense layer below needs a fully specified last
                # dimension, so flatten to the hard-coded width 25088.
                flat_features = tf.reshape(vgg_features, (batch_size, 25088))
                image_init = _mh.create_initializer(
                    initializer_range=self.initializer_range)
                image_features = tf.layers.dense(
                    flat_features,
                    self.hidden_size,
                    activation=None,
                    kernel_initializer=image_init,
                    name='image_output_layer')

            # --- fusion and heads --------------------------------------
            fused = tf.concat((text_features, image_features), -1)

            with tf.variable_scope('classify_label'):
                label_output = self.classify_layer(fused)

            with tf.variable_scope('classify_domain'):
                # The domain head sees the fused features through the
                # gradient-reversal op.
                reversed_fused = flip_gradient(fused)
                domain_output = self.classify_domain(reversed_fused)

        return label_output, domain_output, batch_size
示例#5
0
    def __init__(self,
                 config,
                 is_training,
                 input_ids,
                 input_mask=None,
                 token_type_ids=None,
                 pre_positional_embeddings=None,
                 use_one_hot_embeddings=False,
                 scope=None):
        """Constructor for ALBert.

        Builds the embedding, encoder and pooler sub-graphs and stores the
        results on `self.embedding_output`, `self.embedding_table`,
        `self.projection_table`, `self.all_encoder_layers`,
        `self.sequence_output` and `self.pooled_output`.

        Args:
            config: configuration object (deep-copied before use, so the
                caller's instance is never mutated). # TODO document fields
            is_training: bool. If True, enable dropout, else disable dropout.
            input_ids: int32 Tensor of shape [batch_size, seq_length].
            input_mask: (optional) int32 Tensor marking which positions may be
                attended (1) or not (0). It is handed to the encoder
                unchanged as the attention mask, which `tranformer_model`
                documents as [batch_size, seq_length, seq_length].
                When None, an all-ones mask of that shape is created.
                ATTENTION: for the UniLM model see `create_mask_for_lm` in
                    load_data.py for how the mask is produced.
            token_type_ids: (optional) int32 Tensor, point the words belonging
                to different segments, [batch_size, seq_length]. Defaults to
                all zeros.
            pre_positional_embeddings: (optional) forwarded unchanged to
                `_mh.embedding_postprocessor`.
            use_one_hot_embeddings: (optional) bool. Whether to use one-hot
                word embeddings or tf.embedding_lookup() for the word
                embeddings.
            scope: (optional) variable-scope name; defaults to 'bert'.
        """
        config = copy.deepcopy(config)
        if not is_training:
            # Inference: switch every dropout off.
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        input_shape = _mh.get_shape_list(input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if input_mask is None:
            # Each word is a real word, no padding. The default used to be
            # stored in `input_shape` by mistake, leaving `input_mask` as
            # None. The mask goes to the encoder as-is, so build it in the
            # [batch_size, seq_length, seq_length] layout the encoder expects.
            input_mask = tf.ones(shape=[batch_size, seq_length, seq_length],
                                 dtype=tf.int32)

        if token_type_ids is None:
            token_type_ids = tf.zeros(shape=[batch_size, seq_length],
                                      dtype=tf.int32)

        with tf.variable_scope(scope, default_name='bert'):
            # Embedding
            with tf.variable_scope('embeddings'):
                # 1. obtain embeddings
                (self.embedding_output,
                 self.embedding_table,
                 self.projection_table) = _mh.embedding_lookup_factorized(
                     input_ids=input_ids,
                     vocab_size=config.vocab_size,
                     hidden_size=config.hidden_size,
                     embedding_size=config.embedding_size,
                     use_one_hot_embedding=use_one_hot_embeddings,
                     initializer_range=config.initializer_range,
                     word_embedding_name='word_embeddings')

                # 2. add positional embeddings
                # (token-type embeddings are disabled via use_token_type=False)
                self.embedding_output = _mh.embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    use_token_type=False,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.token_type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_positional_embeddings=True,
                    positional_embedding_type=config.pre_positional_embedding_type,
                    pre_positional_embeddings=pre_positional_embeddings,
                    positional_embedding_name='position_embeddings',
                    initializer_range=config.initializer_range,
                    max_positional_embeddings=config.max_positional_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

            # Encoder
            with tf.variable_scope('encoder'):
                # ATTENTION: the mask is used directly instead of going
                # through _mh.create_attention_mask_from_input_mask (not
                # suitable for this lm task).
                attention_mask = input_mask

                self.all_encoder_layers = tranformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=_mh.gelu,
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True,
                    share_parameter_across_layers=False)

            self.sequence_output = self.all_encoder_layers[-1]

            # for classification task
            with tf.variable_scope('pooler'):
                # [batch_size, seq_length, hidden_size] -> [batch_size, hidden_size]
                first_token_tensor = tf.squeeze(
                    self.sequence_output[:, 0:1, :], axis=1)
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=_mh.create_initializer(
                        config.initializer_range))
示例#6
0
    def model_fn(features, labels, mode, params):
        """Estimator model function: classify `features['input_ids']`.

        Reads `bert_config` and `init_checkpoint` from the enclosing scope.
        PREDICT returns the arg-max class ids under the key 'class'; TRAIN
        returns loss, train op and a logging hook; EVAL is not implemented.

        Args:
            features: dict of input tensors; only 'input_ids' is consumed.
            labels: label tensor, flattened to 1-D for the loss.
            mode: one of the `tf.estimator.ModeKeys` values.
            params: unused.

        Returns:
            A `tf.estimator.EstimatorSpec` for the requested mode.

        Raises:
            NotImplementedError: when `mode` is EVAL.
        """
        _info('*** Features ***')
        for feature_name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" %
                            (feature_name, features[feature_name].shape))

        input_ids = features['input_ids']  # [batch_size, seq_length]

        # build model
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        model = BertModelOfficial(config=bert_config,
                                  is_training=is_training,
                                  input_ids=input_ids)

        # Despite the variable name this is the pooled output of the model.
        sequence_output = model.get_pooled_output()
        _info(sequence_output.shape)

        with tf.variable_scope('prediction'):
            logits = tf.layers.dense(
                sequence_output,
                bert_config.classes,
                name='prediction',
                kernel_initializer=_mh.create_initializer(0.2))
            prob = tf.nn.softmax(logits, axis=-1)
            predict_ids = tf.argmax(prob, axis=-1)

            if mode == tf.estimator.ModeKeys.PREDICT:
                # With a custom predictions dict, the output keys are exactly
                # the keys of that dict.
                return tf.estimator.EstimatorSpec(
                    mode, predictions={'class': predict_ids})

            if mode == tf.estimator.ModeKeys.EVAL:
                # TODO
                raise NotImplementedError

            if mode == tf.estimator.ModeKeys.TRAIN:
                tvars = tf.trainable_variables()
                initialized_variable_names = {}
                if init_checkpoint:
                    assignment_map, initialized_variable_names = (
                        get_assignment_map_from_checkpoint(
                            tvars, init_checkpoint))
                    tf.train.init_from_checkpoint(init_checkpoint,
                                                  assignment_map)

                _info('*** Trainable Variables ***')
                for var in tvars:
                    from_ckpt = var.name in initialized_variable_names
                    init_string = ', *INIT_FROM_CKPT*' if from_ckpt else ''
                    _info('name = {}, shape={}{}'.format(
                        var.name, var.shape, init_string))

                # The summed cross entropy is divided by the configured batch
                # size, not by the number of rows actually in the batch.
                batch_size = tf.cast(bert_config.batch_size, tf.float32)
                labels = tf.reshape(labels, [-1])
                per_example_loss = (
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=labels, logits=logits))
                loss = tf.reduce_sum(per_example_loss) / batch_size

                # Tutorial on `polynomial_decay`:
                #   global_step = min(global_step, decay_steps)
                #   decayed_learning_rate =
                #       (learning_rate - end_learning_rate)
                #       * (1 - global_step / decay_steps) ^ (power)
                #       + end_learning_rate
                #   global_step: each batch step.
                #   decay_steps: the whole step, the lr will touch the
                #       end_learning_rate after the decay_steps.
                #   TRAIN_STEPS: the number for repeating the whole dataset,
                #       so decay_steps = len(dataset) / batch_size * TRAIN_STEPS.
                train_op, lr = optimization.create_optimizer(
                    loss,
                    bert_config.learning_rate,
                    bert_config.num_train_steps * 100,
                    bert_config.lr_limit)
                # Earlier hand-rolled variant, kept for reference only:
                #   tf.train.polynomial_decay (end_learning_rate=0.0,
                #   power=1.0, cycle=False), floored at config.lr_limit with
                #   tf.maximum, fed to tf.train.AdamOptimizer, with gradients
                #   clipped by tf.clip_by_global_norm(gradients, 5.0).

                # The hook prints loss and lr every 10 iterations, whereas the
                # spec alone only reports at checkpoint-saving steps.
                logging_hook = tf.train.LoggingTensorHook(
                    {'loss': loss, 'lr': lr}, every_n_iter=10)

                output_spec = tf.estimator.EstimatorSpec(
                    mode,
                    loss=loss,
                    train_op=train_op,
                    training_hooks=[logging_hook])

            return output_spec
    def build(self, sent_A, sent_B, sent_length_A, sent_length_B, scope=None):
        """Build the two-sentence matching graph and return its prediction.

        Each sentence is encoded with `RNNEncoder` and summarised with
        `CNNExtractor`; the two encodings attend to each other through
        `AttentionLayer`. Encodings and attention outputs are combined,
        reduced with max and mean over the last axis, concatenated with the
        CNN features, compared by `self.similarity_model` and passed through
        two tanh dense layers under the 'prediction' scope.

        Args:
            sent_A: First sentence input, forwarded to `RNNEncoder`.
            sent_B: Second sentence input, forwarded to `RNNEncoder`.
            sent_length_A: Length input paired with `sent_A`.
            sent_length_B: Length input paired with `sent_B`.
            scope: Accepted but not used by this method.

        Returns:
            The output of the final dense layer ('layer_final', 2 units).
        """

        def _encode(sentence, length):
            """Run the shared RNN encoder configuration on one sentence."""
            return RNNEncoder(sentence, length, self.vocab_size,
                              self.embedding_size, self.num_layers,
                              self.hidden_size, self.forget_bias,
                              self.dropout, self.initializer_range)

        def _extract(encoded):
            """Run the shared CNN extractor configuration on one encoding."""
            return CNNExtractor(encoded, self.kernel_size, self.pool_size,
                                self.dropout, self.initializer_range)

        def _interact(encoded, attended):
            """Concatenate encoding, attention, difference and product."""
            difference = encoded - attended
            product = tf.multiply(encoded, attended)
            return tf.concat((encoded, attended, difference, product),
                             axis=-1)

        # RNN encoder
        enc_a = _encode(sent_A, sent_length_A)
        enc_b = _encode(sent_B, sent_length_B)

        # CNN
        cnn_a = _extract(enc_a)
        cnn_b = _extract(enc_b)

        # Cross attention in both directions
        att_a = AttentionLayer(enc_a, enc_b)
        att_b = AttentionLayer(enc_b, enc_a)

        # Max and mean over the last axis of the combined representations
        mixed_a = _interact(enc_a, att_a)
        mixed_b = _interact(enc_b, att_b)
        max_a = tf.reduce_max(mixed_a, axis=-1)
        avg_a = tf.reduce_mean(mixed_a, axis=-1)
        max_b = tf.reduce_max(mixed_b, axis=-1)
        avg_b = tf.reduce_mean(mixed_b, axis=-1)

        # Final per-sentence vectors
        summary_a = tf.concat((max_a, cnn_a, avg_a), axis=-1)
        summary_b = tf.concat((max_b, cnn_b, avg_b), axis=-1)

        similarity = self.similarity_model(summary_a, summary_b)

        with tf.variable_scope('prediction'):
            # The middle layer is half as wide as the similarity vector.
            mid_units = _mh.get_shape_list(similarity)[1] // 2
            mid_init = _mh.create_initializer(
                initializer_range=self.initializer_range)
            hidden = tf.layers.dense(similarity,
                                     mid_units,
                                     activation=tf.nn.tanh,
                                     kernel_initializer=mid_init,
                                     name='layer_mid')

            final_init = _mh.create_initializer(
                initializer_range=self.initializer_range)
            prediction = tf.layers.dense(hidden,
                                         2,
                                         activation=tf.nn.tanh,
                                         kernel_initializer=final_init,
                                         name='layer_final')

        return prediction
示例#8
0
def tranformer_model(input_tensor,
                     attention_mask=None,
                     hidden_size=1024,
                     num_hidden_layers=12,
                     num_attention_heads=12,
                     intermediate_size=3072,
                     intermediate_act_fn=_mh.gelu,
                     hidden_dropout_prob=0.1,
                     attention_probs_dropout_prob=0.1,
                     initializer_range=0.02,
                     do_return_all_layers=False,
                     share_parameter_across_layers=True):
    """Multi-head, multi-layer Transformer.

    Every layer is: self-attention -> dense + dropout + residual layer norm
    -> intermediate dense -> output dense + dropout + residual layer norm.

    Args:
        input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
        attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, seq_length],
            where 1 indicates the position can be attended and 0 indicates the position cannot be attended.
        hidden_size: int. Hidden size of the Transformer.
        num_hidden_layers: int. Number of layers in the Transformer.
        num_attention_heads: int. Number of attention heads in the Transformer.
        intermediate_size: int. The size of the feed forward layer.
        intermediate_act_fn: activation function after feed forward layer.
        hidden_dropout_prob: float.
        attention_probs_dropout_prob: float.
        initializer_range: float.
        do_return_all_layers: bool. Return the output from all the hidden layers or just the final layer.
        share_parameter_across_layers: bool. Whether share parameters across each attention layer.

    Returns:
        float Tensor of shape [batch_size, seq_length, hidden_size],
        or a list contains 'num_hidden_layers' float Tensor.

    Raises:
        ValueError: if `hidden_size` is not a multiple of
            `num_attention_heads`, or if the last dimension of
            `input_tensor` differs from `hidden_size`.
    """
    if hidden_size % num_attention_heads != 0:
        message = (
            'The hidden size {} cannot be divided by the number of attention heads {}'
            .format(hidden_size, num_attention_heads))
        _error(message)
        # Carry the message on the exception too, not only in the log.
        raise ValueError(message)

    # the hidden size for each head
    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = _mh.get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    # Residual connections add each block's output to its input, so the
    # block output width (hidden_size) must equal the input width.
    # hidden_size is distinct from intermediate_size: the intermediate layer
    # sits before the hidden-size output projection.
    if input_width != hidden_size:
        message = (
            'The width of the input tensor {} is not equal to the hidden size {}'
            .format(input_width, hidden_size))
        _error(message)
        raise ValueError(message)

    prev_output = input_tensor  # [batch_size, seq_length, width]
    all_layer_outputs = []
    for layer_idx in range(num_hidden_layers):
        if share_parameter_across_layers:
            name_variable_scope = 'layer_shared'
        else:
            name_variable_scope = 'layer_{}'.format(layer_idx)

        # When sharing, the first layer creates the variables and every later
        # layer reuses them.
        reuse_variables = bool(share_parameter_across_layers
                               and layer_idx > 0)
        with tf.variable_scope(name_variable_scope, reuse=reuse_variables):
            layer_input = prev_output
            with tf.variable_scope('attention'):
                with tf.variable_scope('self'):
                    attention_head = self_attention_layer(
                        from_tensor=layer_input,
                        to_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=
                        attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        batch_size=batch_size,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length)
                attention_output = attention_head
                # perform residual layer to finish the self-attention block
                with tf.variable_scope('output'):
                    attention_output = tf.layers.dense(
                        attention_output,
                        hidden_size,
                        kernel_initializer=_mh.create_initializer(
                            initializer_range))
                    attention_output = _mh.dropout(attention_output,
                                                   hidden_dropout_prob)
                    attention_output = _mh.layer_norm(attention_output +
                                                      layer_input)

            # do double linear projection to enhance the context representation
            with tf.variable_scope('intermediate'):
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=_mh.create_initializer(
                        initializer_range))

            with tf.variable_scope('output'):
                layer_output = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=_mh.create_initializer(
                        initializer_range))
                layer_output = _mh.dropout(layer_output, hidden_dropout_prob)
                layer_output = _mh.layer_norm(layer_output + attention_output)
                prev_output = layer_output
                all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        return all_layer_outputs
    else:
        return all_layer_outputs[-1]
示例#9
0
def self_attention_layer(from_tensor,
                         to_tensor,
                         attention_mask=None,
                         num_attention_heads=1,
                         size_per_head=512,
                         query_act=None,
                         key_act=None,
                         value_act=None,
                         attention_probs_dropout_prob=0.0,
                         initializer_range=0.02,
                         batch_size=None,
                         from_seq_length=None,
                         to_seq_length=None):
    """Perform multi-head scaled dot-product attention.

    Queries are projected from `from_tensor`, keys and values from
    `to_tensor`. The scaled scores are masked, turned into probabilities
    with softmax, passed through dropout and used to weight the values.

    Args:
        from_tensor: float Tensor of shape [batch_size, seq_length, width].
        to_tensor: float Tensor of shape [batch_size, seq_length, width].
        attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, seq_length],
            where 1 indicates the position can be attended and 0 indicates the position cannot be attended.
        num_attention_heads: int. Number of attention heads in the Transformer.
        size_per_head: int. Size of each attention head.
        query_act: (optional) Activation function for the query transformer.
        key_act: (optional) Activation function for the key transformer.
        value_act: (optional) Activation function for the value transformer.
        attention_probs_dropout_prob: (optional) float.
        initializer_range: float.
        batch_size: (optional) int. Defaults to dimension 0 of `from_tensor`.
        from_seq_length: (optional) int. Defaults to dimension 1 of `from_tensor`.
        to_seq_length: (optional) int. Defaults to dimension 1 of `to_tensor`.

    Returns:
        float Tensor of shape
        [batch_size, from_seq_length, num_attention_heads * size_per_head].

    Raises:
        ValueError: if either input tensor is not of rank 3.
    """
    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                             seq_length, size_per_head):
        """Split the width into heads and move the head axis forward.

        width = num_attention_heads * size_per_head.

        Args:
            input_tensor: float Tensor of shape [batch_size, seq_length, width].

        Returns:
            float Tensor of shape [batch_size, num_attention_heads, seq_length, size_per_head].
        """
        output_tensor = tf.reshape(
            input_tensor,
            [batch_size, seq_length, num_attention_heads, size_per_head])
        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    # check the rank
    from_shape = _mh.get_shape_list(from_tensor, expected_rank=3)
    to_shape = _mh.get_shape_list(to_tensor, expected_rank=3)

    # Each rank is tested on its own: the chained form `a != b != 3` lets two
    # equal but wrong ranks slip through.
    if len(from_shape) != 3 or len(to_shape) != 3:
        message = (
            'The rank of `from_tensor` should match the rank of `to_tensor`, and should be 3'
        )
        _error(message)
        raise ValueError(message)

    # The reshapes below cannot take None, so fall back to the input shapes.
    if batch_size is None:
        batch_size = from_shape[0]
    if from_seq_length is None:
        from_seq_length = from_shape[1]
    if to_seq_length is None:
        to_seq_length = to_shape[1]

    # calculate the query, key, value
    # from_tensor: [batch_size, seq_length, width] -> query_layer: [batch_size, seq_length, num_attention_heads * size_per_head]
    # num_attention_heads * size_per_head == hidden_size == width
    projection_width = num_attention_heads * size_per_head
    query_layer = tf.layers.dense(
        from_tensor,
        projection_width,
        activation=query_act,
        name='query',
        kernel_initializer=_mh.create_initializer(initializer_range))

    key_layer = tf.layers.dense(
        to_tensor,
        projection_width,
        activation=key_act,
        name='key',
        kernel_initializer=_mh.create_initializer(initializer_range))

    value_layer = tf.layers.dense(
        to_tensor,
        projection_width,
        activation=value_act,
        name='value',
        kernel_initializer=_mh.create_initializer(initializer_range))

    # [batch_size, seq_length, width] -> [batch_size, num_attention_heads, seq_length, size_per_head]
    query_layer = transpose_for_scores(query_layer, batch_size,
                                       num_attention_heads, from_seq_length,
                                       size_per_head)
    key_layer = transpose_for_scores(key_layer, batch_size,
                                     num_attention_heads, to_seq_length,
                                     size_per_head)

    # calculate the attention scores
    # [batch_size, num_attention_heads, from_seq_length, to_seq_length]
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None:
        # [batch_size, seq_length, seq_length] -> [batch_size, 1, seq_length, seq_length]
        attention_mask = tf.expand_dims(attention_mask, axis=1)
        # Masked positions get a large negative score so softmax drives them
        # to ~0; allowed positions are left unchanged.
        adder = (1.0 - tf.cast(attention_mask, dtype=tf.float32)) * -10000.0
        attention_scores += adder

    attention_probs = tf.nn.softmax(attention_scores)
    attention_probs = _mh.dropout(attention_probs,
                                  attention_probs_dropout_prob)

    # calculate the context layer
    # [batch_size, num_attention_heads, to_seq_length, size_per_head]
    value_layer = transpose_for_scores(value_layer, batch_size,
                                       num_attention_heads, to_seq_length,
                                       size_per_head)
    # Weight the values by the normalised probabilities, not the raw scores.
    context_layer = tf.matmul(attention_probs, value_layer)
    # [batch_size, from_seq_length, num_attention_heads, size_per_head]
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
    # [batch_size, from_seq_length, width]
    context_layer = tf.reshape(
        context_layer,
        [batch_size, from_seq_length, projection_width])

    return context_layer