Example #1
def create_attention_mask_from_input_mask(from_tensor, to_mask):
    """Create 3D attention mask from a 2D tensor mask.
    Args:
      from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
      to_mask: int32 Tensor of shape [batch_size, to_seq_length].
    Returns:
      float Tensor of shape [batch_size, from_seq_length, to_seq_length].
    """
    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]

    to_shape = get_shape_list(to_mask, expected_rank=2)
    to_seq_length = to_shape[1]

    to_mask = tf.cast(tf.reshape(to_mask, [batch_size, 1, to_seq_length]),
                      tf.float32)

    # We don't assume that `from_tensor` is a mask (although it could be). We
    # don't actually care if we attend *from* padding tokens (only *to* padding
    # tokens), so we create a tensor of all ones.
    #
    # `broadcast_ones` = [batch_size, from_seq_length, 1]
    broadcast_ones = tf.ones(shape=[batch_size, from_seq_length, 1],
                             dtype=tf.float32)

    # Here we broadcast along two dimensions to create the mask.
    mask = broadcast_ones * to_mask

    return mask
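
A minimal usage sketch for the helper above, assuming TensorFlow 1.x (or tf.compat.v1) and that create_attention_mask_from_input_mask and get_shape_list are importable from the same module; the shapes below are illustrative.

import tensorflow.compat.v1 as tf  # plain `import tensorflow as tf` on TF 1.x

# [batch_size=2, to_seq_length=4]; zeros mark padding positions.
to_mask = tf.constant([[1, 1, 1, 0],
                       [1, 1, 0, 0]], dtype=tf.int32)
# [batch_size=2, from_seq_length=3, width=8]; only the shape matters here.
from_tensor = tf.zeros([2, 3, 8], dtype=tf.float32)

# attention_mask[b, i, j] == 1.0 iff position j of example b is not padding.
attention_mask = create_attention_mask_from_input_mask(from_tensor, to_mask)
# attention_mask shape: [2, 3, 4]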
Example #2
def create_similar_model(model_config,
                         is_training,
                         input_ids_a,
                         input_mask_a,
                         segment_ids_a,
                         input_ids_b,
                         input_mask_b,
                         segment_ids_b,
                         label,
                         embedding_table=None,
                         hidden_dropout_prob=0.1,
                         use_one_hot_embeddings=False):
    """Creates a classification model."""
    model_a = modeling.TextEncoder(config=model_config,
                                   is_training=is_training,
                                   input_ids=input_ids_a,
                                   embedding_table=embedding_table,
                                   input_mask=input_mask_a,
                                   token_type_ids=segment_ids_a)

    model_b = modeling.TextEncoder(config=model_config,
                                   is_training=is_training,
                                   input_ids=input_ids_b,
                                   embedding_table=embedding_table,
                                   input_mask=input_mask_b,
                                   token_type_ids=segment_ids_b)

    sequence_output_a = model_a.get_sequence_output()
    sequence_output_b = model_b.get_sequence_output()

    seq_out_shape = get_shape_list(sequence_output_a)
    batch_size = seq_out_shape[0]

    text_representation_a = tf.reduce_mean(sequence_output_a, axis=1)
    text_representation_b = tf.reduce_mean(sequence_output_b, axis=1)

    with tf.variable_scope("loss"):
        normalize_a = tf.math.l2_normalize(text_representation_a, axis=-1)
        normalize_b = tf.math.l2_normalize(text_representation_b, axis=-1)
        # a_shape = get_shape_list(normalize_a)
        # b_shape = get_shape_list(normalize_b)
        # tf.logging.info("a_shape:%s, b_shape:%s"%(str(a_shape), str(b_shape)))
        # cosine_dist = tf.losses.cosine_distance(normalize_a, normalize_b, axis=-1)
        # label_shape = get_shape_list(label)
        # cosine_dist_shape = get_shape_list(cosine_dist)
        # tf.logging.info("label_shape:%s, cosine_dist_shape:%s"%(str(label_shape), str(cosine_dist_shape)))
        normalize_a = tf.reshape(normalize_a, shape=[batch_size, 1, -1])
        normalize_b = tf.reshape(normalize_b, shape=[batch_size, 1, -1])
        cosine = tf.matmul(normalize_a, normalize_b, transpose_b=True)
        cosine = tf.reshape(cosine, shape=[batch_size])
        # cosine = tf.tensordot(normalize_a, normalize_b, axes=[[1],[1]])
        loss = tf.reduce_sum(tf.losses.mean_squared_error(label, cosine))
        return (loss, cosine)
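
A hypothetical call site for create_similar_model, sharing the TF 1.x assumptions of the sketch after Example #1; model_config, embedding_table, and the modeling module are assumed to be prepared elsewhere, and the batch size, sequence length, and optimizer are illustrative only.

batch_size, max_seq_length = 32, 128  # illustrative sizes

input_ids_a = tf.placeholder(tf.int32, [batch_size, max_seq_length])
input_mask_a = tf.placeholder(tf.int32, [batch_size, max_seq_length])
segment_ids_a = tf.placeholder(tf.int32, [batch_size, max_seq_length])
input_ids_b = tf.placeholder(tf.int32, [batch_size, max_seq_length])
input_mask_b = tf.placeholder(tf.int32, [batch_size, max_seq_length])
segment_ids_b = tf.placeholder(tf.int32, [batch_size, max_seq_length])
label = tf.placeholder(tf.float32, [batch_size])  # target cosine similarity per pair

loss, cosine = create_similar_model(model_config, is_training=True,
                                    input_ids_a=input_ids_a, input_mask_a=input_mask_a,
                                    segment_ids_a=segment_ids_a,
                                    input_ids_b=input_ids_b, input_mask_b=input_mask_b,
                                    segment_ids_b=segment_ids_b,
                                    label=label, embedding_table=embedding_table)
train_op = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(loss)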
Example #3
def create_model(model_config,
                 is_training,
                 input_ids,
                 input_mask,
                 keyword_mask,
                 segment_ids,
                 embedding_table=None,
                 hidden_dropout_prob=0.1,
                 use_one_hot_embeddings=False):
    """Creates a classification model."""
    model = modeling.TextEncoder(
        config=model_config,
        is_training=is_training,
        input_ids=input_ids,
        embedding_table=embedding_table,
        input_mask=input_mask,
        token_type_ids=segment_ids)

    # If you want to use the token-level output, use model.get_sequence_output()
    # instead.
    sequence_output = model.get_sequence_output()
    sequence_shape = get_shape_list(sequence_output, expected_rank=3)
    batch_size = sequence_shape[0]
    seq_length = sequence_shape[1]

    num_heads = model_config.num_attention_heads
    hidden_size = model_config.hidden_size
    size_per_head = int(hidden_size/num_heads)
    
    prev_output = sequence_output
    for word_layer_idx in range(model_config.word_attn_layer_num):
        layer_input = prev_output
        with tf.variable_scope("word_attn_layer_%d"%(word_layer_idx)):
            attention_head = word_self_attention_layer(layer_input,
                                                        input_mask,
                                                        num_heads,
                                                        size_per_head,
                                                        hidden_size)

            attention_output = tf.layers.dense(
                attention_head,
                hidden_size,
                activation=None,
                kernel_initializer=create_initializer(0.02)
            )
            attention_output = modeling.dropout(attention_output, hidden_dropout_prob)
            prev_output = attention_output
            # attention_output = modeling.layer_norm(attention_output + layer_input)
    tf.logging.info("prev_output shape:%s"%(str(get_shape_list(prev_output))))
    # prev_output shape [batch_size, seq_length, hidden_size]
    # keyword_scores shape [batch_size, seq_length]
    keyword_scores = tf.layers.dense(
        prev_output,
        1,
        activation=None,
        kernel_initializer=create_initializer(0.02),
    )
    tf.logging.info("keyword_scores shape:%s"%(str(get_shape_list(keyword_scores))))
    keyword_scores = tf.reshape(keyword_scores, [batch_size, seq_length])
    # keyword_scores = tf.reduce_sum(prev_output)
    # keyword_scores shape: [batch_size, seq_length]
    mask_adder = (1.0 - tf.cast(input_mask, tf.float32)) * -10000.0
    keyword_scores += mask_adder
    kw_mask_adder = (1.0 - tf.cast(keyword_mask, tf.float32))*-10.0
    keyword_scores += kw_mask_adder
    keyword_probs = tf.nn.softmax(keyword_scores)
    tf.logging.info("mask_adder shape:%s"%(get_shape_list(mask_adder)))
    tf.logging.info("keyword_probs shape:%s"%(get_shape_list(mask_adder)))
    keyword_idx = tf.math.argmax(keyword_probs, axis=1) # [batch_size]
    tf.logging.info("keyword_idx shape:%s"%(str(get_shape_list(keyword_idx))))
    onehot_vec = tf.one_hot(keyword_idx, depth=model_config.max_seq_length) # [batch_size, seq_length]
    onehot_vec_shape = get_shape_list(onehot_vec)
    tf.logging.info("onehot_vec_shape:%s"%(onehot_vec_shape))
    keyword_weight = tf.reshape(onehot_vec, [batch_size, seq_length, 1])
    keyword_vec = tf.reduce_sum(keyword_weight * sequence_output, axis=1)
    tf.logging.info("kwyword_vec shape:%s"%(get_shape_list(keyword_vec)))

    negword_idx = tf.random.uniform(shape=[batch_size,1], minval=0, maxval=model_config.max_seq_length, dtype=tf.int32)
    negword_weights = tf.reshape(tf.one_hot(negword_idx, depth=model_config.max_seq_length), [batch_size, seq_length, 1])
    neg_vec_1 = tf.reduce_sum(negword_weights*sequence_output, axis=1)


    negword_idx = tf.random.uniform(shape=[batch_size,1], minval=0, maxval=model_config.max_seq_length, dtype=tf.int32)
    negword_weights = tf.reshape(tf.one_hot(negword_idx, depth=model_config.max_seq_length), [batch_size, seq_length, 1])
    neg_vec_2 = tf.reduce_sum(negword_weights*sequence_output, axis=1)
    

    with tf.variable_scope("loss"):
        keyword_representation = keyword_vec
        negword_representation = (neg_vec_1 + neg_vec_2)/2
        text_representation = tf.reduce_mean(sequence_output, axis=1)
        # cosine_loss = tf.keras.losses.CosineSimilarity(axis=-1)
        # loss = cosine_loss(keyword_representation, text_representation)
        normalize_a = tf.math.l2_normalize(keyword_representation,axis=-1)
        normalize_b = tf.math.l2_normalize(text_representation,axis=-1)
        normalize_c = tf.math.l2_normalize(negword_representation,axis=-1)
        
        loss_1 = tf.reduce_sum(tf.losses.cosine_distance(normalize_a, normalize_b, axis=-1))
        loss_2 = -tf.reduce_sum(tf.losses.cosine_distance(normalize_a, normalize_c, axis=-1))
        loss = loss_1 + loss_2
        return (loss, text_representation, keyword_probs)  
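
A hypothetical inference-time call for the keyword model above, under the same TF 1.x assumptions; model_config, embedding_table, and the input feeds are assumed to be set up elsewhere, and the final argmax simply recovers the highest-probability keyword position from the returned keyword_probs.

batch_size, max_seq_length = 32, 128  # should match model_config.max_seq_length

input_ids = tf.placeholder(tf.int32, [batch_size, max_seq_length])
input_mask = tf.placeholder(tf.int32, [batch_size, max_seq_length])
keyword_mask = tf.placeholder(tf.int32, [batch_size, max_seq_length])  # 1 where a token may be picked as keyword
segment_ids = tf.placeholder(tf.int32, [batch_size, max_seq_length])

loss, text_representation, keyword_probs = create_model(
    model_config, is_training=False,
    input_ids=input_ids, input_mask=input_mask,
    keyword_mask=keyword_mask, segment_ids=segment_ids,
    embedding_table=embedding_table)

# Index of the predicted keyword token for each example: [batch_size]
predicted_keyword_idx = tf.math.argmax(keyword_probs, axis=1)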
Example #4
def word_self_attention_layer(input_tensor,
                              input_mask,
                              num_attention_heads,
                              size_per_head,
                              hidden_size,
                              query_act=None,
                              key_act=None,
                              value_act=None,
                              attention_probs_dropout_prob=0.0,
                              initializer_range=0.02):
    """
    Args:
        input_tensor: Float Tensor of shape [batch_size, seq_length, hidden_size]
        input_mask: int Tensor of shape [batch_size, seq_length]
        hidden_size
        asster hidden_size == num_attention_heads * size_per_head
        size_per_head == word_embedding_size
    """
    def transpose_for_scores(input_tensor, batch_size, num_attention_heads, seq_length, width):
        output_tensor = tf.reshape(input_tensor,
                                   [batch_size, seq_length, num_attention_heads, width])
        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    shape_list = get_shape_list(input_tensor, expected_rank=[2,3])
    batch_size = shape_list[0]
    seq_length = shape_list[1]

    query_layer = tf.layers.dense(
        inputs=input_tensor,
        units=hidden_size,
        activation=query_act,
        name="query",
        kernel_initializer=create_initializer(initializer_range)
    )
    
    key_layer = tf.layers.dense(
        inputs=input_tensor,
        units=hidden_size,
        activation=key_act,
        name="key",
        kernel_initializer=create_initializer(initializer_range)
    )
    
    value_layer = tf.layers.dense(
        inputs=input_tensor,
        units=hidden_size,
        activation=value_act,
        name="value",
        kernel_initializer=create_initializer(initializer_range)
    )
    query_layer = transpose_for_scores(query_layer, batch_size, num_attention_heads, seq_length, size_per_head) 
    key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, seq_length, size_per_head)

    query_shape_list = get_shape_list(query_layer, expected_rank=4)
    tf.logging.info("query_layer shape: %s"%(str(query_shape_list)))
    # query_layer shape: [batch_size, num_heads, seq_length, size_per_head]
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = tf.multiply(attention_scores, 1.0/math.sqrt(float(size_per_head)))
    # attention_mask shape: [batch_size, seq_length, seq_length]
    attention_mask = modeling.create_attention_mask_from_input_mask(input_tensor, input_mask)
    # expand for multi heads, [batch_size, 1, seq_length, seq_length]
    attention_mask = tf.expand_dims(attention_mask, axis=[1]) 
    mask_adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
    # attention_score: [batch_size, num_heads, seq_length, seq_length]
    attention_scores += mask_adder

    # attention_probs shape: [batch_size, num_heads, seq_length, seq_length]
    attention_probs = tf.nn.softmax(attention_scores)
    attention_probs = modeling.dropout(attention_probs, attention_probs_dropout_prob)
    
    value_layer = tf.reshape(value_layer, [batch_size, seq_length, num_attention_heads, size_per_head])
    # value_layer shape : [batch_size, num_heads, seq_length, size_per_head]
    value_layer = tf.transpose(value_layer, [0, 2, 1, 3])
    context_layer = tf.matmul(attention_probs, value_layer)
    # context_layer shape : [batch_size, seq_length, num_heads, size_per_head]
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
    context_layer = tf.reshape(context_layer, [batch_size, seq_length, num_attention_heads*size_per_head])
    return context_layer
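
A small shape-check sketch for word_self_attention_layer with illustrative sizes; it assumes the helpers it calls (modeling.create_attention_mask_from_input_mask, modeling.dropout, create_initializer, get_shape_list) are in scope, along with the TF 1.x import used above.

batch_size, seq_length, hidden_size, num_heads = 2, 16, 64, 4

layer_input = tf.random.normal([batch_size, seq_length, hidden_size])
input_mask = tf.ones([batch_size, seq_length], dtype=tf.int32)

with tf.variable_scope("word_attn_demo"):
    context = word_self_attention_layer(layer_input,
                                        input_mask,
                                        num_attention_heads=num_heads,
                                        size_per_head=hidden_size // num_heads,
                                        hidden_size=hidden_size)
# context shape: [2, 16, 64] (== [batch_size, seq_length, num_heads * size_per_head])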
Example #5
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
    """Multi-headed, multi-layer Transformer from "Attention is All You Need".
    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
      attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
        seq_length], with 1 for positions that can be attended to and 0 in
        positions that should not be.
      hidden_size: int. Hidden size of the Transformer.
      num_hidden_layers: int. Number of layers (blocks) in the Transformer.
      num_attention_heads: int. Number of attention heads in the Transformer.
      intermediate_size: int. The size of the "intermediate" (a.k.a., feed
        forward) layer.
      intermediate_act_fn: function. The non-linear activation function to apply
        to the output of the intermediate/feed-forward layer.
      hidden_dropout_prob: float. Dropout probability for the hidden layers.
      attention_probs_dropout_prob: float. Dropout probability of the attention
        probabilities.
      initializer_range: float. Range of the initializer (stddev of truncated
        normal).
      do_return_all_layers: Whether to also return all layers or just the final
        layer.
    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size], the final
      hidden layer of the Transformer.
    Raises:
      ValueError: A Tensor shape or parameter is invalid.
    """
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads))

    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    # The Transformer performs sum residuals on all layers so the input needs
    # to be the same as the hidden size.
    if input_width != hidden_size:
        raise ValueError(
            "The width of the input tensor (%d) != hidden size (%d)" %
            (input_width, hidden_size))

    # We keep the representation as a 2D tensor to avoid re-shaping it back and
    # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
    # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
    # help the optimizer.
    prev_output = reshape_to_matrix(input_tensor)

    all_layer_outputs = []
    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope("layer_%d" % layer_idx):
            layer_input = prev_output

            with tf.variable_scope("attention"):
                attention_heads = []
                with tf.variable_scope("self"):
                    attention_head = attention_layer(
                        from_tensor=layer_input,
                        to_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        query_act=None,
                        key_act=None,
                        value_act=None,
                        attention_probs_dropout_prob=
                        attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        do_return_2d_tensor=True,
                        batch_size=batch_size,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length)
                    attention_heads.append(attention_head)

                attention_output = None
                if len(attention_heads) == 1:
                    attention_output = attention_heads[0]
                else:
                    # In the case where we have other sequences, we just concatenate
                    # them to the self-attention head before the projection.
                    attention_output = tf.concat(attention_heads, axis=-1)

                # attention_output : [batch_size*seq_length, num_heads*size_per_head]

                # Run a linear projection of `hidden_size` then add a residual
                # with `layer_input`.
                with tf.variable_scope("output"):
                    attention_output = tf.layers.dense(
                        attention_output,
                        hidden_size,
                        kernel_initializer=create_initializer(
                            initializer_range))
                    attention_output = dropout(attention_output,
                                               hidden_dropout_prob)
                    attention_output = layer_norm(attention_output +
                                                  layer_input)

            # The activation is only applied to the "intermediate" hidden layer.
            with tf.variable_scope("intermediate"):
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=create_initializer(initializer_range))

            # Down-project back to `hidden_size` then add the residual.
            with tf.variable_scope("output"):
                layer_output = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=create_initializer(initializer_range))
                layer_output = dropout(layer_output, hidden_dropout_prob)
                layer_output = layer_norm(layer_output + attention_output)
                prev_output = layer_output
                all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        final_outputs = []
        for layer_output in all_layer_outputs:
            final_output = reshape_from_matrix(layer_output, input_shape)
            final_outputs.append(final_output)
        return final_outputs
    else:
        final_output = reshape_from_matrix(prev_output, input_shape)
        return final_output
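
A minimal wiring sketch that feeds an embedding-sized input and a mask from create_attention_mask_from_input_mask (Example #1) through transformer_model; sizes are illustrative, and the BERT-style helpers referenced by the function (gelu, get_shape_list, reshape_to_matrix, reshape_from_matrix, attention_layer, dropout, layer_norm, create_initializer) are assumed to be defined in the same module.

embedding_output = tf.random.normal([2, 16, 768])   # [batch_size, seq_length, hidden_size]
input_mask = tf.constant([[1] * 12 + [0] * 4,
                          [1] * 16], dtype=tf.int32)
attention_mask = create_attention_mask_from_input_mask(embedding_output, input_mask)

with tf.variable_scope("transformer_demo"):
    sequence_output = transformer_model(input_tensor=embedding_output,
                                        attention_mask=attention_mask,
                                        hidden_size=768,
                                        num_hidden_layers=2,
                                        num_attention_heads=12,
                                        intermediate_size=3072)
# sequence_output shape: [2, 16, 768]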
Example #6
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
    """Performs multi-headed attention from `from_tensor` to `to_tensor`.

    This is an implementation of multi-headed attention based on "Attention
    is All You Need". If `from_tensor` and `to_tensor` are the same, then
    this is self-attention. Each timestep in `from_tensor` attends to the
    corresponding sequence in `to_tensor`, and returns a fixed-width vector.

    This function first projects `from_tensor` into a "query" tensor and
    `to_tensor` into "key" and "value" tensors. These are (effectively) a list
    of tensors of length `num_attention_heads`, where each tensor is of shape
    [batch_size, seq_length, size_per_head].

    Then, the query and key tensors are dot-producted and scaled. These are
    softmaxed to obtain attention probabilities. The value tensors are then
    interpolated by these probabilities, then concatenated back to a single
    tensor and returned.

    In practice, the multi-headed attention is done with transposes and
    reshapes rather than actual separate tensors.

    Args:
      from_tensor: float Tensor of shape [batch_size, from_seq_length,
        from_width].
      to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
      attention_mask: (optional) int32 Tensor of shape [batch_size,
        from_seq_length, to_seq_length]. The values should be 1 or 0. The
        attention scores will effectively be set to -infinity for any positions in
        the mask that are 0, and will be unchanged for positions that are 1.
      num_attention_heads: int. Number of attention heads.
      size_per_head: int. Size of each attention head.
      query_act: (optional) Activation function for the query transform.
      key_act: (optional) Activation function for the key transform.
      value_act: (optional) Activation function for the value transform.
      attention_probs_dropout_prob: (optional) float. Dropout probability of the
        attention probabilities.
      initializer_range: float. Range of the weight initializer.
      do_return_2d_tensor: bool. If True, the output will be of shape [batch_size
        * from_seq_length, num_attention_heads * size_per_head]. If False, the
        output will be of shape [batch_size, from_seq_length, num_attention_heads
        * size_per_head].
      batch_size: (Optional) int. If the input is 2D, this might be the batch size
        of the 3D version of the `from_tensor` and `to_tensor`.
      from_seq_length: (Optional) If the input is 2D, this might be the seq length
        of the 3D version of the `from_tensor`.
      to_seq_length: (Optional) If the input is 2D, this might be the seq length
        of the 3D version of the `to_tensor`.

    Returns:
      float Tensor of shape [batch_size, from_seq_length,
        num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
        true, this will be of shape [batch_size * from_seq_length,
        num_attention_heads * size_per_head]).

    Raises:
      ValueError: Any of the arguments or tensor shapes are invalid.
    """
    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                             seq_length, width):
        output_tensor = tf.reshape(
            input_tensor, [batch_size, seq_length, num_attention_heads, width])

        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")

    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None
                or to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")

    # Scalar dimensions referenced here:
    #   B = batch size (number of sequences)
    #   F = `from_tensor` sequence length
    #   T = `to_tensor` sequence length
    #   N = `num_attention_heads`
    #   H = `size_per_head`

    from_tensor_2d = reshape_to_matrix(from_tensor)
    to_tensor_2d = reshape_to_matrix(to_tensor)

    # `query_layer` = [B*F, N*H]
    query_layer = tf.layers.dense(
        from_tensor_2d,
        num_attention_heads * size_per_head,
        activation=query_act,
        name="query",
        kernel_initializer=create_initializer(initializer_range))

    # `key_layer` = [B*T, N*H]
    key_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=key_act,
        name="key",
        kernel_initializer=create_initializer(initializer_range))

    # `value_layer` = [B*T, N*H]
    value_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=value_act,
        name="value",
        kernel_initializer=create_initializer(initializer_range))

    # `query_layer` = [B, N, F, H]
    query_layer = transpose_for_scores(query_layer, batch_size,
                                       num_attention_heads, from_seq_length,
                                       size_per_head)

    # `key_layer` = [B, N, T, H]
    key_layer = transpose_for_scores(key_layer, batch_size,
                                     num_attention_heads, to_seq_length,
                                     size_per_head)

    # Take the dot product between "query" and "key" to get the raw
    # attention scores.
    # `attention_scores` = [B, N, F, T]
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None:
        # `attention_mask` = [B, 1, F, T]
        attention_mask = tf.expand_dims(attention_mask, axis=[1])

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        attention_scores += adder

    # Normalize the attention scores to probabilities.
    # `attention_probs` = [B, N, F, T]
    attention_probs = tf.nn.softmax(attention_scores)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

    # `value_layer` = [B, T, N, H]
    value_layer = tf.reshape(
        value_layer,
        [batch_size, to_seq_length, num_attention_heads, size_per_head])

    # `value_layer` = [B, N, T, H]
    value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

    # `context_layer` = [B, N, F, H]
    context_layer = tf.matmul(attention_probs, value_layer)

    # `context_layer` = [B, F, N, H]
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

    if do_return_2d_tensor:
        # `context_layer` = [B*F, N*H]
        context_layer = tf.reshape(context_layer, [
            batch_size * from_seq_length, num_attention_heads * size_per_head
        ])
    else:
        # `context_layer` = [B, F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size, from_seq_length, num_attention_heads * size_per_head])
    return context_layer
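
A hypothetical cross-attention call for attention_layer: queries come from one sequence and keys/values from another, so from_tensor and to_tensor differ. Sizes are illustrative, and the helpers the function relies on (get_shape_list, reshape_to_matrix, create_initializer, dropout) are assumed to be in scope.

queries = tf.random.normal([2, 8, 256])    # [batch_size, from_seq_length, from_width]
memory = tf.random.normal([2, 12, 256])    # [batch_size, to_seq_length, to_width]

with tf.variable_scope("cross_attention_demo"):
    context = attention_layer(from_tensor=queries,
                              to_tensor=memory,
                              num_attention_heads=4,
                              size_per_head=64)
# context shape: [2, 8, 4 * 64]; pass do_return_2d_tensor=True to get [2 * 8, 4 * 64] instead.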
Example #7
    def __init__(self,
                 config,
                 is_training,
                 input_ids,
                 embedding_table,
                 input_mask=None,
                 token_type_ids=None,
                 use_einsum=True,
                 scope=None):
        """构造TextEncoder模型
        Args:
            config: ModelConfig instance
            is_training: bool
            input_ids: int32 Tensor of shape [batch_size, seq_length]
            input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]
            use_einsum: (optional) bool. Whether to use einsum or reshape+matmul for
                dense layers
        """
        config = copy.deepcopy(config)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        input_shape = get_shape_list(input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length],
                                 dtype=tf.int32)

        if token_type_ids is None:
            token_type_ids = tf.zeros(shape=[batch_size, seq_length],
                                      dtype=tf.int32)

        with tf.variable_scope(scope, default_name="TextEncoder"):
            with tf.variable_scope("embeddings"):
                embedding_table_shape = get_shape_list(embedding_table,
                                                       expected_rank=2)
                tf.logging.info("Encoder embedding table shape:%s" %
                                (str(embedding_table_shape)))
                self.embedding_output = tf.nn.embedding_lookup(
                    params=embedding_table, ids=input_ids)
                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.token_type_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

                self.embedding_output = tf.tile(
                    self.embedding_output, [1, 1, config.num_attention_heads])

            with tf.variable_scope("encoder"):
                attention_mask = create_attention_mask_from_input_mask(
                    input_ids, input_mask)
                self.all_encoder_layers = transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=get_activation(config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.
                    attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

                # [batch_size, max_seq_length, hidden_size]
                self.sequence_output = self.all_encoder_layers[-1]
                #multi_mask = tf.reshape(input_mask, [batch_size, seq_length,1])
                #self.sequence_output = sequence_output * multi_mask

                with tf.variable_scope("pooler"):
                    # [batch_size, seq_length, hidden_size]
                    self.pooled_output = tf.reduce_sum(
                        self.sequence_output,
                        axis=1)  # [batch_size,hidden_size]
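
A hypothetical construction of the encoder above, under the same TF 1.x assumptions; model_config and embedding_table (a rank-2 [vocab_size, embedding_size] table) are assumed to be created elsewhere, and get_sequence_output() is the accessor already used in Examples #2 and #3.

batch_size, max_seq_length = 32, 128  # illustrative sizes

input_ids = tf.placeholder(tf.int32, [batch_size, max_seq_length])
input_mask = tf.placeholder(tf.int32, [batch_size, max_seq_length])
segment_ids = tf.placeholder(tf.int32, [batch_size, max_seq_length])

encoder = modeling.TextEncoder(config=model_config,
                               is_training=True,
                               input_ids=input_ids,
                               embedding_table=embedding_table,
                               input_mask=input_mask,
                               token_type_ids=segment_ids)

sequence_output = encoder.get_sequence_output()  # [batch_size, max_seq_length, hidden_size]
pooled_output = encoder.pooled_output            # [batch_size, hidden_size], sum-pooled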
Example #8
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    """Performs various post-processing on a word embedding tensor.

    Args:
        input_tensor: float Tensor of shape [batch_size, seq_length,
            embedding_size].
        use_token_type: bool. Whether to add embeddings for `token_type_ids`.
        token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
            Must be specified if `use_token_type` is True.
        token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
        token_type_embedding_name: string. The name of the embedding table variable
            for token type ids.
        use_position_embeddings: bool. Whether to add position embeddings for the
            position of each token in the sequence.
        position_embedding_name: string. The name of the embedding table variable
            for positional embeddings.
        initializer_range: float. Range of the weight initialization.
        max_position_embeddings: int. Maximum sequence length that might ever be
            used with this model. This can be longer than the sequence length of
            input_tensor, but cannot be shorter.
        dropout_prob: float. Dropout probability applied to the final output tensor.

    Returns:
        float tensor with same shape as `input_tensor`.

    Raises:
        ValueError: One of the tensor shapes or input values is invalid.
    """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            raise ValueError("`token_type_ids` must be specified if "
                             "`use_token_type` is True.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # This vocab will be small so we always do one-hot here, since it is always
        # faster for a small vocabulary.
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(flat_token_type_ids,
                                 depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        output += token_type_embeddings

    if use_position_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range))
            # Since the position embedding table is a learned variable, we create it
            # using a (long) sequence length `max_position_embeddings`. The actual
            # sequence length might be shorter than this, for faster training of
            # tasks that do not have long sequences.
            #
            # So `full_position_embeddings` is effectively an embedding table
            # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
            # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
            # perform a slice.
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
            num_dims = len(output.shape.as_list())

            # Only the last two dimensions are relevant (`seq_length` and `width`), so
            # we broadcast among the first dimensions, which is typically just
            # the batch size.
            position_broadcast_shape = []
            for _ in range(num_dims - 2):
                position_broadcast_shape.append(1)
            position_broadcast_shape.extend([seq_length, width])
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)
            output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
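
A hypothetical call mirroring how the TextEncoder in Example #7 feeds its embedding lookup through embedding_postprocessor; embedding_table and input_ids are assumed to exist, the helpers the function uses (get_shape_list, create_initializer, layer_norm_and_dropout) are assumed to be in scope, and the token-type and position settings are illustrative.

word_embeddings = tf.nn.embedding_lookup(params=embedding_table,
                                         ids=input_ids)        # [batch_size, seq_length, width]
token_type_ids = tf.zeros_like(input_ids)                      # single-segment input

embedding_output = embedding_postprocessor(input_tensor=word_embeddings,
                                           use_token_type=True,
                                           token_type_ids=token_type_ids,
                                           token_type_vocab_size=2,
                                           use_position_embeddings=True,
                                           max_position_embeddings=512,
                                           dropout_prob=0.1)
# embedding_output has the same shape as word_embeddings.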