Example #1
def test_warp():
    B, T, D = get_dim_vars('b t d')

    x: 'btd' = np.ones((B, T, D))

    # two view transformations (reshapes) in sequence
    x1 = warp(x, 'btd -> b,t,4,d//4 -> b*t,4,d//4', 'vv', debug=False)
    assert (x1.shape == (B * T, 4, D // 4))

    # four reshapes in sequence
    x2 = warp(x,
              'btd -> b,t,4,d//4 -> b*t,4,d//4 -> b,t,4,d//4 -> btd',
              'vvvv',
              debug=False)
    assert (x2.shape == (B, T, D))

    # The same reshape sequence in shorthand, specified as a list of transformations
    x2 = warp(x, [
        '__d -> ,,4,d//4', 'b,t,, -> b*t,,', 'b*t,, -> b,t,,',
        ',,4,d//4 -> ,,d'
    ],
              'vvvv',
              debug=True)
    assert (x2.shape == (B, T, D))

    print('test_warp: all assertions hold')
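
For comparison, the first warp call above ('btd -> b,t,4,d//4 -> b*t,4,d//4' with 'vv') chains two view (reshape) steps. A minimal plain-numpy sketch of the same sequence, with hypothetical concrete sizes standing in for the declared dim vars:

import numpy as np

B, T, D = 4, 8, 32                      # hypothetical sizes for illustration
x = np.ones((B, T, D))
x1 = x.reshape(B, T, 4, D // 4)         # btd -> b,t,4,d//4 : first view
x1 = x1.reshape(B * T, 4, D // 4)       # b,t,4,d//4 -> b*t,4,d//4 : second view
assert x1.shape == (B * T, 4, D // 4)
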
Example #2
def warp_long1():
    B, T, D, C = get_dim_vars('b t d c')
    x1: 'btd' = np.ones((B, T, D))
    x2: 'btd' = np.ones((B, T, D))
    x3: 'btd' = np.ones((B, T, D))
    y = warp([x1, x2, x3], '(btd)* -> btdc -> bdtc -> b,d//2,t*2,c', 'jpv')
    assert y.shape == (B, D // 2, T * 2, C)
    print('warp_long1: all assertions hold')
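
Reading the 'jpv' spec above as join, permute, view: the three 'btd' tensors are stacked along a new trailing axis, transposed, and reshaped. Assuming the join maps to np.stack and the permute to np.transpose, a plain-numpy sketch with hypothetical sizes (C equal to the number of joined tensors):

import numpy as np

B, T, D, C = 4, 8, 32, 3                # hypothetical sizes; C = number of joined tensors
x1 = x2 = x3 = np.ones((B, T, D))
y = np.stack([x1, x2, x3], axis=-1)     # (btd)* -> btdc : join
y = np.transpose(y, (0, 2, 1, 3))       # btdc -> bdtc : permute
y = y.reshape(B, D // 2, T * 2, C)      # bdtc -> b,d//2,t*2,c : view
assert y.shape == (B, D // 2, T * 2, C)
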
Example #3
def test_warp():
    B, T, D = get_dim_vars('b t d')
    x: 'btd' = np.ones((B, T, D))
    #x = warp(x, 'btd -> b,t,4,d//4 -> b*t,4,d//4', 'vv', debug=True)
    #assert(x.shape == (B*T,4,D//4))

    x = warp(x,
             'btd -> b,t,4,d//4 -> b*t,4,d//4 -> b,t,4,d//4 -> btd',
             'vvvv',
             debug=False)
    assert (x.shape == (B, T, D))

    import torch
    y: 'btd' = torch.randn(B, T, D)
    y = warp(y, 'btd -> b,t,4,d//4 -> b,4,t,d//4', 'vp', debug=False)
    assert (y.shape == (B, 4, T, D // 4))

    print('test_warp: all assertions hold')
Example #4
def test_warp_pytorch():
    B, T, D = get_dim_vars('b t d')

    import torch
    y: 'btd' = torch.randn(B, T, D)
    # a reshape followed by a permute
    y = warp(y, 'btd -> b,t,4,d//4 -> b,4,t,d//4', 'vp', debug=False)
    assert (y.shape == (B, 4, T, D // 4))

    print('test_warp_pytorch: all assertions hold')
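
The 'vp' spec chains a view (reshape) followed by a permute. A minimal raw-PyTorch sketch of the same two steps, with hypothetical concrete sizes:

import torch

B, T, D = 4, 8, 32                      # hypothetical sizes for illustration
y = torch.randn(B, T, D)
y = y.view(B, T, 4, D // 4)             # btd -> b,t,4,d//4 : view
y = y.permute(0, 2, 1, 3)               # b,t,4,d//4 -> b,4,t,d//4 : permute
assert y.shape == (B, 4, T, D // 4)
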
Example #5
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False):
    """Looks up words embeddings for id tensor.

  Args:
    input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
      ids.
    vocab_size: int. Size of the embedding vocabulary.
    embedding_size: int. Width of the word embeddings.
    initializer_range: float. Embedding initialization range.
    word_embedding_name: string. Name of the embedding table.
    use_one_hot_embeddings: bool. If True, use one-hot method for word
      embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better
      for TPUs.

  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size].
  """
    # This function assumes that the input is of shape [batch_size, seq_length,
    # num_inputs].
    #
    # If the input is a 2D tensor of shape [batch_size, seq_length], we
    # reshape to [batch_size, seq_length, 1].
    if input_ids.shape.ndims == 2:
        input_ids = tf.expand_dims(input_ids, axis=[-1])

    B, T, D = get_dim_vars('b t d')

    input_ids: 'bti'  # i: number of inputs per position
    # TODO: define/pick up i from input_ids
    i = get_shape_list(input_ids)[-1]

    embedding_table: 'vd' = tf.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        initializer=create_initializer(initializer_range))

    if use_one_hot_embeddings:
        flat_input_ids: 'b*t*i' = tf.reshape(input_ids, [-1])
        one_hot_input_ids: 'b*t*i,v' = tf.one_hot(flat_input_ids,
                                                  depth=vocab_size)
        output: 'b*t*i,d' = tf.matmul(one_hot_input_ids, embedding_table)
    else:
        output = tf.nn.embedding_lookup(embedding_table, input_ids)

    #input_shape: 'bti' = get_shape_list(input_ids)

    output: 'btd' = warp(output, tfms=f'b*t*{i},d -> b,t,d*{i}', tfm_names='r')

    return (output, embedding_table)
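
The one-hot branch above is where the shape annotations help most: ids are flattened to 'b*t*i', one-hot encoded against the vocabulary ('b*t*i,v'), multiplied into the embedding table ('b*t*i,d'), and finally reshaped back. A self-contained numpy sketch of that shape flow, with hypothetical sizes and a single id per position (i = 1):

import numpy as np

B, T, V, E = 2, 3, 10, 4                     # hypothetical batch, seq, vocab, embedding sizes
input_ids = np.random.randint(0, V, (B, T))  # int ids, shape 'bt'
embedding_table = np.random.randn(V, E)      # 'vd'

flat_ids = input_ids.reshape(-1)             # 'b*t'
one_hot = np.eye(V)[flat_ids]                # 'b*t,v'
output = one_hot @ embedding_table           # 'b*t,d'
output = output.reshape(B, T, E)             # 'btd'
assert output.shape == (B, T, E)
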
Example #6
def warp_long2():
    B, T, D, C = get_dim_vars('b t d c')
    x1: 'btd' = np.ones((B, T, D))
    y = warp(x1, 'btd -> btd1 -> bdt1 -> b,d//2,t*2,1', 'apv')
    assert y.shape == (B, D // 2, T * 2, 1)
    print('warp_long2: all assertions hold')
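
Assuming the 'a' step corresponds to adding a trailing singleton axis (as the 'btd -> btd1' spec suggests), the 'apv' pipeline above can be sketched in plain numpy with hypothetical sizes:

import numpy as np

B, T, D = 4, 8, 32                      # hypothetical sizes for illustration
x1 = np.ones((B, T, D))
y = x1[..., None]                       # btd -> btd1 : add a trailing singleton axis (assumption)
y = np.transpose(y, (0, 2, 1, 3))       # btd1 -> bdt1 : permute
y = y.reshape(B, D // 2, T * 2, 1)      # bdt1 -> b,d//2,t*2,1 : view
assert y.shape == (B, D // 2, T * 2, 1)
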
Example #7
def merge_heads(self, x: (B, H, T, D)):
    # pylint: disable=no-self-use
    res = warp(x, 'bhtd -> bthd -> b,t,h*d',
               'pcv')  # permute, then contiguous, then view transforms
    return res
Example #8
def merge_heads2(x: (B, H, T, D)):
    res: (B, T, H*D) = warp(x, 'bhtd -> bthd -> b,t,h*d', 'pcv', debug=False)
    return res
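
Both merge_heads variants above implement the standard head-merging step: permute 'bhtd' to 'bthd', make the result contiguous, then view it as 'b,t,h*d'. A minimal raw-PyTorch sketch with hypothetical sizes (d here is the per-head width):

import torch

B, H, T, D = 2, 4, 8, 16                # hypothetical sizes for illustration
x = torch.randn(B, H, T, D)
res = x.permute(0, 2, 1, 3).contiguous().view(B, T, H * D)   # bhtd -> bthd -> b,t,h*d
assert res.shape == (B, T, H * D)
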
Example #9
def transformer_model(input_tensor: 'btd',
                      attention_mask: 'btt' = None,
                      hidden_size: 'd' = 768,
                      num_hidden_layers: 'l' = 12,
                      num_attention_heads: 'h' = 4,
                      intermediate_size: 's' = 3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
    """Multi-headed, multi-layer Transformer from "Attention is All You Need".

  This is almost an exact implementation of the original Transformer encoder.

  See the original paper:
  https://arxiv.org/abs/1706.03762

  Also see:
  https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
    attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
      seq_length], with 1 for positions that can be attended to and 0 in
      positions that should not be.
    hidden_size: int. Hidden size of the Transformer.
    num_hidden_layers: int. Number of layers (blocks) in the Transformer.
    num_attention_heads: int. Number of attention heads in the Transformer.
    intermediate_size: int. The size of the "intermediate" (a.k.a., feed
      forward) layer.
    intermediate_act_fn: function. The non-linear activation function to apply
      to the output of the intermediate/feed-forward layer.
    hidden_dropout_prob: float. Dropout probability for the hidden layers.
    attention_probs_dropout_prob: float. Dropout probability of the attention
      probabilities.
    initializer_range: float. Range of the initializer (stddev of truncated
      normal).
    do_return_all_layers: Whether to also return all layers or just the final
      layer.

  Returns:
    float Tensor of shape [batch_size, seq_length, hidden_size], the final
    hidden layer of the Transformer.

  Raises:
    ValueError: A Tensor shape or parameter is invalid.
  """
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads))

    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    #batch_size = input_shape[0]
    #seq_length = input_shape[1]
    #input_width = input_shape[2]

    B, T, D = input_shape
    batch_size, seq_length, input_width = B, T, D

    # The Transformer adds (sums) residuals across all layers, so the input
    # width needs to be the same as the hidden size.
    if D != hidden_size:
        raise ValueError(
            "The width of the input tensor (%d) != hidden size (%d)" %
            (D, hidden_size))

    # We keep the representation as a 2D tensor to avoid re-shaping it back and
    # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
    # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
    # help the optimizer.
    #prev_output: 'b*t,d' = reshape_to_matrix(input_tensor)
    prev_output: 'b*t,d' = warp(input_tensor, 'btd -> b*t,d', 'v')
    size_assert(get_shape_list(prev_output), (B * T, D))

    all_layer_outputs: '(b*t,d)*' = []
    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope("layer_%d" % layer_idx):
            layer_input: 'b*t,d' = prev_output

            with tf.variable_scope("attention"):
                attention_heads: '(b*t,d)*' = []
                with tf.variable_scope("self"):
                    attention_head: 'b*t,d' = attention_layer(
                        from_tensor=layer_input,
                        to_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=
                        attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        do_return_2d_tensor=True,
                        batch_size=batch_size,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length)
                    attention_heads.append(attention_head)

                attention_output = None
                if len(attention_heads) == 1:
                    attention_output = attention_heads[0]
                else:
                    # In the case where we have other sequences, we just concatenate
                    # them to the self-attention head before the projection.
                    attention_output: 'b*t,d' = tf.concat(attention_heads,
                                                          axis=-1)

                # Run a linear projection of `hidden_size` then add a residual
                # with `layer_input`.
                with tf.variable_scope("output"):
                    attention_output: 'b*t,d' = tf.layers.dense(
                        attention_output,
                        hidden_size,
                        kernel_initializer=create_initializer(
                            initializer_range))
                    attention_output = dropout(attention_output,
                                               hidden_dropout_prob)
                    attention_output = layer_norm(attention_output +
                                                  layer_input)

            # The activation is only applied to the "intermediate" hidden layer.
            with tf.variable_scope("intermediate"):
                intermediate_output: 'b*t,s' = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=create_initializer(initializer_range))

            # Down-project back to `hidden_size` then add the residual.
            with tf.variable_scope("output"):
                layer_output: 'b*t,d' = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=create_initializer(initializer_range))
                layer_output = dropout(layer_output, hidden_dropout_prob)
                layer_output = layer_norm(layer_output + attention_output)
                prev_output: 'b*t,d' = layer_output
                all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        final_outputs: '(btd)*' = []
        for layer_output in all_layer_outputs:
            #final_output: 'btd' = reshape_from_matrix(layer_output, input_shape)
            final_output: 'btd' = warp(layer_output, 'b*t,d -> btd', 'r')
            final_outputs.append(final_output)
        return final_outputs
    else:
        final_output: 'btd' = warp(prev_output, 'b*t,d -> btd', 'r')
        return final_output
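
The comment above about keeping a 2D representation is why every per-layer annotation reads 'b*t,d': the tensor is flattened once before the layer loop and only restored to 'btd' when returning. A tiny numpy sketch of that round trip, with hypothetical sizes:

import numpy as np

B, T, D = 2, 8, 16                      # hypothetical sizes for illustration
x = np.ones((B, T, D))
x2d = x.reshape(B * T, D)               # 'btd -> b*t,d' : flatten once up front
# ... per-layer dense/residual ops would operate on the 2D view here ...
x3d = x2d.reshape(B, T, D)              # 'b*t,d -> btd' : restore only at the end
assert x3d.shape == (B, T, D)
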
Example #10
def attention_layer(from_tensor: 'b*t,d',
                    to_tensor: 'b*t,d',
                    attention_mask: 'b,t,t' = None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
    """Performs multi-headed attention from `from_tensor` to `to_tensor`.

  This is an implementation of multi-headed attention based on "Attention
  is all you Need". If `from_tensor` and `to_tensor` are the same, then
  this is self-attention. Each timestep in `from_tensor` attends to the
  corresponding sequence in `to_tensor`, and returns a fixed-width vector.

  This function first projects `from_tensor` into a "query" tensor and
  `to_tensor` into "key" and "value" tensors. These are (effectively) a list
  of tensors of length `num_attention_heads`, where each tensor is of shape
  [batch_size, seq_length, size_per_head].

  Then, the query and key tensors are dot-producted and scaled. These are
  softmaxed to obtain attention probabilities. The value tensors are then
  interpolated by these probabilities, then concatenated back to a single
  tensor and returned.

  In practice, the multi-headed attention is done with transposes and
  reshapes rather than with actual separate tensors.

  Args:
    from_tensor: float Tensor of shape [batch_size, from_seq_length,
      from_width].
    to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
    attention_mask: (optional) int32 Tensor of shape [batch_size,
      from_seq_length, to_seq_length]. The values should be 1 or 0. The
      attention scores will effectively be set to -infinity for any positions in
      the mask that are 0, and will be unchanged for positions that are 1.
    num_attention_heads: int. Number of attention heads.
    size_per_head: int. Size of each attention head.
    query_act: (optional) Activation function for the query transform.
    key_act: (optional) Activation function for the key transform.
    value_act: (optional) Activation function for the value transform.
    attention_probs_dropout_prob: (optional) float. Dropout probability of the
      attention probabilities.
    initializer_range: float. Range of the weight initializer.
    do_return_2d_tensor: bool. If True, the output will be of shape [batch_size
      * from_seq_length, num_attention_heads * size_per_head]. If False, the
      output will be of shape [batch_size, from_seq_length, num_attention_heads
      * size_per_head].
    batch_size: (Optional) int. If the input is 2D, this might be the batch size
      of the 3D version of the `from_tensor` and `to_tensor`.
    from_seq_length: (Optional) If the input is 2D, this might be the seq length
      of the 3D version of the `from_tensor`.
    to_seq_length: (Optional) If the input is 2D, this might be the seq length
      of the 3D version of the `to_tensor`.

  Returns:
    float Tensor of shape [batch_size, from_seq_length,
      num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
      true, this will be of shape [batch_size * from_seq_length,
      num_attention_heads * size_per_head]).

  Raises:
    ValueError: Any of the arguments or tensor shapes are invalid.
  """

    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")

    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None
                or to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")

    # Scalar dimensions referenced here:
    #   B(b) = batch size (number of sequences)
    #   F(f) = `from_tensor` sequence length
    #   T(t) = `to_tensor` sequence length
    #   N(n) = `num_attention_heads`
    #   H(h) = `size_per_head`

    #from_tensor_2d: 'b*t,d' = reshape_to_matrix(from_tensor)
    #to_tensor_2d: 'b*t,d' = reshape_to_matrix(to_tensor)

    from_tensor_2d: 'b*t,d' = from_tensor
    to_tensor_2d: 'b*t,d' = to_tensor

    query_layer: 'b*t,d' = tf.layers.dense(
        from_tensor_2d,
        num_attention_heads * size_per_head,
        activation=query_act,
        name="query",
        kernel_initializer=create_initializer(initializer_range))

    key_layer: 'b*t,d' = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=key_act,
        name="key",
        kernel_initializer=create_initializer(initializer_range))

    value_layer: 'b*t,d' = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=value_act,
        name="value",
        kernel_initializer=create_initializer(initializer_range))

    query_layer: 'bnth' = warp(query_layer, 'b*t,d -> btnh -> bnth', 'vp')
    key_layer: 'bnth' = warp(key_layer, 'b*t,d -> btnh -> bnth', 'vp')

    # Take the dot product between "query" and "key" to get the raw
    # attention scores.
    attention_scores: 'bntt' = tf.matmul(query_layer,
                                         key_layer,
                                         transpose_b=True)
    attention_scores: 'bntt' = tf.multiply(
        attention_scores, 1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None:
        # `attention_mask` = [B, 1, F, T]
        #attention_mask = tf.expand_dims(attention_mask, axis=[1])
        attention_mask = alignto((attention_mask, 'btt'), 'bntt')

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        adder: 'bntt' = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        attention_scores += adder

    # Normalize the attention scores to probabilities.
    # `attention_probs` = [B, N, F, T]
    attention_probs: 'bntt' = tf.nn.softmax(attention_scores)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs: 'bntt' = dropout(attention_probs,
                                      attention_probs_dropout_prob)

    value_layer: 'bnth' = warp(value_layer, 'b*t,n*h -> btnh -> bnth', 'vp')

    context_layer: 'bnth' = tf.matmul(
        attention_probs, value_layer)  #bntt,bnth->bnth OR ___t,__t_

    if do_return_2d_tensor:
        context_layer = warp(context_layer, 'bnth->btnh->b*t,n*h', 'pv')
    else:
        context_layer = warp(context_layer, 'bnth->btnh->b,t,n*h', 'pv')

    return context_layer
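
To summarize the shape flow the annotations above describe, here is a self-contained numpy sketch of the core scaled dot-product step and the final head merge (hypothetical sizes; naive softmax, no mask or dropout):

import numpy as np

B, N, T, H = 2, 4, 8, 16                # hypothetical batch, heads, seq length, size per head
q = np.random.randn(B, N, T, H)         # 'bnth'
k = np.random.randn(B, N, T, H)         # 'bnth'
v = np.random.randn(B, N, T, H)         # 'bnth'

scores = q @ np.swapaxes(k, -1, -2) / np.sqrt(H)                     # 'bnth','bnht' -> 'bntt'
probs = np.exp(scores) / np.exp(scores).sum(axis=-1, keepdims=True)  # softmax over the last 't' axis
context = probs @ v                                                  # 'bntt','bnth' -> 'bnth'
context = np.transpose(context, (0, 2, 1, 3)).reshape(B, T, N * H)   # 'bnth -> btnh -> b,t,n*h'
assert context.shape == (B, T, N * H)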