Example #1
def EncoderBlock(d_model, d_ff, n_heads, dropout, layer_idx, mode):
    """Returns a layer sequence that implements a Transformer encoder block.

  The input to the layer sequence is a pair, (activations, mask), where the
  mask was created from the original source tokens to prevent attending to the
  padding part of the input.

  Args:
    d_model: int:  depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    layer_idx: int: index of this layer (for bookkeeping)
    mode: str: 'train' or 'eval'

  Returns:
    A sequence of layers that maps an (activations, mask) pair to an
    (activations, mask) pair.
  """
    attention = [
        tl.LayerNorm(),
        tl.Attention(d_model, n_heads=n_heads, dropout=dropout, mode=mode),
        tl.Dropout(rate=dropout, name='enc_attn_dropout', mode=mode),
    ]
    feed_forward = [
        FeedForward(d_model, d_ff, dropout, layer_idx=layer_idx, mode=mode),
    ]
    return tl.Serial(
        tl.Residual(attention),
        tl.Residual(feed_forward),
    )
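Because this block maps an (activations, mask) pair back to an (activations, mask) pair, several of them can be chained directly. A minimal usage sketch, assuming trax is installed and the FeedForward helper used above is defined in the same module; all hyperparameter values are illustrative:

from trax import layers as tl

# Stack a few encoder blocks; each block consumes and produces an
# (activations, mask) pair, so they compose directly inside tl.Serial.
encoder = tl.Serial(
    [EncoderBlock(d_model=512, d_ff=2048, n_heads=8,
                  dropout=0.1, layer_idx=i, mode='train')
     for i in range(6)],
    tl.LayerNorm(),  # applies to the activations on top of the stack
)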
Example #2
def EncoderBlock(d_model,
                 d_ff,
                 n_heads,
                 dropout,
                 dropout_shared_axes,
                 mode,
                 ff_activation,
                 FeedForwardBlock=FeedForwardBlock):
    """
    Returns a list of layers that implements a Transformer encoder block.
    The input to the layer is a pair, (activations, mask), where the mask was
    created from the original source tokens to prevent attending to the padding
    part of the input.

    Args:
        d_model (int): depth of embedding.
        d_ff (int): depth of feed-forward layer.
        n_heads (int): number of attention heads.
        dropout (float): dropout rate (how much to drop out).
        dropout_shared_axes (tuple): axes on which to share the dropout mask.
        mode (str): 'train' or 'eval'.
        ff_activation (function): the non-linearity in feed-forward layer.
        FeedForwardBlock (function): A function that returns the feed forward block.
    Returns:
        list: A list of layers that maps (activations, mask) to (activations, mask).

    """

    # Attention block
    attention = tl.Attention(
        # dimension of the model
        d_feature=d_model,
        # number of attention heads
        n_heads=n_heads,
        # dropout rate
        dropout=dropout,
        # 'train' or 'eval'
        mode=mode)

    # Call the `FeedForwardBlock` function defined earlier
    feed_forward = FeedForwardBlock(d_model, d_ff, dropout,
                                    dropout_shared_axes, mode, ff_activation)

    # Dropout block
    dropout_ = tl.Dropout(rate=dropout,
                          shared_axes=dropout_shared_axes,
                          mode=mode)

    encoder_block = [
        # `Residual` layer
        tl.Residual(
            tl.LayerNorm(),
            attention,
            dropout_,
        ),
        tl.Residual(feed_forward),
    ]
    return encoder_block
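This version returns a plain list of layers rather than a combined layer, so the list is typically passed into tl.Serial, which flattens nested lists of layers. A rough sketch under that assumption, with placeholder hyperparameters and the module's FeedForwardBlock helper assumed to be available:

from trax import layers as tl

n_layers = 2  # illustrative
encoder_blocks = [
    EncoderBlock(d_model=512, d_ff=2048, n_heads=8, dropout=0.1,
                 dropout_shared_axes=None, mode='train',
                 ff_activation=tl.Relu)
    for _ in range(n_layers)
]
# tl.Serial flattens the nested lists into a single sequence of layers.
encoder = tl.Serial(encoder_blocks, tl.LayerNorm())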
Example #3
def _EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode,
                  ff_activation):
    """Returns a list of layers that implements a Transformer encoder block.

  The input to the block is a pair, (activations, mask), where the mask was
  created from the original source tokens to prevent attending to the padding
  part of the input.

  Args:
    d_model: Final dimension of tensors at most points in the model, including
        the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    n_heads: Number of attention heads.
    dropout: Stochastic rate (probability) for dropping an activation value
        when applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask.
        Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is
        a useful way to save memory and apply consistent masks to activation
        vectors at different sequence positions.
    mode: If `'train'`, each block will include dropout; else, it will
        pass all values through unaltered.
    ff_activation: Type of activation function at the end of each block; must
        be an activation-type subclass of `Layer`.

  Returns:
    A list of layers that maps (activations, mask) to (activations, mask).
  """
    attention = tl.Attention(d_model,
                             n_heads=n_heads,
                             dropout=dropout,
                             mode=mode)

    feed_forward = _FeedForwardBlock(d_model, d_ff, dropout,
                                     dropout_shared_axes, mode, ff_activation)

    dropout_ = tl.Dropout(rate=dropout,
                          shared_axes=dropout_shared_axes,
                          mode=mode)

    return [
        tl.Residual(
            tl.LayerNorm(),
            attention,
            dropout_,
        ),
        tl.Residual(feed_forward),
    ]
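The docstring's note on dropout_shared_axes is the main difference from plain per-element dropout: with dropout_shared_axes=(0, 1), a single mask of shape (1, 1, d_model) is drawn and broadcast over the batch and sequence axes, so the same features are dropped at every position. An illustrative construction, assuming the surrounding trax.models.transformer context (values are placeholders, not recommendations):

from trax import layers as tl

# Share the dropout mask across batch (axis 0) and sequence (axis 1) to save
# memory and apply a consistent mask at every sequence position.
block = _EncoderBlock(d_model=256, d_ff=1024, n_heads=4, dropout=0.1,
                      dropout_shared_axes=(0, 1), mode='train',
                      ff_activation=tl.Relu)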
Example #4
File: rezero.py  Project: stephenjfox/trax
def _EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes, mode,
                  ff_activation):
    """Returns a list of layers that implements a Transformer encoder block.

  The input to the layer is a pair, (activations, mask), where the mask was
  created from the original source tokens to prevent attending to the padding
  part of the input.

  Args:
    d_model: int:  depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    dropout_shared_axes: axes on which to share dropout mask
    mode: str: 'train' or 'eval'
    ff_activation: the non-linearity in feed-forward layer

  Returns:
    A list of layers that maps (activations, mask) to (activations, mask).
  """
    attention = tl.Attention(d_model,
                             n_heads=n_heads,
                             dropout=dropout,
                             mode=mode)

    feed_forward = _FeedForwardBlock(d_model, d_ff, dropout,
                                     dropout_shared_axes, mode, ff_activation)

    dropout_ = tl.Dropout(rate=dropout,
                          shared_axes=dropout_shared_axes,
                          mode=mode)

    return [
        ResidualZero(
            tl.LayerNorm(),
            attention,
            dropout_,
        ),
        ResidualZero(
            tl.LayerNorm(),
            feed_forward,
            dropout_,
        ),
    ]
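This variant replaces tl.Residual with ResidualZero and, unlike the standard block above, also wraps the feed-forward branch with LayerNorm and dropout. Following the ReZero idea, the sublayer output in such a residual is scaled by a learned scalar initialized to zero, so every block starts out as the identity map. A conceptual NumPy sketch of that idea (an illustration only, not the trax ResidualZero implementation):

import numpy as np

def rezero_residual(x, sublayer, alpha):
    # ReZero idea: out = x + alpha * sublayer(x), with alpha trained from 0.
    return x + alpha * sublayer(x)

x = np.random.randn(2, 4, 8)  # (batch, seq_len, d_model), illustrative shapes
y = rezero_residual(x, lambda t: 2.0 * t, alpha=0.0)
assert np.allclose(y, x)      # with alpha == 0 the block acts as the identity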
Example #5
File: reformer.py  Project: qsays/trax
def EncoderBlock(d_model, d_ff, n_heads, dropout, ff_activation, mode):
    """Returns a list of layers that implements a Reformer encoder block.

  The input to the layer is a pair, (activations, mask), where the mask was
  created from the original source tokens to prevent attending to the padding
  part of the input.

  Args:
    d_model: int:  depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    ff_activation: the non-linearity in feed-forward layer
    mode: str: 'train' or 'eval'

  Returns:
    A list of layers that maps (activations, mask) to (activations, mask).
  """
    pre_attention = tl.LayerNorm()
    attention = tl.Attention(d_model,
                             n_heads=n_heads,
                             dropout=dropout,
                             mode=mode)
    post_attention = tl.Dropout(rate=dropout,
                                name='dropout_enc_attn',
                                mode=mode)

    # TODO(kitaev): Switch to FeedForward with BroadcastedDropout?
    feed_forward = transformer._FeedForwardBlock(  # pylint: disable=protected-access
        d_model, d_ff, dropout, -1, mode, ff_activation)
    # feed_forward = FeedForward(d_model, d_ff, dropout, ff_activation, mode)

    return [
        # TODO(kitaev): consider ReversibleAttentionHalfResidual for efficiency
        ReversibleHalfResidual([pre_attention, attention, post_attention]),
        tl.ReversibleSwap(),
        ReversibleHalfResidual(feed_forward),
        tl.ReversibleSwap(),
    ]
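The ReversibleHalfResidual / ReversibleSwap pairs implement RevNet-style coupling: the activations can be recomputed exactly during the backward pass instead of being stored, which is what makes the Reformer encoder memory-efficient. A pure-Python sketch of that coupling and its inverse (conceptual only, not the trax internals):

import numpy as np

def rev_forward(x1, x2, f, g):
    y1 = x1 + f(x2)   # first half-residual (e.g. the attention sub-block)
    y2 = x2 + g(y1)   # second half-residual (e.g. the feed-forward sub-block)
    return y1, y2

def rev_inverse(y1, y2, f, g):
    x2 = y2 - g(y1)   # both inputs are recoverable from the outputs alone
    x1 = y1 - f(x2)
    return x1, x2

f = lambda t: np.tanh(t)          # stand-in for the attention branch
g = lambda t: np.maximum(t, 0.0)  # stand-in for the feed-forward branch
x1, x2 = np.random.randn(2, 8), np.random.randn(2, 8)
x1_rec, x2_rec = rev_inverse(*rev_forward(x1, x2, f, g), f, g)
assert np.allclose(x1_rec, x1) and np.allclose(x2_rec, x2)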
Example #6
# Nested helper: builds a fresh attention layer, closing over d_model,
# n_heads, dropout, and mode from the enclosing function's arguments.
def _Attention():
    return tl.Attention(d_model,
                        n_heads=n_heads,
                        dropout=dropout,
                        mode=mode)
Example #7
File: qa.py  Project: pererasys/trax-nlp
def EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                  mode, ff_activation, FeedForwardBlock=FeedForwardBlock):
    """
    Returns a list of layers that implements a Transformer encoder block.
    The input to the layer is a pair, (activations, mask), where the mask was
    created from the original source tokens to prevent attending to the padding
    part of the input.
    
    Args:
        d_model (int): depth of embedding.
        d_ff (int): depth of feed-forward layer.
        n_heads (int): number of attention heads.
        dropout (float): dropout rate (how much to drop out).
        dropout_shared_axes (tuple): axes on which to share the dropout mask.
        mode (str): 'train' or 'eval'.
        ff_activation (function): the non-linearity in feed-forward layer.
        FeedForwardBlock (function): A function that returns the feed forward block.
    Returns:
        list: A list of layers that maps (activations, mask) to (activations, mask).
        
    """
    
    ### START CODE HERE (REPLACE INSTANCES OF 'None' WITH YOUR CODE) ###
    
    # Attention block
    attention = tl.Attention( 
        # Use dimension of the model
        d_feature=d_model,
        # Set it equal to number of attention heads
        n_heads=n_heads,
        # Set it equal to `dropout`
        dropout=dropout,
        # Set it equal to `mode`
        mode=mode
    )
    
    # Call the function `FeedForwardBlock` (implemented before) and pass in the parameters
    feed_forward = FeedForwardBlock( 
        d_model, d_ff, dropout, dropout_shared_axes, mode, ff_activation
    )
    
    # Dropout block
    dropout_ = tl.Dropout( 
        # set it equal to `dropout`
        rate=dropout,
        # set it equal to the axes on which to share dropout mask
        shared_axes=dropout_shared_axes,
        # set it equal to `mode`
        mode=mode
    )
    
    encoder_block = [ 
        # add `Residual` layer
        tl.Residual(
            # add norm layer
            tl.LayerNorm(),
            # add attention
            attention,
            # add dropout
            dropout_,
        ),
        # add another `Residual` layer
        tl.Residual(
            # add feed forward
            feed_forward,
        ),
    ]
    
    ### END CODE HERE ###
    
    return encoder_block
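A quick sanity check one might run after filling in the cell above; the exact layer reprs depend on the trax version, so the expected values in the comments are only indicative:

from trax import layers as tl

block = EncoderBlock(d_model=512, d_ff=2048, n_heads=8, dropout=0.1,
                     dropout_shared_axes=None, mode='train',
                     ff_activation=tl.Relu)
print(type(block))   # <class 'list'>
print(len(block))    # 2: one Residual around attention, one around feed-forward
for layer in block:
    print(layer)     # repr of each Residual (a Serial-based combinator in trax)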