def _FunnelBlock(d_model, d_ff, n_heads,
                 dropout, dropout_shared_axes, mode, ff_activation,
                 pool_layer, pool_size, strides, separate_cls):
  """Internal funnel block. Returns a list of layers implementing it.

  The input is an activation tensor.

  Args:
    d_model: Final dimension of tensors at most points in the model, including
        the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    n_heads: Number of attention heads.
    dropout: Stochastic rate (probability) for dropping an activation value
        when applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
        along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
        way to save memory and apply consistent masks to activation vectors at
        different sequence positions.
    mode: If `'train'`, each block will include dropout; else, it will pass all
        values through unaltered.
    ff_activation: Type of activation function at the end of each block; must
        be an activation-type subclass of `Layer`.
    pool_layer: Type of pooling layer used for downsampling; should be
        `tl.AvgPool` or `tl.MaxPool`.
    pool_size: Shape of window that gets reduced to a single vector value. If
        the layer inputs are :math:`n`-dimensional arrays, then `pool_size`
        must be a tuple of length :math:`n-2`.
    strides: Offsets from the location of one window to the locations of
        neighboring windows along each axis. If specified, must be a tuple of
        the same length as `pool_size`. If None, then offsets of 1 along each
        window axis, :math:`(1, ..., 1)`, will be used.
    separate_cls: If `True`, pooling in funnel blocks is not applied to
        embeddings of the first token (`cls` from BERT paper).

  Returns:
    A list of layers that maps (activations, mask) to (activations', mask').
  """
  pooling = PoolLayer(pool_layer, pool_size, strides, separate_cls)
  mask_pooling = MaskPool(pool_size, strides, separate_cls)
  attention = tl.AttentionQKV(
      d_model, n_heads=n_heads, dropout=dropout, mode=mode)
  hidden_dropout = tl.Dropout(
      rate=dropout, shared_axes=dropout_shared_axes, mode=mode)
  feed_forward = _FeedForwardBlock(
      d_model, d_ff, dropout, dropout_shared_axes, mode, ff_activation)

  return [                                  # h, mask
      tl.LayerNorm(),                       # h, mask
      tl.Branch(pooling, None),             # h', h, mask
      tl.Residual(
          tl.Select([0, 1, 1, 2]),          # h', h, h, mask
          attention,                        # attn, mask
          tl.Parallel(None, mask_pooling),  # attn, mask'
          hidden_dropout,                   # attn, mask'
      ),                                    # funnel_activations, mask'
      tl.Residual(feed_forward),
  ]
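# Usage sketch (illustrative, not part of the library): wiring one funnel
# block into a runnable model. The hyperparameters below are assumptions
# chosen for the example, not recommended settings.
def _FunnelBlockExample():  # hypothetical helper, for illustration only
  block = _FunnelBlock(
      d_model=512, d_ff=2048, n_heads=8,
      dropout=0.1, dropout_shared_axes=None, mode='train',
      ff_activation=tl.Relu,
      pool_layer=tl.AvgPool, pool_size=(2,), strides=(2,),
      separate_cls=True)
  # `tl.Serial` flattens the list of layers returned by the builder. The
  # resulting model expects (activations, mask) and halves the length axis.
  return tl.Serial(block)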
def _FunnelRelativeDecoderBlock(d_model, d_ff, n_heads, dropout,
                                dropout_shared_axes, mode, ff_activation,
                                total_pooling, shorten_factor, resampler_fn):
  """Returns a list of layers that implements a funnel decoder block.

  The input is an activation tensor.

  Args:
    d_model: Final dimension of tensors at most points in the model, including
        the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    n_heads: Number of attention heads.
    dropout: Stochastic rate (probability) for dropping an activation value
        when applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
        along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
        way to save memory and apply consistent masks to activation vectors at
        different sequence positions.
    mode: If `'train'`, each block will include dropout; else, it will pass all
        values through unaltered.
    ff_activation: Type of activation function at the end of each block; must
        be an activation-type subclass of `Layer`.
    total_pooling: The combined pool size of previously used funnel blocks.
    shorten_factor: By how much to shorten (downsample) or upsample at this
        funnel block.
    resampler_fn: Factory for the layer that performs funnel
        upsampling/downsampling; callable with signature
        `(shorten_factor, d_model)`; must return an activation-type subclass
        of `Layer`.

  Returns:
    A list of layers that maps an activation tensor to an activation tensor.
  """
  resampler = resampler_fn(shorten_factor, d_model)

  attention = RelativeAttentionLMLayer(
      d_model, total_pooling,
      n_heads=n_heads, dropout=dropout, mode=mode)

  feed_forward = _FeedForwardBlock(
      d_model, d_ff, dropout, dropout_shared_axes, mode, ff_activation)

  dropout_ = tl.Dropout(
      rate=dropout, shared_axes=dropout_shared_axes, mode=mode)

  return [
      tl.LayerNorm(),            # h
      tl.Branch(tl.Serial(
          resampler,
          tl.LayerNorm(),
      ), None),                  # h', h
      tl.Residual(
          tl.Select([0, 1, 1]),  # h', h, h
          attention,
          dropout_,
      ),
      tl.Residual(
          feed_forward,
      ),
  ]
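# Usage sketch (illustrative only): a funnel decoder block that shortens the
# sequence 2x. `_AvgPoolResampler` is a hypothetical stand-in obeying the
# `(shorten_factor, d_model) -> Layer` contract; a real model may use a
# learned resampling layer instead.
def _AvgPoolResampler(shorten_factor, d_model):
  del d_model  # unused by this simple pooling stand-in
  return tl.AvgPool(pool_size=(shorten_factor,), strides=(shorten_factor,))


def _FunnelRelativeDecoderBlockExample():
  return tl.Serial(_FunnelRelativeDecoderBlock(
      d_model=512, d_ff=2048, n_heads=8, dropout=0.1,
      dropout_shared_axes=None, mode='train', ff_activation=tl.Relu,
      total_pooling=1, shorten_factor=2, resampler_fn=_AvgPoolResampler))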
def _RelativeDecoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                          mode, ff_activation, context_bias_layer,
                          location_bias_layer, total_pooling):
  """Returns a list of layers that implements a Transformer decoder block.

  The input to the block is an activation tensor. Attention here is causal
  (left-to-right) relative attention, so no padding mask is consumed.

  Args:
    d_model: Final dimension of tensors at most points in the model, including
        the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    n_heads: Number of attention heads.
    dropout: Stochastic rate (probability) for dropping an activation value
        when applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
        along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
        way to save memory and apply consistent masks to activation vectors at
        different sequence positions.
    mode: If `'train'`, each block will include dropout; else, it will pass all
        values through unaltered.
    ff_activation: Type of activation function at the end of each block; must
        be an activation-type subclass of `Layer`.
    context_bias_layer: Global context bias from Transformer XL's attention.
    location_bias_layer: Global location bias from Transformer XL's attention.
    total_pooling: The combined pool size of previously used funnel blocks.

  Returns:
    A list of layers that maps an activation tensor to an activation tensor.
  """
  attention = RelativeAttentionLMLayer(
      d_model, context_bias_layer, location_bias_layer,
      total_pooling,
      n_heads=n_heads, dropout=dropout, mode=mode)

  feed_forward = _FeedForwardBlock(
      d_model, d_ff, dropout, dropout_shared_axes, mode, ff_activation)

  dropout_ = tl.Dropout(
      rate=dropout, shared_axes=dropout_shared_axes, mode=mode)

  return [
      tl.Residual(             # vecs
          tl.LayerNorm(),
          tl.Select([0, 0, 0]),
          attention,
          dropout_,
      ),                       # vecs
      tl.Residual(
          feed_forward,
      ),                       # vecs
  ]
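# Usage sketch (illustrative only): stacking several decoder blocks that
# share one pair of global bias layers, as in Transformer XL. The bias
# layers are assumed to be created by the surrounding model code; the
# hyperparameters are placeholders.
def _RelativeDecoderStackExample(context_bias_layer, location_bias_layer,
                                 n_layers=2):
  blocks = []
  for _ in range(n_layers):
    blocks += _RelativeDecoderBlock(
        d_model=512, d_ff=2048, n_heads=8, dropout=0.1,
        dropout_shared_axes=None, mode='train', ff_activation=tl.Relu,
        context_bias_layer=context_bias_layer,
        location_bias_layer=location_bias_layer,
        total_pooling=1)
  return tl.Serial(blocks)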
def EncoderBlock(d_model, d_ff, n_heads, dropout, ff_activation, mode):
  """Returns a list of layers that implements a Reformer encoder block.

  The input to the layer is a pair, (activations, mask), where the mask was
  created from the original source tokens to prevent attending to the padding
  part of the input.

  Args:
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    ff_activation: the non-linearity in feed-forward layer
    mode: str: 'train' or 'eval'

  Returns:
    A list of layers that maps (activations, mask) to (activations, mask).
  """
  attention = tl.SelfAttention(
      n_heads=n_heads, d_qk=d_model//n_heads, d_v=d_model//n_heads,
      masked=True,
      attention_dropout=0.0,  # TODO(kitaev): attention dropout
      mode=mode)
  attention_half_residual = ReversibleHalfResidualV2(
      tl.LayerNorm(),
      attention_layer=attention,
      # TODO(kitaev): add output dropout to attention layer.
      rate=dropout,
  )

  # TODO(kitaev): Switch to FeedForward with BroadcastedDropout?
  feed_forward = transformer._FeedForwardBlock(  # pylint: disable=protected-access
      d_model, d_ff, dropout, -1, mode, ff_activation)
  # feed_forward = FeedForward(d_model, d_ff, dropout, ff_activation, mode)

  return [
      attention_half_residual,
      tl.ReversibleSwap(),
      ReversibleHalfResidualV2(feed_forward),
      tl.ReversibleSwap(),
  ]
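# Usage sketch (illustrative only): Reformer runs these reversible blocks
# inside `tl.ReversibleSerial`, duplicating the activation stream on the way
# in and concatenating the two streams on the way out. Hyperparameters here
# are assumptions for the example.
def _ReversibleEncoderExample():
  return tl.Serial(
      tl.Dup(),  # (x, x, mask): reversible layers operate on two streams
      tl.ReversibleSerial(
          EncoderBlock(d_model=512, d_ff=2048, n_heads=8, dropout=0.1,
                       ff_activation=tl.Relu, mode='train')),
      tl.Concatenate(),  # merge the two streams back into one tensor
  )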
def EncoderBlock(d_model, d_ff, n_heads, dropout, ff_activation, mode):
  """Returns a list of layers that implements a Reformer encoder block.

  The input to the layer is a pair, (activations, mask), where the mask was
  created from the original source tokens to prevent attending to the padding
  part of the input.

  Args:
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    ff_activation: the non-linearity in feed-forward layer
    mode: str: 'train' or 'eval'

  Returns:
    A list of layers that maps (activations, mask) to (activations, mask).
  """
  pre_attention = tl.LayerNorm()
  attention = tl.Attention(d_model, n_heads=n_heads, dropout=dropout,
                           mode=mode)
  post_attention = tl.Dropout(
      rate=dropout, name='dropout_enc_attn', mode=mode)

  # TODO(kitaev): Switch to FeedForward with BroadcastedDropout?
  feed_forward = transformer._FeedForwardBlock(  # pylint: disable=protected-access
      d_model, d_ff, dropout, -1, mode, ff_activation)
  # feed_forward = FeedForward(d_model, d_ff, dropout, ff_activation, mode)

  return [
      # TODO(kitaev): consider ReversibleAttentionHalfResidual for efficiency
      ReversibleHalfResidual([pre_attention, attention, post_attention]),
      tl.ReversibleSwap(),
      ReversibleHalfResidual(feed_forward),
      tl.ReversibleSwap(),
  ]
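# Minimal sketch (plain Python, illustrative) of the reversible-residual math
# behind the half-residual/swap pattern above: inputs can be recomputed from
# outputs, so the forward pass need not store intermediate activations.
def _rev_block_forward(x1, x2, f, g):
  y1 = x1 + f(x2)  # first half residual, then swap
  y2 = x2 + g(y1)  # second half residual, then swap back
  return y1, y2


def _rev_block_inverse(y1, y2, f, g):
  x2 = y2 - g(y1)  # invert the second half residual
  x1 = y1 - f(x2)  # invert the first half residual
  return x1, x2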