def test_names(self):
  layer = tl.LSTM(3)
  self.assertEqual('LSTM_3', str(layer))
  layer = tl.GRU(5)
  self.assertEqual('GRU_5', str(layer))
  layer = tl.SRU(7)
  self.assertEqual('SRU_7', str(layer))
def test_sru(self, backend):
  with fastmath.use_backend(backend):
    layer = tl.SRU(7)
    x = np.ones((8, 9, 7), np.float32)  # [batch, length, depth]
    _, _ = layer.init(shapes.signature(x))
    y = layer(x)
    self.assertEqual(y.shape, x.shape)  # SRU preserves the input shape.
def test_names(self, backend):
  with fastmath.use_backend(backend):
    layer = tl.LSTM(3)
    self.assertEqual('LSTM_3', str(layer))
    layer = tl.GRU(5)
    self.assertEqual('GRU_5', str(layer))
    layer = tl.SRU(7)
    self.assertEqual('SRU_7', str(layer))
def FeedForwardWithOptions(d_model, d_ff, dropout, ff_activation, ff_dropout,
                           ff_chunk_size, ff_use_sru, ff_sparsity, mode):
  """Feed-Forward block with all the options."""
  if ff_use_sru:
    return [tl.SRU(d_model) for _ in range(ff_use_sru)]
  elif ff_sparsity:
    return [tl.LayerNorm(),
            tl.SparseFF(d_ff, n_elements_in_block=ff_sparsity,
                        d_lowrank=d_ff // ff_sparsity, mode=mode),
            tl.Dropout(rate=dropout, shared_axes=[-2], mode=mode)]
  else:
    return [ChunkedFeedForward(d_model, d_ff, dropout, ff_activation,
                               ff_dropout, ff_chunk_size, mode)]
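# Hedged usage sketch, not part of the original source: with ff_use_sru=2 the
# builder above returns two SRU layers of width d_model in place of the dense
# feed-forward stack. Hyperparameter values are illustrative only, and `tl` is
# assumed to be `trax.layers` as elsewhere in these snippets.
ff_block = FeedForwardWithOptions(
    d_model=512, d_ff=2048, dropout=0.1, ff_activation=tl.Relu,
    ff_dropout=0.1, ff_chunk_size=0, ff_use_sru=2, ff_sparsity=0, mode='train')
model = tl.Serial(ff_block)  # The returned list drops into any Serial stack.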
def DecoderBlock(d_model, d_ff, d_attention_key, d_attention_value, n_heads,
                 attention_type, dropout, ff_activation, ff_use_sru,
                 ff_chunk_size, mode):
  """Reversible transformer decoder layer.

  Args:
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    d_attention_key: int: depth of key vector for each attention head
    d_attention_value: int: depth of value vector for each attention head
    n_heads: int: number of attention heads
    attention_type: subclass of tl.BaseCausalAttention: attention class to use
    dropout: float: dropout rate (how much to drop out)
    ff_activation: the non-linearity in feed-forward layer
    ff_use_sru: int; if > 0, we use this many SRU layers instead of feed-forward
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
  attention = attention_type(
      n_heads=n_heads, d_qk=d_attention_key, d_v=d_attention_value,
      causal=True, output_dropout=dropout, mode=mode)
  attention_half_residual = ReversibleHalfResidualV2(
      tl.LayerNorm(),
      attention_layer=attention,
  )

  if ff_use_sru:
    feed_forward = [tl.SRU(d_model) for _ in range(ff_use_sru)]
  else:
    feed_forward = [ChunkedFeedForward(d_model, d_ff, dropout, ff_activation,
                                       dropout, ff_chunk_size, mode)]

  return [
      attention_half_residual,
      tl.ReversibleSwap(),
      ReversibleHalfResidual(feed_forward),
      tl.ReversibleSwap(),
  ]
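# Hedged sketch, not in the original source: the four layers returned by
# DecoderBlock form one reversible attention/feed-forward pair and are meant
# to be consumed by tl.ReversibleSerial (as ReformerShortenLM does below).
# The attention class and all values are illustrative; tl.SelfAttention is
# assumed to accept the n_heads/d_qk/d_v/causal/output_dropout kwargs that
# DecoderBlock passes to attention_type.
block = DecoderBlock(
    d_model=512, d_ff=2048, d_attention_key=64, d_attention_value=64,
    n_heads=8, attention_type=tl.SelfAttention, dropout=0.1,
    ff_activation=tl.FastGelu, ff_use_sru=2, ff_chunk_size=0, mode='train')
reversible_body = tl.ReversibleSerial([block])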
def ReformerShortenLM(vocab_size,
                      shorten_factor=1,
                      d_embedding=256,
                      d_model=512,
                      d_ff=2048,
                      d_attention_key=64,
                      d_attention_value=64,
                      n_layers=6,
                      n_heads=8,
                      dropout=0.1,
                      max_len=2048,
                      n_attention_chunks=1,
                      attention_type=tl.DotProductCausalAttention,
                      share_qk=False,
                      axial_pos_shape=(),
                      d_axial_pos_embs=None,
                      ff_activation=tl.FastGelu,
                      ff_use_sru=0,
                      ff_chunk_size=0,
                      mode='train'):
  """Reversible transformer language model with shortening.

  When shorten_factor is F and processing an input of shape [batch, length],
  we embed the (shifted-right) input and then group each F elements (on length)
  into a single vector -- so that in the end we process a tensor of shape
  [batch, length // F, d_model] almost until the end -- at the end it's
  un-shortened and an SRU is applied. This reduces the length processed inside
  the main model body, effectively making the model faster but possibly
  slightly less accurate.

  Args:
    vocab_size: int: vocab size
    shorten_factor: by how much to shorten, see above
    d_embedding: the depth of the embedding layer and final logits
    d_model: int: depth of *each half* of the two-part features
    d_ff: int: depth of feed-forward layer
    d_attention_key: int: depth of key vector for each attention head
    d_attention_value: int: depth of value vector for each attention head
    n_layers: int: number of decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    n_attention_chunks: int: number of chunks for attention
    attention_type: class: attention class to use, such as DotProductAttention.
    share_qk: bool, whether to share queries and keys.
    axial_pos_shape: tuple of ints: input shape to use for the axial position
      encoding. If unset, axial position encoding is disabled.
    d_axial_pos_embs: tuple of ints: depth of position embedding for each axis.
      Tuple length must match axial_pos_shape, values must sum to d_embedding.
    ff_activation: the non-linearity in feed-forward layer
    ff_use_sru: int; if > 0, we use this many SRU layers instead of feed-forward
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
  assert mode != 'predict'  # TODO(lukaszkaiser,kitaev): fast inference

  if not axial_pos_shape:
    positional_encoding = tl.PositionalEncoding(
        max_len=max_len, dropout=dropout, mode=mode)
  else:
    assert d_axial_pos_embs is not None
    positional_encoding = tl.AxialPositionalEncoding(
        shape=axial_pos_shape, d_embs=d_axial_pos_embs,
        dropout_broadcast_dims=tuple(range(1, len(axial_pos_shape) + 1)),
        dropout=dropout, mode=mode)

  positional_embedder = [
      tl.Embedding(d_embedding, vocab_size),
      BroadcastedDropout(rate=dropout, mode=mode),  # pylint: disable=no-value-for-parameter
      positional_encoding,
  ]

  decoder_blocks = []

  if isinstance(attention_type, (tuple, list)):
    assert n_layers % len(attention_type) == 0
  else:
    attention_type = [attention_type]
  for layer_idx in range(n_layers):
    layer_attention_type = attention_type[layer_idx % len(attention_type)]
    decoder_block = DecoderBlock(
        d_model, d_ff, d_attention_key, d_attention_value, n_heads,
        n_attention_chunks,
        attention_type=layer_attention_type,
        dropout=dropout,
        share_qk=(share_qk or issubclass(layer_attention_type,
                                         tl.LSHCausalAttention)),
        ff_activation=ff_activation,
        ff_use_sru=ff_use_sru,
        ff_chunk_size=ff_chunk_size,
        mode=mode)
    decoder_blocks.append(decoder_block)

  # pylint: disable=g-long-lambda
  return tl.Serial(
      tl.ShiftRight(),
      positional_embedder,
      tl.Dup(),  # Stack has (x, x), the first will be shortened
      # Before shortening, we need to pad by shorten factor so as not to leak
      # information into the future. To understand why, imagine shorten factor
      # of 2 and sequence of length 4, so ABCD. If we shift just by 1, then we
      # would have 0ABC, which gets grouped to [0A][BC] on input, which is
      # predicting ABCD as targets. The problem is that [0A] has access to A
      # and [BC] has access to C -- it will learn to copy it, peek into
      # the future. Shifting twice to [00][AB] solves the problem as the first
      # "big" symbol becomes all-0 and the rest is shifted enough.
      tl.ShiftRight(n_shifts=shorten_factor - 1),
      tl.Fn(lambda x: np.reshape(  # Shorten -- move to depth.
          x, (x.shape[0], x.shape[1] // shorten_factor, -1)), n_out=1),
      tl.Dense(d_model),
      tl.Dup(),  # Stack has (short_x, short_x, x)
      tl.ReversibleSerial(decoder_blocks),
      tl.Select([0], n_in=2),
      tl.LayerNorm(),
      BroadcastedDropout(rate=dropout, mode=mode),  # pylint: disable=no-value-for-parameter
      tl.Dense(shorten_factor * d_embedding),
      tl.Fn(lambda x: np.reshape(  # Prolong back.
          x, (x.shape[0], x.shape[1] * shorten_factor, -1)), n_out=1),
      tl.Concatenate(),  # Concatenate with just the embeddings.
      tl.CausalConv(d_embedding),
      tl.Relu(),
      tl.SRU(d_embedding),  # One RNN layer for conditional dependence.
      tl.Dense(vocab_size),
      tl.LogSoftmax()
  )
def DecoderBlock(d_model, d_ff, d_attention_key, d_attention_value, n_heads,
                 n_attention_chunks, attention_type, dropout, share_qk,
                 ff_activation, ff_use_sru, ff_chunk_size, mode):
  """Reversible transformer decoder layer.

  Args:
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    d_attention_key: int: depth of key vector for each attention head
    d_attention_value: int: depth of value vector for each attention head
    n_heads: int: number of attention heads
    n_attention_chunks: int: number of chunks for attention
    attention_type: subclass of tl.BaseCausalAttention: attention class to use
    dropout: float: dropout rate (how much to drop out)
    share_qk: bool, whether to share queries and keys
    ff_activation: the non-linearity in feed-forward layer
    ff_use_sru: int; if > 0, we use this many SRU layers instead of feed-forward
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
  if not hasattr(attention_type, 'forward_unbatched'):
    if share_qk:
      pre_attention = [
          Chunk(n_sections=n_attention_chunks),  # pylint: disable=no-value-for-parameter
          tl.LayerNorm(),
          tl.Dup(),
          tl.Parallel(
              tl.ComputeAttentionHeads(n_heads=n_heads,
                                       d_head=d_attention_key),
              tl.ComputeAttentionHeads(n_heads=n_heads,
                                       d_head=d_attention_value),
          ),
          tl.Dup(),
      ]
    else:
      pre_attention = [
          Chunk(n_sections=n_attention_chunks),  # pylint: disable=no-value-for-parameter
          tl.LayerNorm(),
          tl.Dup(), tl.Dup(),
          tl.Parallel(
              tl.ComputeAttentionHeads(n_heads=n_heads,
                                       d_head=d_attention_key),
              tl.ComputeAttentionHeads(n_heads=n_heads,
                                       d_head=d_attention_key),
              tl.ComputeAttentionHeads(n_heads=n_heads,
                                       d_head=d_attention_value),
          ),
      ]

    attention = attention_type(mode=mode)

    # ReversibleAttentionHalfResidual requires that post_attention be linear in
    # its input (so the backward pass can be computed without knowing the
    # input).
    post_attention = [
        tl.ComputeAttentionOutput(n_heads=n_heads, d_model=d_model),
        Unchunk(n_sections=n_attention_chunks),  # pylint: disable=no-value-for-parameter
        BroadcastedDropout(rate=dropout, mode=mode),  # pylint: disable=no-value-for-parameter
    ]

    attention_half_residual = ReversibleAttentionHalfResidual(
        pre_attention, attention, post_attention)
  else:
    attention = attention_type(
        n_heads=n_heads, d_qk=d_attention_key, d_v=d_attention_value,
        share_qk=share_qk, causal=True, output_dropout=dropout, mode=mode)
    attention_half_residual = ReversibleHalfResidualV2(
        tl.LayerNorm(),
        attention_layer=attention,
    )

  if ff_use_sru:
    feed_forward = [tl.SRU(d_model) for _ in range(ff_use_sru)]
  else:
    feed_forward = [ChunkedFeedForward(d_model, d_ff, dropout, ff_activation,
                                       dropout, ff_chunk_size, mode)]

  return [
      attention_half_residual,
      tl.ReversibleSwap(),
      ReversibleHalfResidual(feed_forward),
      tl.ReversibleSwap(),
  ]
def FeedForwardWithOptions(d_model,
                           d_ff,
                           dropout,
                           dropout_shared_axes,
                           ff_activation,
                           ff_dropout,
                           ff_chunk_size,
                           ff_use_sru,
                           ff_sparsity,
                           mode,
                           use_bfloat16=False,
                           ff_sparsity_type='1inN'):
  """Feed-Forward block with all the options.

  Args:
    d_model: Final dimension of tensors at most points in the model, including
      the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
      along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
      way to save memory and apply consistent masks to activation vectors at
      different sequence positions.
    ff_activation: Type of activation function at the end of each block; must
      be an activation-type subclass of `Layer`.
    ff_dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout after the FF dense layer.
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    ff_use_sru: int or pair of ints; if > 0, we use this many SRU layers in
      addition to the feed-forward block (second int specifies sru size)
    ff_sparsity: int, tuple or string; if not 0, use sparse feed-forward block
      with this sparsity
    mode: If `'train'`, each block will include dropout; else, it will pass all
      values through unaltered.
    use_bfloat16: whether to use bfloat16 for weights (default: False).
    ff_sparsity_type: string; if ff_sparsity > 0, use SparseFF when
      ff_sparsity_type=`'1inN'`, BlockSparseFF when ff_sparsity_type=`'Block'`,
      and SwitchSparseFF when ff_sparsity_type=`'Switch'`.

  Returns:
    A list of layers which maps vectors to vectors.
  """
  if ff_sparsity and ff_sparsity_type == '1inN':
    temperature, quant_prob = 0.1, 0.3
    if isinstance(ff_sparsity, str):
      # This is hacky but used to pass ff_sparsity in yaml sweep files.
      ff_sparsity = [(float(x) if '.' in x else int(x))
                     for x in ff_sparsity.split()]
    if isinstance(ff_sparsity, (list, tuple)):
      if len(ff_sparsity) == 2:
        n_elements_in_block, d_lowrank = ff_sparsity
      else:
        n_elements_in_block, d_lowrank, temperature, quant_prob = ff_sparsity
    else:
      assert isinstance(ff_sparsity, int)
      n_elements_in_block, d_lowrank = ff_sparsity, d_ff // ff_sparsity
    ff = tl.SparseFF(
        d_ff,
        n_elements_in_block=n_elements_in_block,
        d_lowrank=d_lowrank,
        temperature=temperature,
        quant_prob=quant_prob,
        use_bfloat16=use_bfloat16,
        mode=mode,
        dropout_rate=dropout,
        dropout_shared_axes=dropout_shared_axes,
        ff_chunk_size=ff_chunk_size)
  elif ff_sparsity and ff_sparsity_type == 'Block':
    ff = tl.BlockSparseFF(d_ff, n_experts=ff_sparsity, mode=mode)
  elif ff_sparsity and ff_sparsity_type == 'Switch':
    ff = tl.SwitchSparseFF(d_ff, n_experts=ff_sparsity, mode=mode)
  else:
    ff = _FeedForward(d_model, d_ff, dropout, ff_activation, ff_dropout,
                      use_bfloat16, mode)
  res = [tl.LayerNorm(), ff]
  if ff_sparsity_type != '1inN' or ff_sparsity == 0:
    # SparseFF has Dropout and BatchLeadingAxes built-in.
    res.append(
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode))
    if ff_chunk_size > 0:
      res = tl.BatchLeadingAxes(tl.Chunk(tl.Serial(res), ff_chunk_size))
  if ff_use_sru:
    if isinstance(ff_use_sru, (list, tuple)):
      sru_n_layers, sru_n_units = ff_use_sru
    else:
      sru_n_layers, sru_n_units = ff_use_sru, 32
    sru = [tl.SRU(sru_n_units, mode=mode) for _ in range(sru_n_layers)]
    block = [tl.LayerNorm(), tl.Dense(sru_n_units)] + sru + [tl.Dense(d_model)]
    res = tl.Residual(block, shortcut=res)
  return [res]
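# Hedged usage sketch, not part of the original source: with a pair
# ff_use_sru=(2, 32) the builder above keeps the dense feed-forward path and
# adds a residual branch of two 32-unit SRU layers projected back to d_model.
# Hyperparameter values are illustrative only.
ff_block = FeedForwardWithOptions(
    d_model=256, d_ff=1024, dropout=0.1, dropout_shared_axes=[-2],
    ff_activation=tl.Relu, ff_dropout=0.1, ff_chunk_size=0,
    ff_use_sru=(2, 32), ff_sparsity=0, mode='train')
model = tl.Serial(ff_block)  # A one-element list wrapping the Residual block.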
def FeedForwardWithOptions(d_model,
                           d_ff,
                           dropout,
                           dropout_shared_axes,
                           ff_activation,
                           ff_dropout,
                           ff_chunk_size,
                           ff_use_sru,
                           ff_sparsity,
                           mode,
                           ff_sparsity_type='1inN'):
  """Feed-Forward block with all the options.

  Args:
    d_model: Final dimension of tensors at most points in the model, including
      the initial embedding output.
    d_ff: Size of special dense layer in the feed-forward part of each block.
    dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout within a block.
    dropout_shared_axes: Tensor axes on which to share a dropout mask. Sharing
      along batch and sequence axes (`dropout_shared_axes=(0,1)`) is a useful
      way to save memory and apply consistent masks to activation vectors at
      different sequence positions.
    ff_activation: Type of activation function at the end of each block; must
      be an activation-type subclass of `Layer`.
    ff_dropout: Stochastic rate (probability) for dropping an activation value
      when applying dropout after the FF dense layer.
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    ff_use_sru: int; if > 0, we use this many SRU layers instead of feed-forward
    ff_sparsity: int, if > 0 use sparse feed-forward block with this sparsity
    mode: If `'train'`, each block will include dropout; else, it will pass all
      values through unaltered.
    ff_sparsity_type: string; if ff_sparsity > 0, use SparseFF when
      ff_sparsity_type=`'1inN'` and BlockSparseFF when
      ff_sparsity_type=`'Block'`.

  Returns:
    A list of layers which maps vectors to vectors.
  """
  if ff_use_sru:
    return [tl.SRU(d_model) for _ in range(ff_use_sru)]
  elif ff_sparsity and ff_sparsity_type == '1inN':
    ff = tl.SparseFF(d_ff, n_elements_in_block=ff_sparsity,
                     d_lowrank=d_ff // ff_sparsity, mode=mode)
    if ff_chunk_size < 1:
      chunked_ff = ff
    else:
      chunked_ff = tl.BatchLeadingAxes(tl.Chunk(tl.Serial(ff), ff_chunk_size))
    return [
        tl.LayerNorm(), chunked_ff,
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode)
    ]
  elif ff_sparsity and ff_sparsity_type == 'Block':
    return [
        tl.LayerNorm(),
        tl.BlockSparseFF(d_ff, num_experts=ff_sparsity, mode=mode),
        tl.Dropout(rate=dropout, shared_axes=dropout_shared_axes, mode=mode)
    ]
  else:
    return [
        ChunkedFeedForward(d_model, d_ff, dropout, ff_activation, ff_dropout,
                           ff_chunk_size, mode)
    ]
def test_sru(self):
  layer = tl.SRU(7)
  x = np.ones((8, 9, 7))  # [batch, length, depth]
  _, _ = layer.init(shapes.signature(x))
  y = layer(x)
  self.assertEqual(y.shape, x.shape)  # SRU preserves the input shape.
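# Hedged illustration, not Trax's implementation: the recurrence that tl.SRU
# is expected to realize, following Lei et al.
# (https://arxiv.org/abs/1709.02755), written for a single timestep in plain
# numpy. W, Wf, Wr, bf, br are stand-in parameter names, not the layer's
# actual weight structure.
import numpy as onp

def sru_step(x_t, c_prev, W, Wf, Wr, bf, br):
  sigmoid = lambda z: 1.0 / (1.0 + onp.exp(-z))
  y_t = x_t @ W                             # candidate value
  f_t = sigmoid(x_t @ Wf + bf)              # forget gate
  r_t = sigmoid(x_t @ Wr + br)              # reset gate
  c_t = f_t * c_prev + (1.0 - f_t) * y_t    # internal state
  h_t = r_t * c_t + (1.0 - r_t) * x_t       # highway-style output
  return h_t, c_t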