def GRULM(vocab_size=256, d_model=512, n_layers=2, mode='train'):
    """Returns a GRU language model.

    Args:
        vocab_size (int, optional): Size of the vocabulary. Defaults to 256.
        d_model (int, optional): Depth of embedding (n_units in the GRU cell).
            Defaults to 512.
        n_layers (int, optional): Number of GRU layers. Defaults to 2.
        mode (str, optional): 'train', 'eval' or 'predict', predict mode is for
            fast inference. Defaults to "train".

    Returns:
        trax.layers.combinators.Serial: A GRU language model as a layer that
        maps from a tensor of tokens to activations over a vocab set.
    """
    # Stack of n_layers GRU cells, each with d_model units.
    gru_stack = [tl.GRU(n_units=d_model) for _ in range(n_layers)]

    return tl.Serial(
        tl.ShiftRight(mode=mode),  # shift right so position t predicts token t
        tl.Embedding(vocab_size=vocab_size, d_feature=d_model),
        gru_stack,
        tl.Dense(n_units=vocab_size),  # project back to vocabulary logits
        tl.LogSoftmax(),  # normalize to log-probabilities
    )
def GRULM(vocab_size=256, d_model=512, n_layers=1, mode='train'):
    """Returns a GRU (gated recurrent unit) language model.

    This model performs autoregressive language modeling:

      - input: rank 2 tensor representing a batch of text strings via token
        IDs plus padding markers; shape is (batch_size, sequence_length). The
        tensor elements are integers in `range(vocab_size)`, and `0` values
        mark padding positions.

      - output: rank 3 tensor of unnormalized activations (logits) for each
        sequence position over possible token IDs; shape is
        (batch_size, sequence_length, `vocab_size`). NOTE: the stack ends at
        `tl.Dense` with no `tl.LogSoftmax`, so the outputs are logits, not
        log-probabilities; apply a log-softmax (e.g. in the loss layer) to
        obtain log-probability distributions.

    Args:
      vocab_size: Input vocabulary size -- each element of the input tensor
          should be an integer in `range(vocab_size)`. These integers typically
          represent token IDs from a vocabulary-based tokenizer.
      d_model: Embedding depth throughout the model.
      n_layers: Number of GRU layers.
      mode: If `'predict'`, use fast inference (and omit the right shift).

    Returns:
      A GRU language model as a layer that maps from a tensor of tokens
      to activations over a vocab set.
    """
    return tl.Serial(
        tl.ShiftRight(mode=mode),
        tl.Embedding(vocab_size, d_model),
        [tl.GRU(d_model) for _ in range(n_layers)],
        tl.Dense(vocab_size),
    )
def GRULM(vocab_size=256, d_model=512, n_layers=1, mode='train'):
    """Returns a GRU language model.

    The input to the model is a tensor of tokens (ints).

    Args:
      vocab_size: int: vocab size
      d_model: int: depth of embedding (n_units in the RNN cell)
      n_layers: int: number of RNN layers
      mode: str: 'train', 'eval' or 'predict', predict mode is for fast
          inference

    Returns:
      An RNN language model as a layer that maps from a tensor of tokens
      to activations over a vocab set.
    """
    return tl.Serial(
        tl.ShiftRight(mode=mode),
        # Keyword arguments avoid the positional-order bug the original had
        # (it passed d_model where vocab_size was expected); this also matches
        # the sibling GRULM definitions in this file, which put vocab_size
        # first.
        tl.Embedding(vocab_size=vocab_size, d_feature=d_model),
        [tl.GRU(d_model) for _ in range(n_layers)],
        tl.Dense(vocab_size),
        tl.LogSoftmax(),  # outputs are log-probabilities over the vocab
    )
def test_names(self):
    # Each RNN cell's string form should encode its type and unit count.
    cases = (
        (tl.LSTM, 3, 'LSTM_3'),
        (tl.GRU, 5, 'GRU_5'),
        (tl.SRU, 7, 'SRU_7'),
    )
    for make_layer, n_units, expected_name in cases:
        self.assertEqual(expected_name, str(make_layer(n_units)))
def test_names(self, backend):
    # The layer's string form should encode its type and unit count,
    # independent of the math backend in use.
    with fastmath.use_backend(backend):
        for make_layer, n_units, expected_name in (
            (tl.LSTM, 3, 'LSTM_3'),
            (tl.GRU, 5, 'GRU_5'),
            (tl.SRU, 7, 'SRU_7'),
        ):
            self.assertEqual(expected_name, str(make_layer(n_units)))
def test_dimensionality(self):
    # A bidirectional wrapper concatenates forward and backward GRU outputs,
    # so the feature dimension doubles while batch and time are preserved.
    batch_size, seq_len, n_units = 2, 3, 8
    x = np.ones((batch_size, seq_len, n_units))
    layer = tl.Bidirectional(tl.GRU(n_units=n_units))
    layer.init(shapes.signature(x))
    yhat = layer(x)
    self.assertEqual(yhat.shape, (batch_size, seq_len, 2 * n_units))
# Putting everything together the GRU model will look like this: # In[4]: mode = 'train' vocab_size = 256 model_dimension = 512 n_layers = 2 GRU = tl.Serial( tl.ShiftRight( mode=mode ), # Do remember to pass the mode parameter if you are using it for interence/test as default is train tl.Embedding(vocab_size=vocab_size, d_feature=model_dimension), [ tl.GRU(n_units=model_dimension) for _ in range(n_layers) ], # You can play around n_layers if you want to stack more GRU layers together tl.Dense(n_units=vocab_size), tl.LogSoftmax()) # Next is a helper function that prints information for every layer (sublayer within `Serial`): # # _Try changing the parameters defined before the GRU model and see how it changes!_ # # In[5]: def show_layers(model, layer_prefix="Serial.sublayers"): print(f"Total layers: {len(model.sublayers)}\n") for i in range(len(model.sublayers)):