def test_feedforward(self):
    """Tests feed-forward."""
    hparams = {
        "layers": [
            {
                "type": "torch.nn.Linear",
                "kwargs": {
                    "in_features": 32,
                    "out_features": 64
                }
            },
            {
                "type": "torch.nn.Linear",
                "kwargs": {
                    "in_features": 64,
                    "out_features": 128
                }
            }
        ]
    }

    nn = FeedForwardNetwork(hparams=hparams)

    self.assertEqual(len(nn.layers), len(hparams["layers"]))

    outputs = nn(torch.ones(64, 16, 32))
    self.assertEqual(len(nn.trainable_variables),
                     len(hparams["layers"]) * 2)
    self.assertEqual(outputs.size(-1), nn.output_size)
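# The test above exercises hparams-driven construction of ``FeedForwardNetwork``.
# Below is a minimal, hedged sketch of the same pattern outside a test. It
# assumes ``FeedForwardNetwork`` is importable from ``texar.torch.modules`` and
# that a layer "type" string may name any ``torch.nn`` module (e.g. the
# interleaved ``torch.nn.ReLU`` here is an assumption, not taken from the test).
import torch

from texar.torch.modules import FeedForwardNetwork  # import path assumed

ffn_hparams = {
    "layers": [
        {"type": "torch.nn.Linear",
         "kwargs": {"in_features": 32, "out_features": 64}},
        {"type": "torch.nn.ReLU"},  # assumed: activations may be interleaved
        {"type": "torch.nn.Linear",
         "kwargs": {"in_features": 64, "out_features": 128}},
    ]
}

ffn = FeedForwardNetwork(hparams=ffn_hparams)
inputs = torch.ones(8, 16, 32)      # (batch, time, feature)
outputs = ffn(inputs)               # last dimension becomes 128
assert outputs.size(-1) == ffn.output_size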
def initialize_blocks(self):
    r"""Helper function to initialize blocks.
    """
    for i in range(self._hparams.num_blocks):
        # Only the first block stores relative position embeddings
        # (``stores_relative_position`` is True only for ``i == 0``).
        mh_attn = MultiheadRPRAttention(
            self._input_size,
            self._hparams.multihead_attention,
            stores_relative_position=bool(i == 0))
        self.self_attns.append(mh_attn)
        self.self_attn_layer_norm.append(
            T5LayerNorm(self._input_size, eps=self._hparams.eps))
        if self._hparams.dim != mh_attn.hparams.output_dim:
            raise ValueError(
                'The "dim" in the hparams of '
                '"multihead_attention" should be equal to the '
                '"dim" of T5Encoder')

        pw_net = FeedForwardNetwork(
            hparams=self._hparams['poswise_feedforward'])
        final_dim = pw_net.hparams.layers[-1]['kwargs']['out_features']
        if self._hparams.dim != final_dim:
            raise ValueError(
                'The output dimension of '
                '"poswise_feedforward" should be equal '
                'to the "dim" of T5Encoder.')
        self.poswise_networks.append(pw_net)
        self.poswise_layer_norm.append(
            T5LayerNorm(self._input_size, eps=self._hparams.eps))
def initialize_blocks(self):
    r"""Helper function which initializes blocks for encoder.

    Should be overridden by any classes where block initialization varies.
    """
    for _ in range(self._hparams.num_blocks):
        mh_attn = MultiheadAttentionEncoder(
            self._input_size, self._hparams.multihead_attention)
        self.self_attns.append(mh_attn)
        if not self._hparams.use_bert_config:
            self.self_attn_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=self._hparams.eps))
        if self._hparams.dim != mh_attn.hparams.output_dim:
            raise ValueError(
                'The "dim" in the hparams of '
                '"multihead_attention" should be equal to the '
                '"dim" of TransformerEncoder')

        pw_net = FeedForwardNetwork(
            hparams=self._hparams['poswise_feedforward'])
        final_dim = pw_net.hparams.layers[-1]['kwargs']['out_features']
        if self._hparams.dim != final_dim:
            raise ValueError('The output dimension of '
                             '"poswise_feedforward" should be equal '
                             'to the "dim" of TransformerEncoder.')
        self.poswise_networks.append(pw_net)
        self.poswise_layer_norm.append(
            nn.LayerNorm(self._input_size, eps=self._hparams.eps))
        if self._hparams.use_bert_config:
            self.output_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=self._hparams.eps))
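# The docstring above names ``initialize_blocks`` as the extension point for
# encoders whose per-block layout differs. The hypothetical subclass below is
# only a sketch of that pattern; it assumes the constructor invokes
# ``initialize_blocks`` after creating the module lists, assumes the non-BERT
# configuration, and the class name and ``nn.Identity`` placeholders are
# purely illustrative, not part of the library.
import torch.nn as nn


class IdentityNormEncoder(TransformerEncoder):  # hypothetical subclass
    def initialize_blocks(self):
        for _ in range(self._hparams.num_blocks):
            mh_attn = MultiheadAttentionEncoder(
                self._input_size, self._hparams.multihead_attention)
            self.self_attns.append(mh_attn)
            # Replace the per-block LayerNorms with no-ops; everything else
            # follows the base-class layout shown above.
            self.self_attn_layer_norm.append(nn.Identity())
            pw_net = FeedForwardNetwork(
                hparams=self._hparams['poswise_feedforward'])
            self.poswise_networks.append(pw_net)
            self.poswise_layer_norm.append(nn.Identity())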
def test_update_params(self):
    nn_module = FeedForwardNetwork(hparams=self.hparams)
    nn_magic_model = MetaModule(nn_module)

    old_num_buffer_param, old_num_module_param = \
        self.recursive_module_param(nn_magic_model, [], [])

    grads = {
        name: torch.zeros_like(param)
        for name, param in nn_module.named_parameters()
    }
    nn_magic_model.update_params(grads)

    new_num_buffer_param, new_num_module_param = \
        self.recursive_module_param(nn_magic_model, [], [])

    self.assertEqual(old_num_module_param, new_num_module_param)
    self.assertEqual(old_num_buffer_param, new_num_buffer_param)
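# ``update_params`` above takes a ``{parameter_name: tensor}`` dict keyed by
# ``named_parameters()``. The sketch below shows how such a dict might be built
# from a real loss for an inner-loop step. ``MetaModule`` and
# ``FeedForwardNetwork`` are assumed importable as in the test module above;
# the lr-scaled gradients and the descent-style interpretation of
# ``update_params`` are assumptions, not confirmed by the test.
import torch
import torch.nn.functional as F

inner_lr = 0.1  # hypothetical inner-loop learning rate

ffn_hparams = {"layers": [
    {"type": "torch.nn.Linear",
     "kwargs": {"in_features": 32, "out_features": 64}}]}
nn_module = FeedForwardNetwork(hparams=ffn_hparams)
meta_model = MetaModule(nn_module)

inputs = torch.ones(8, 16, 32)
targets = torch.zeros(8, 16, nn_module.output_size)

# Compute gradients w.r.t. the wrapped module's parameters, then hand the
# name-keyed updates to the MetaModule, mirroring the dict format in the test.
loss = F.mse_loss(nn_module(inputs), targets)
names, params = zip(*nn_module.named_parameters())
grads = torch.autograd.grad(loss, params)
meta_model.update_params(
    {name: inner_lr * g for name, g in zip(names, grads)})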
def initialize_blocks(self):
    r"""Helper function to initialize blocks.
    """
    for i in range(self._hparams.num_blocks):
        # Only the first block stores relative position embeddings
        # (``stores_relative_position`` is True only for ``i == 0``).
        attn_module = MultiheadRPRAttention(
            self._input_size,
            self._hparams.multihead_attention,
            stores_relative_position=bool(i == 0))
        if self._hparams.dim != attn_module.output_size:
            raise ValueError("The output dimension of "
                             "MultiheadRPRAttention should be equal "
                             "to the dim of T5Decoder")
        self.self_attns.append(attn_module)
        self.self_attn_layer_norm.append(
            T5LayerNorm(self._input_size, eps=self._hparams.eps))

        attn_module = MultiheadRPRAttention(
            self._input_size,
            self._hparams.multihead_attention,
            stores_relative_position=bool(i == 0))
        if self._hparams.dim != attn_module.output_size:
            raise ValueError("The output dimension of "
                             "MultiheadRPRAttention should be equal "
                             "to the dim of T5Decoder")
        self.enc_dec_attns.append(attn_module)
        self.end_dec_attn_layer_norm.append(
            T5LayerNorm(self._input_size, eps=self._hparams.eps))

        poswise_network = FeedForwardNetwork(
            hparams=self._hparams.poswise_feedforward)
        if (poswise_network.hparams.layers[-1]['kwargs']['out_features']
                != self._hparams.dim):
            raise ValueError("The output dimension of "
                             "FeedForwardNetwork should be equal "
                             "to the dim of T5Decoder")
        self.poswise_networks.append(poswise_network)
        self.poswise_layer_norm.append(
            T5LayerNorm(self._input_size, eps=self._hparams.eps))
def initialize_blocks(self):
    r"""Helper function which initializes blocks for decoder.

    Should be overridden by any classes where block initialization varies.
    """
    for _ in range(self._hparams.num_blocks):
        attn_module = MultiheadAttentionEncoder(
            self._input_size, self._hparams.multihead_attention)
        if self._hparams.dim != attn_module.output_size:
            raise ValueError("The output dimension of "
                             "MultiheadEncoder should be equal "
                             "to the dim of TransformerDecoder")
        self.self_attns.append(attn_module)
        self.self_attn_layer_norm.append(
            nn.LayerNorm(self._input_size, eps=self._hparams.eps))

        attn_module = MultiheadAttentionEncoder(
            self._input_size, self._hparams.multihead_attention)
        if self._hparams.dim != attn_module.output_size:
            raise ValueError("The output dimension of "
                             "MultiheadEncoder should be equal "
                             "to the dim of TransformerDecoder")
        self.enc_dec_attns.append(attn_module)
        self.end_dec_attn_layer_norm.append(
            nn.LayerNorm(self._input_size, eps=self._hparams.eps))

        poswise_network = FeedForwardNetwork(
            hparams=self._hparams.poswise_feedforward)
        if (poswise_network.hparams.layers[-1]['kwargs']['out_features']
                != self._hparams.dim):
            raise ValueError("The output dimension of "
                             "FeedForwardNetwork should be equal "
                             "to the dim of TransformerDecoder")
        self.poswise_networks.append(poswise_network)
        self.poswise_layer_norm.append(
            nn.LayerNorm(self._input_size, eps=self._hparams.eps))
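# The ``initialize_blocks`` variants above all enforce two hparams invariants:
# the multihead attention's ``output_dim`` and the last ``poswise_feedforward``
# layer's ``out_features`` must equal the block ``dim``, otherwise a
# ``ValueError`` is raised. The dict below is a hedged sketch of a consistent
# configuration; unspecified keys fall back to the class defaults, and the
# exact default-hparams layout is an assumption.
consistent_hparams = {
    "dim": 512,
    "num_blocks": 6,
    "multihead_attention": {
        "num_units": 512,
        "output_dim": 512,   # must match "dim"
    },
    "poswise_feedforward": {
        "layers": [
            {"type": "torch.nn.Linear",
             "kwargs": {"in_features": 512, "out_features": 2048}},
            {"type": "torch.nn.ReLU"},
            {"type": "torch.nn.Linear",
             "kwargs": {"in_features": 2048,
                        "out_features": 512}},  # must also match "dim"
        ]
    },
}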
def __init__(self,
             token_embedder: Optional[TokenEmbedder] = None,
             token_pos_embedder: Optional[TokenPosEmbedder] = None,
             vocab_size: Optional[int] = None,
             output_layer: Optional[Union[nn.Module, torch.Tensor]] = None,
             hparams=None):
    super().__init__(
        token_embedder, token_pos_embedder,
        input_time_major=False,
        output_time_major=False,
        hparams=hparams)

    if token_pos_embedder is None and token_embedder is not None:
        warnings.warn(
            "Transformer models cannot capture positional information if "
            "no positional embedding is provided.")

    self._input_size = self._hparams.dim
    self._output_layer, self._vocab_size = _make_output_layer(
        output_layer, vocab_size, self._input_size,
        self._hparams.output_layer_bias)

    self.self_attns = nn.ModuleList()
    self.self_attn_layer_norm = nn.ModuleList()
    self.enc_dec_attns = nn.ModuleList()
    self.end_dec_attn_layer_norm = nn.ModuleList()
    self.poswise_networks = nn.ModuleList()
    self.poswise_layer_norm = nn.ModuleList()

    if self._hparams.use_gpt_config:
        eps = 1e-5
    else:
        eps = 1e-12

    for _ in range(self._hparams.num_blocks):
        attn_module = MultiheadAttentionEncoder(
            self._input_size, self._hparams.multihead_attention)
        if self._hparams.dim != attn_module.output_size:
            raise ValueError("The output dimension of "
                             "MultiheadEncoder should be equal "
                             "to the dim of TransformerDecoder")
        self.self_attns.append(attn_module)
        self.self_attn_layer_norm.append(
            nn.LayerNorm(self._input_size, eps=eps))

        attn_module = MultiheadAttentionEncoder(
            self._input_size, self._hparams.multihead_attention)
        if self._hparams.dim != attn_module.output_size:
            raise ValueError("The output dimension of "
                             "MultiheadEncoder should be equal "
                             "to the dim of TransformerDecoder")
        self.enc_dec_attns.append(attn_module)
        self.end_dec_attn_layer_norm.append(
            nn.LayerNorm(self._input_size, eps=eps))

        poswise_network = FeedForwardNetwork(
            hparams=self._hparams.poswise_feedforward)
        if (poswise_network.hparams.layers[-1]['kwargs']['out_features']
                != self._hparams.dim):
            raise ValueError("The output dimension of "
                             "FeedForwardNetwork should be equal "
                             "to the dim of TransformerDecoder")
        self.poswise_networks.append(poswise_network)
        self.poswise_layer_norm.append(
            nn.LayerNorm(self._input_size, eps=eps))

    self.final_layer_norm = nn.LayerNorm(self._input_size, eps=eps)
    self.embed_dropout = nn.Dropout(self._hparams.embedding_dropout)
    self.residual_dropout = nn.Dropout(self._hparams.residual_dropout)

    if self._hparams.initializer:
        # TODO: This might be different from what TensorFlow does.
        initialize = layers.get_initializer(self._hparams.initializer)
        assert initialize is not None
        # Do not re-initialize LayerNorm modules.
        for name, param in self.named_parameters():
            if name.split(".")[-1] == "weight" \
                    and "layer_norm" not in name:
                initialize(param)
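# A hedged construction/usage sketch for the decoder defined above. It assumes
# ``TransformerDecoder`` is importable from ``texar.torch.modules``, that a
# ``token_pos_embedder`` callable maps ``(token_ids, positions)`` to embeddings,
# and that ``decoding_strategy="train_greedy"`` performs teacher forcing over
# ``inputs``; the embedding sizes and hparams shown are illustrative and
# unspecified keys fall back to defaults.
import torch
import torch.nn as nn

from texar.torch.modules import TransformerDecoder  # import path assumed

vocab_size, dim = 1000, 512
word_embed = nn.Embedding(vocab_size, dim)
pos_embed = nn.Embedding(256, dim)  # illustrative learned position embedding


def token_pos_embedder(tokens, positions):
    # Combine word and position embeddings, as the warning above recommends.
    return word_embed(tokens) + pos_embed(positions)


decoder = TransformerDecoder(
    token_pos_embedder=token_pos_embedder,
    vocab_size=vocab_size,
    hparams={"dim": dim})

memory = torch.randn(8, 20, dim)                   # encoder outputs
memory_len = torch.full((8,), 20, dtype=torch.long)
tgt_ids = torch.randint(0, vocab_size, (8, 15))    # gold target tokens

outputs = decoder(
    inputs=tgt_ids,
    memory=memory,
    memory_sequence_length=memory_len,
    decoding_strategy="train_greedy")              # teacher-forced decoding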
def test_forward_with_nn_module(self):
    nn_module = FeedForwardNetwork(hparams=self.hparams)
    nn_magic_model = MetaModule(nn_module)
    outputs = nn_magic_model(torch.ones(64, 16, 32))
    self.assertEqual(outputs.size(-1), nn_module.output_size)
def __init__(self, hparams=None):
    super().__init__(hparams=hparams)
    self._input_size = self._hparams.dim

    self.self_attns = nn.ModuleList()
    if not self._hparams.use_bert_config:
        self.self_attn_layer_norm = nn.ModuleList()
    self.poswise_networks = nn.ModuleList()
    self.poswise_layer_norm = nn.ModuleList()
    self.output_layer_norm = nn.ModuleList()

    if self._hparams.use_bert_config:
        # In TensorFlow, eps for LayerNorm is 1e-12 by default.
        eps = 1e-12
    else:
        # Use an eps of 1e-6 for LayerNorm here (PyTorch's default is 1e-5).
        eps = 1e-6

    for _ in range(self._hparams.num_blocks):
        mh_attn = MultiheadAttentionEncoder(
            self._input_size, self._hparams.multihead_attention)
        self.self_attns.append(mh_attn)
        if not self._hparams.use_bert_config:
            self.self_attn_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=eps))
        if self._hparams.dim != mh_attn.hparams.output_dim:
            raise ValueError(
                'The "dim" in the hparams of '
                '"multihead_attention" should be equal to the '
                '"dim" of TransformerEncoder')

        pw_net = FeedForwardNetwork(
            hparams=self._hparams['poswise_feedforward'])
        final_dim = pw_net.hparams.layers[-1]['kwargs']['out_features']
        if self._hparams.dim != final_dim:
            raise ValueError('The output dimension of '
                             '"poswise_feedforward" should be equal '
                             'to the "dim" of TransformerEncoder.')
        self.poswise_networks.append(pw_net)
        self.poswise_layer_norm.append(
            nn.LayerNorm(self._input_size, eps=eps))
        if self._hparams.use_bert_config:
            self.output_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=eps))

    self.embed_dropout = nn.Dropout(p=self._hparams.embedding_dropout)
    self.residual_dropout = nn.Dropout(p=self._hparams.residual_dropout)

    if self._hparams.use_bert_config:
        self.input_normalizer = nn.LayerNorm(self._input_size, eps=eps)
    else:
        self.final_layer_norm = nn.LayerNorm(self._input_size, eps=eps)

    if self._hparams.initializer:
        initialize = layers.get_initializer(self._hparams.initializer)
        assert initialize is not None
        # Do not re-initialize LayerNorm modules.
        for name, param in self.named_parameters():
            if name.split('.')[-1] == 'weight' \
                    and 'layer_norm' not in name:
                initialize(param)
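# A hedged usage sketch for the encoder defined above, assuming
# ``TransformerEncoder`` is importable from ``texar.torch.modules``, that
# ``forward`` takes already-embedded inputs of shape (batch, time, dim) plus
# per-example lengths, and that unspecified hparams keys fall back to defaults.
import torch

from texar.torch.modules import TransformerEncoder  # import path assumed

encoder = TransformerEncoder(hparams={"num_blocks": 6, "dim": 512})

inputs = torch.randn(8, 20, 512)                     # embedded tokens
lengths = torch.full((8,), 20, dtype=torch.long)
enc_out = encoder(inputs, sequence_length=lengths)   # (8, 20, 512)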