Example #1
    def test_feedforward(self):
        """Tests feed-forward.
        """
        hparams = {
            "layers": [{
                "type": "torch.nn.Linear",
                "kwargs": {
                    "in_features": 32,
                    "out_features": 64
                }
            }, {
                "type": "torch.nn.Linear",
                "kwargs": {
                    "in_features": 64,
                    "out_features": 128
                }
            }]
        }

        nn = FeedForwardNetwork(hparams=hparams)

        self.assertEqual(len(nn.layers), len(hparams["layers"]))
        outputs = nn(torch.ones(64, 16, 32))
        self.assertEqual(len(nn.trainable_variables),
                         len(hparams["layers"]) * 2)
        self.assertEqual(outputs.size(-1), nn.output_size)
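For reference, the configuration exercised by this test can also be run standalone. The sketch below assumes Texar-PyTorch's import path `texar.torch.modules.FeedForwardNetwork`; each entry in `"layers"` names a layer class plus its constructor kwargs, and each `torch.nn.Linear` contributes two trainable variables (weight and bias), which is what the assertions count.

# Standalone sketch of the test above (import path assumed).
import torch
from texar.torch.modules import FeedForwardNetwork

hparams = {
    "layers": [
        {"type": "torch.nn.Linear",
         "kwargs": {"in_features": 32, "out_features": 64}},
        {"type": "torch.nn.Linear",
         "kwargs": {"in_features": 64, "out_features": 128}},
    ]
}

nn_module = FeedForwardNetwork(hparams=hparams)
outputs = nn_module(torch.ones(64, 16, 32))  # (batch, time, features)
print(outputs.size(-1))                      # 128, i.e. nn_module.output_size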
Example #2
    def initialize_blocks(self):
        r""" Helper function to initialize blocks.

        """
        for i in range(self._hparams.num_blocks):
            mh_attn = MultiheadRPRAttention(
                self._input_size,
                self._hparams.multihead_attention,
                stores_relative_position=bool(i == 0)
            )
            self.self_attns.append(mh_attn)

            self.self_attn_layer_norm.append(
                T5LayerNorm(self._input_size, eps=self._hparams.eps))
            if self._hparams.dim != mh_attn.hparams.output_dim:
                raise ValueError(
                    'The "dim" in the hparams of '
                    '"multihead_attention" should be equal to the '
                    '"dim" of T5Encoder')

            pw_net = FeedForwardNetwork(
                hparams=self._hparams['poswise_feedforward'])

            final_dim = pw_net.hparams.layers[-1]['kwargs']['out_features']
            if self._hparams.dim != final_dim:
                raise ValueError(
                    'The output dimension of '
                    '"poswise_feedforward" should be equal '
                    'to the "dim" of T5Encoder.')

            self.poswise_networks.append(pw_net)
            self.poswise_layer_norm.append(
                T5LayerNorm(self._input_size, eps=self._hparams.eps))
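The two `ValueError` checks above pin the block width: `multihead_attention.output_dim` and the `out_features` of the last `poswise_feedforward` layer must both equal `dim`. A hypothetical hparams fragment that satisfies both checks (sizes are illustrative, not library defaults):

# Hypothetical fragment that passes both dimension checks above.
dim = 512
hparams_fragment = {
    "dim": dim,
    "multihead_attention": {
        "output_dim": dim,  # must equal "dim", otherwise a ValueError is raised
        # other attention settings omitted
    },
    "poswise_feedforward": {
        "layers": [
            {"type": "torch.nn.Linear",
             "kwargs": {"in_features": dim, "out_features": 4 * dim}},
            {"type": "torch.nn.ReLU", "kwargs": {}},
            {"type": "torch.nn.Linear",
             "kwargs": {"in_features": 4 * dim, "out_features": dim}},  # == dim
        ]
    },
}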
Example #3
    def initialize_blocks(self):
        r"""Helper function which initializes blocks for encoder.

        Should be overridden by any classes where block initialization varies.
        """
        for _ in range(self._hparams.num_blocks):
            mh_attn = MultiheadAttentionEncoder(
                self._input_size, self._hparams.multihead_attention)
            self.self_attns.append(mh_attn)
            if not self._hparams.use_bert_config:
                self.self_attn_layer_norm.append(
                    nn.LayerNorm(self._input_size, eps=self._hparams.eps))
            if self._hparams.dim != mh_attn.hparams.output_dim:
                raise ValueError(
                    'The "dim" in the hparams of '
                    '"multihead_attention" should be equal to the '
                    '"dim" of TransformerEncoder')

            pw_net = FeedForwardNetwork(
                hparams=self._hparams['poswise_feedforward'])

            final_dim = pw_net.hparams.layers[-1]['kwargs']['out_features']
            if self._hparams.dim != final_dim:
                raise ValueError('The output dimension of '
                                 '"poswise_feedforward" should be equal '
                                 'to the "dim" of TransformerEncoder.')

            self.poswise_networks.append(pw_net)
            self.poswise_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=self._hparams.eps))
            if self._hparams.use_bert_config:
                self.output_layer_norm.append(
                    nn.LayerNorm(self._input_size, eps=self._hparams.eps))
Example #4
    def test_update_params(self):
        nn_module = FeedForwardNetwork(hparams=self.hparams)
        nn_magic_model = MetaModule(nn_module)

        old_num_buffer_param, old_num_module_param = \
            self.recursive_module_param(nn_magic_model, [], [])

        grads = {
            name: torch.zeros_like(param)
            for name, param in nn_module.named_parameters()
        }
        nn_magic_model.update_params(grads)

        new_num_buffer_param, new_num_module_param = \
            self.recursive_module_param(nn_magic_model, [], [])

        self.assertEqual(old_num_module_param, new_num_module_param)
        self.assertEqual(old_num_buffer_param, new_num_buffer_param)
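The `recursive_module_param` helper called here is not shown in the snippet. A plausible sketch (hypothetical; the real test helper may differ) walks the module tree with the two list accumulators and returns buffer and parameter counts, so the totals can be compared before and after `update_params`:

# Hypothetical sketch of the helper used above.
def recursive_module_param(module, buffer_params, module_params):
    # Collect this module's own buffers and parameters, then recurse.
    buffer_params.extend(b for _, b in module.named_buffers(recurse=False))
    module_params.extend(p for _, p in module.named_parameters(recurse=False))
    for child in module.children():
        recursive_module_param(child, buffer_params, module_params)
    return len(buffer_params), len(module_params)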
Example #5
    def initialize_blocks(self):
        r"""Helper function to initialize blocks.
        """
        for i in range(self._hparams.num_blocks):
            attn_module = MultiheadRPRAttention(
                self._input_size,
                self._hparams.multihead_attention,
                stores_relative_position=bool(i == 0))
            if self._hparams.dim != attn_module.output_size:
                raise ValueError("The output dimension of "
                                 "MultiheadRPRAttention should be equal "
                                 "to the dim of T5Decoder")
            self.self_attns.append(attn_module)
            self.self_attn_layer_norm.append(
                T5LayerNorm(self._input_size, eps=self._hparams.eps))

            attn_module = MultiheadRPRAttention(
                self._input_size,
                self._hparams.multihead_attention,
                stores_relative_position=bool(i == 0))
            if self._hparams.dim != attn_module.output_size:
                raise ValueError("The output dimension of "
                                 "MultiheadRPRAttention should be equal "
                                 "to the dim of T5Decoder")
            self.enc_dec_attns.append(attn_module)
            self.end_dec_attn_layer_norm.append(
                T5LayerNorm(self._input_size, eps=self._hparams.eps))

            poswise_network = FeedForwardNetwork(
                hparams=self._hparams.poswise_feedforward)
            if (poswise_network.hparams.layers[-1]['kwargs']['out_features'] !=
                    self._hparams.dim):
                raise ValueError("The output dimension of "
                                 "FeedForwardNetwork should be equal "
                                 "to the dim of T5Decoder")
            self.poswise_networks.append(poswise_network)
            self.poswise_layer_norm.append(
                T5LayerNorm(self._input_size, eps=self._hparams.eps))
Example #6
    def initialize_blocks(self):
        r"""Helper function which initializes blocks for decoder.

        Should be overridden by any classes where block initialization varies.
        """
        for _ in range(self._hparams.num_blocks):
            attn_module = MultiheadAttentionEncoder(
                self._input_size, self._hparams.multihead_attention)
            if self._hparams.dim != attn_module.output_size:
                raise ValueError("The output dimension of "
                                 "MultiheadEncoder should be equal "
                                 "to the dim of TransformerDecoder")
            self.self_attns.append(attn_module)
            self.self_attn_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=self._hparams.eps))

            attn_module = MultiheadAttentionEncoder(
                self._input_size, self._hparams.multihead_attention)
            if self._hparams.dim != attn_module.output_size:
                raise ValueError("The output dimension of "
                                 "MultiheadEncoder should be equal "
                                 "to the dim of TransformerDecoder")
            self.enc_dec_attns.append(attn_module)
            self.end_dec_attn_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=self._hparams.eps))

            poswise_network = FeedForwardNetwork(
                hparams=self._hparams.poswise_feedforward)
            if (poswise_network.hparams.layers[-1]['kwargs']['out_features'] !=
                    self._hparams.dim):
                raise ValueError("The output dimension of "
                                 "FeedForwardNetwork should be equal "
                                 "to the dim of TransformerDecoder")
            self.poswise_networks.append(poswise_network)
            self.poswise_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=self._hparams.eps))
Example #7
    def __init__(self,
                 token_embedder: Optional[TokenEmbedder] = None,
                 token_pos_embedder: Optional[TokenPosEmbedder] = None,
                 vocab_size: Optional[int] = None,
                 output_layer: Optional[Union[nn.Module, torch.Tensor]] = None,
                 hparams=None):
        super().__init__(token_embedder,
                         token_pos_embedder,
                         input_time_major=False,
                         output_time_major=False,
                         hparams=hparams)

        if token_pos_embedder is None and token_embedder is not None:
            warnings.warn(
                "Transformer models cannot capture positional information if "
                "no positional embedding is provided.")

        self._input_size = self._hparams.dim
        self._output_layer, self._vocab_size = _make_output_layer(
            output_layer, vocab_size, self._input_size,
            self._hparams.output_layer_bias)

        self.self_attns = nn.ModuleList()
        self.self_attn_layer_norm = nn.ModuleList()
        self.enc_dec_attns = nn.ModuleList()
        self.end_dec_attn_layer_norm = nn.ModuleList()
        self.poswise_networks = nn.ModuleList()
        self.poswise_layer_norm = nn.ModuleList()

        if self._hparams.use_gpt_config:
            eps = 1e-5
        else:
            eps = 1e-12

        for _ in range(self._hparams.num_blocks):
            attn_module = MultiheadAttentionEncoder(
                self._input_size, self._hparams.multihead_attention)
            if self._hparams.dim != attn_module.output_size:
                raise ValueError("The output dimension of "
                                 "MultiheadEncoder should be equal "
                                 "to the dim of TransformerDecoder")
            self.self_attns.append(attn_module)
            self.self_attn_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=eps))

            attn_module = MultiheadAttentionEncoder(
                self._input_size, self._hparams.multihead_attention)
            if self._hparams.dim != attn_module.output_size:
                raise ValueError("The output dimension of "
                                 "MultiheadEncoder should be equal "
                                 "to the dim of TransformerDecoder")
            self.enc_dec_attns.append(attn_module)
            self.end_dec_attn_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=eps))

            poswise_network = FeedForwardNetwork(
                hparams=self._hparams.poswise_feedforward)
            if (poswise_network.hparams.layers[-1]['kwargs']['out_features'] !=
                    self._hparams.dim):
                raise ValueError("The output dimension of "
                                 "FeedForwardNetwork should be equal "
                                 "to the dim of TransformerDecoder")
            self.poswise_networks.append(poswise_network)
            self.poswise_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=eps))

        self.final_layer_norm = nn.LayerNorm(self._input_size, eps=eps)
        self.embed_dropout = nn.Dropout(self._hparams.embedding_dropout)
        self.residual_dropout = nn.Dropout(self._hparams.residual_dropout)

        if self._hparams.initializer:
            # TODO: This might be different to what TensorFlow does
            initialize = layers.get_initializer(self._hparams.initializer)
            assert initialize is not None
            # Do not re-initialize LayerNorm modules.
            for name, param in self.named_parameters():
                if name.split(
                        ".")[-1] == "weight" and "layer_norm" not in name:
                    initialize(param)
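The constructor above reads the following hparams keys. The fragment below only restates keys that appear in the snippet; the values are illustrative placeholders, not the library's documented defaults.

# Keys consumed by the decoder constructor above (placeholder values).
decoder_hparams_sketch = {
    "dim": 512,               # block width; attention and FFN outputs must match it
    "num_blocks": 6,
    "use_gpt_config": False,  # True -> eps = 1e-5, False -> eps = 1e-12
    "embedding_dropout": 0.1,
    "residual_dropout": 0.1,
    "output_layer_bias": False,
    "initializer": None,      # optional spec passed to layers.get_initializer
    "multihead_attention": {"output_dim": 512},
    "poswise_feedforward": {
        "layers": [
            {"type": "torch.nn.Linear",
             "kwargs": {"in_features": 512, "out_features": 512}},  # last out_features == "dim"
        ]
    },
}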
Example #8
    def test_forward_with_nn_module(self):
        nn_module = FeedForwardNetwork(hparams=self.hparams)
        nn_magic_model = MetaModule(nn_module)

        outputs = nn_magic_model(torch.ones(64, 16, 32))
        self.assertEqual(outputs.size(-1), nn_module.output_size)
Example #9
    def __init__(self, hparams=None):
        super().__init__(hparams=hparams)
        self._input_size = self._hparams.dim
        self.self_attns = nn.ModuleList()
        if not self._hparams.use_bert_config:
            self.self_attn_layer_norm = nn.ModuleList()
        self.poswise_networks = nn.ModuleList()
        self.poswise_layer_norm = nn.ModuleList()
        self.output_layer_norm = nn.ModuleList()

        if self._hparams.use_bert_config:
            # In TensorFlow, eps for LayerNorm is 1e-12 by default.
            eps = 1e-12
        else:
            # Note: PyTorch's default eps for LayerNorm is 1e-5.
            eps = 1e-6

        for _ in range(self._hparams.num_blocks):
            mh_attn = MultiheadAttentionEncoder(
                self._input_size, self._hparams.multihead_attention)
            self.self_attns.append(mh_attn)
            if not self._hparams.use_bert_config:
                self.self_attn_layer_norm.append(
                    nn.LayerNorm(self._input_size, eps=eps))
            if self._hparams.dim != mh_attn.hparams.output_dim:
                raise ValueError(
                    'The "dim" in the hparams of '
                    '"multihead_attention" should be equal to the '
                    '"dim" of TransformerEncoder')

            pw_net = FeedForwardNetwork(
                hparams=self._hparams['poswise_feedforward'])

            final_dim = pw_net.hparams.layers[-1]['kwargs']['out_features']
            if self._hparams.dim != final_dim:
                raise ValueError('The output dimension of '
                                 '"poswise_feedforward" should be equal '
                                 'to the "dim" of TransformerEncoder.')

            self.poswise_networks.append(pw_net)
            self.poswise_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=eps))
            if self._hparams.use_bert_config:
                self.output_layer_norm.append(
                    nn.LayerNorm(self._input_size, eps=eps))

        self.embed_dropout = nn.Dropout(p=self._hparams.embedding_dropout)
        self.residual_dropout = nn.Dropout(p=self._hparams.residual_dropout)

        if self._hparams.use_bert_config:
            self.input_normalizer = nn.LayerNorm(self._input_size, eps=eps)
        else:
            self.final_layer_norm = nn.LayerNorm(self._input_size, eps=eps)

        if self._hparams.initializer:
            initialize = layers.get_initializer(self._hparams.initializer)
            assert initialize is not None
            # Do not re-initialize LayerNorm modules.
            for name, param in self.named_parameters():
                if name.split(
                        '.')[-1] == 'weight' and 'layer_norm' not in name:
                    initialize(param)