예제 #1
0
    def __init__(self,
                 input_size: int,
                 vocab_size: int,
                 token_embedder: Optional[TokenEmbedder] = None,
                 token_pos_embedder: Optional[TokenPosEmbedder] = None,
                 cell: Optional[RNNCellBase] = None,
                 output_layer: Optional[nn.Module] = None,
                 input_time_major: bool = False,
                 output_time_major: bool = False,
                 hparams=None):
        super().__init__(token_embedder, token_pos_embedder,
                         input_time_major, output_time_major, hparams=hparams)

        self._input_size = input_size
        self._vocab_size = vocab_size

        # Make RNN cell
        self._cell = cell or layers.get_rnn_cell(
            input_size, self._hparams.rnn_cell)
        self._beam_search_cell = None

        # Make the output layer
        self._output_layer, _ = _make_output_layer(
            output_layer, self._vocab_size, self._cell.hidden_size,
            self._hparams.output_layer_bias)
예제 #2
0
    def __init__(self,
                 input_size: int,
                 vocab_size: int,
                 cell: Optional[RNNCellBase] = None,
                 output_layer: Optional[nn.Module] = None,
                 input_time_major: bool = False,
                 output_time_major: bool = False,
                 hparams: Optional[HParams] = None):
        super().__init__(input_size, vocab_size, input_time_major,
                         output_time_major, hparams)

        # Make RNN cell
        self._cell = cell or layers.get_rnn_cell(input_size,
                                                 self._hparams.rnn_cell)
        self._beam_search_cell = None

        # Make the output layer
        self._output_layer, _ = _make_output_layer(
            output_layer, self._vocab_size, self._cell.hidden_size,
            self._hparams.output_layer_bias)
예제 #3
0
    def __init__(self,
                 vocab_size: Optional[int] = None,
                 output_layer: Optional[Union[nn.Module, torch.Tensor]] = None,
                 hparams: Optional[HParams] = None):
        super().__init__(
            0,
            vocab_size,  # dummy value for input_size
            input_time_major=False,
            output_time_major=False,
            hparams=hparams)
        self._input_size = self._hparams.dim

        self._output_layer, self._vocab_size = _make_output_layer(
            output_layer, vocab_size, self._input_size,
            self._hparams.output_layer_bias)

        self.self_attns = nn.ModuleList()
        self.self_attn_layer_norm = nn.ModuleList()
        self.enc_dec_attns = nn.ModuleList()
        self.end_dec_attn_layer_norm = nn.ModuleList()
        self.poswise_networks = nn.ModuleList()
        self.poswise_layer_norm = nn.ModuleList()

        if self._hparams.use_gpt_config:
            eps = 1e-5
        else:
            eps = 1e-12

        for _ in range(self._hparams.num_blocks):
            attn_module = MultiheadAttentionEncoder(
                self._input_size, self._hparams.multihead_attention)
            if self._hparams.dim != attn_module.output_size:
                raise ValueError("The output dimension of "
                                 "MultiheadEncoder should be equal "
                                 "to the dim of TransformerDecoder")
            self.self_attns.append(attn_module)
            self.self_attn_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=eps))

            attn_module = MultiheadAttentionEncoder(
                self._input_size, self._hparams.multihead_attention)
            if self._hparams.dim != attn_module.output_size:
                raise ValueError("The output dimension of "
                                 "MultiheadEncoder should be equal "
                                 "to the dim of TransformerDecoder")
            self.enc_dec_attns.append(attn_module)
            self.end_dec_attn_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=eps))

            poswise_network = FeedForwardNetwork(
                hparams=self._hparams.poswise_feedforward)
            if (poswise_network.hparams.layers[-1]['kwargs']['out_features'] !=
                    self._hparams.dim):
                raise ValueError("The output dimension of "
                                 "FeedForwardNetwork should be equal "
                                 "to the dim of TransformerDecoder")
            self.poswise_networks.append(poswise_network)
            self.poswise_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=eps))

        self.final_layer_norm = nn.LayerNorm(self._input_size, eps=eps)
        self.embed_dropout = nn.Dropout(self._hparams.embedding_dropout)
        self.residual_dropout = nn.Dropout(self._hparams.residual_dropout)

        if self._hparams.initializer:
            # TODO: This might be different to what TensorFlow does
            initialize = layers.get_initializer(self._hparams.initializer)
            assert initialize is not None
            # Do not re-initialize LayerNorm modules.
            for name, param in self.named_parameters():
                if name.split(
                        ".")[-1] == "weight" and "layer_norm" not in name:
                    initialize(param)