Exemplo n.º 1
0
    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        pos_enc_class=PositionalEncoding,
        normalize_before: bool = True,
        concat_after: bool = False,
    ):
        """Assemble the input front-end, the decoder stack and the output layer.

        Args:
            vocab_size: Number of embedding entries for ``"embed"``, input
                feature size for ``"linear"``, and output size of the
                optional output layer.
            encoder_output_size: Used as the model dimension
                (``attention_dim``) of every sub-module built here.
            attention_heads: First argument of both MultiHeadedAttention
                modules in each block.
            linear_units: Second argument of PositionwiseFeedForward.
            num_blocks: Number of DecoderLayer blocks requested from
                ``repeat``.
            dropout_rate: Dropout used by the ``"linear"`` front-end, by
                PositionwiseFeedForward and by DecoderLayer.
            positional_dropout_rate: Second argument of ``pos_enc_class``.
            self_attention_dropout_rate: Dropout of the first attention
                module given to DecoderLayer.
            src_attention_dropout_rate: Dropout of the second attention
                module given to DecoderLayer.
            input_layer: ``"embed"`` or ``"linear"``.
            use_output_layer: Whether to create the final
                ``attention_dim -> vocab_size`` linear layer.
            pos_enc_class: Callable invoked as
                ``pos_enc_class(attention_dim, positional_dropout_rate)``.
            normalize_before: Forwarded to DecoderLayer; when true an extra
                ``after_norm`` LayerNorm is also created.
            concat_after: Forwarded to DecoderLayer.

        Raises:
            ValueError: If ``input_layer`` is neither ``"embed"`` nor
                ``"linear"``.
        """
        assert check_argument_types()
        super().__init__()
        attention_dim = encoder_output_size

        # Collect the front-end stages in order; the positional encoding is
        # always the last stage and is appended once below.
        if input_layer == "embed":
            front_end = [torch.nn.Embedding(vocab_size, attention_dim)]
        elif input_layer == "linear":
            front_end = [
                torch.nn.Linear(vocab_size, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
            ]
        else:
            raise ValueError(
                f"only 'embed' or 'linear' is supported: {input_layer}")
        front_end.append(pos_enc_class(attention_dim, positional_dropout_rate))
        self.embed = torch.nn.Sequential(*front_end)

        self.normalize_before = normalize_before

        def make_block():
            # Sub-modules are created in the same order as the positional
            # arguments of DecoderLayer so parameter initialisation order
            # is unchanged.
            self_attn = MultiHeadedAttention(
                attention_heads, attention_dim, self_attention_dropout_rate)
            src_attn = MultiHeadedAttention(
                attention_heads, attention_dim, src_attention_dropout_rate)
            feed_forward = PositionwiseFeedForward(
                attention_dim, linear_units, dropout_rate)
            return DecoderLayer(
                attention_dim,
                self_attn,
                src_attn,
                feed_forward,
                dropout_rate,
                normalize_before,
                concat_after,
            )

        self.decoders = repeat(num_blocks, make_block)

        # ``after_norm`` only exists in the pre-norm configuration.
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
        self.output_layer = (
            torch.nn.Linear(attention_dim, vocab_size)
            if use_output_layer
            else None
        )
Exemplo n.º 2
0
    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        pos_enc_class=PositionalEncoding,
        normalize_before: bool = True,
        concat_after: bool = False,
        conv_wshare: int = 4,
        conv_kernel_length: Sequence[int] = (11, 11, 11, 11, 11, 11),
        conv_usebias: int = False,
    ):
        """Build a decoder whose first per-block module is DynamicConvolution2D.

        The parent initialiser receives the embedding / output-layer
        settings; this method then assigns ``self.decoders`` with one
        DecoderLayer per block.

        Args:
            vocab_size: Forwarded to the parent initialiser.
            encoder_output_size: Forwarded to the parent initialiser and used
                as the model dimension of the blocks built here.
            attention_heads: Head count of the MultiHeadedAttention module in
                each block.
            linear_units: Second argument of PositionwiseFeedForward.
            num_blocks: Number of blocks; must equal
                ``len(conv_kernel_length)``.
            dropout_rate: Forwarded to the parent initialiser, to
                PositionwiseFeedForward and to DecoderLayer.
            positional_dropout_rate: Forwarded to the parent initialiser.
            self_attention_dropout_rate: ``dropout_rate`` of each
                DynamicConvolution2D.
            src_attention_dropout_rate: Dropout of the MultiHeadedAttention
                module in each block.
            input_layer: Forwarded to the parent initialiser.
            use_output_layer: Forwarded to the parent initialiser.
            pos_enc_class: Forwarded to the parent initialiser.
            normalize_before: Forwarded to the parent initialiser and to
                DecoderLayer.
            concat_after: Forwarded to DecoderLayer.
            conv_wshare: ``wshare`` of each DynamicConvolution2D.
            conv_kernel_length: One kernel size per block, indexed by block
                number.
            conv_usebias: ``use_bias`` of each DynamicConvolution2D.
                NOTE(review): annotated ``int`` although the default is a
                boolean flag.

        Raises:
            ValueError: If ``conv_kernel_length`` does not hold exactly
                ``num_blocks`` values.
        """
        assert check_argument_types()
        # Validate before any module is created.
        kernel_count_matches = len(conv_kernel_length) == num_blocks
        if not kernel_count_matches:
            raise ValueError(
                "conv_kernel_length must have equal number of values to num_blocks: "
                f"{len(conv_kernel_length)} != {num_blocks}")

        super().__init__(
            vocab_size=vocab_size,
            encoder_output_size=encoder_output_size,
            dropout_rate=dropout_rate,
            positional_dropout_rate=positional_dropout_rate,
            input_layer=input_layer,
            use_output_layer=use_output_layer,
            pos_enc_class=pos_enc_class,
            normalize_before=normalize_before,
        )
        attention_dim = encoder_output_size

        def make_block(lnum):
            # ``lnum`` is the block index supplied by ``repeat``; it selects
            # this block's kernel size.
            conv = DynamicConvolution2D(
                wshare=conv_wshare,
                n_feat=attention_dim,
                dropout_rate=self_attention_dropout_rate,
                kernel_size=conv_kernel_length[lnum],
                use_kernel_mask=True,
                use_bias=conv_usebias,
            )
            src_attn = MultiHeadedAttention(
                attention_heads, attention_dim, src_attention_dropout_rate)
            feed_forward = PositionwiseFeedForward(
                attention_dim, linear_units, dropout_rate)
            return DecoderLayer(
                attention_dim,
                conv,
                src_attn,
                feed_forward,
                dropout_rate,
                normalize_before,
                concat_after,
            )

        self.decoders = repeat(num_blocks, make_block)
Exemplo n.º 3
0
 def __init__(self, odim,
              attention_dim=256,
              attention_heads=4,
              linear_units=2048,
              num_blocks=6,
              dropout_rate=0.1,
              positional_dropout_rate=0.1,
              self_attention_dropout_rate=0.0,
              src_attention_dropout_rate=0.0,
              input_layer="embed",
              use_output_layer=True,
              pos_enc_class=PositionalEncoding,
              normalize_before=True,
              concat_after=False):
     """Construct a Decoder object.

     Args:
         odim: Number of embedding entries for ``"embed"``, input feature
             size for ``"linear"``, and output size of the optional output
             layer.
         attention_dim: Model dimension shared by every sub-module.
         attention_heads: First argument of both MultiHeadedAttention
             modules in each block.
         linear_units: Second argument of PositionwiseFeedForward.
         num_blocks: Number of DecoderLayer blocks requested from ``repeat``.
         dropout_rate: Dropout used by the ``"linear"`` front-end, by
             PositionwiseFeedForward and by DecoderLayer.
         positional_dropout_rate: Second argument of ``pos_enc_class``.
         self_attention_dropout_rate: Dropout of the first attention module
             given to DecoderLayer.
         src_attention_dropout_rate: Dropout of the second attention module
             given to DecoderLayer.
         input_layer: ``"embed"``, ``"linear"`` or a ``torch.nn.Module``
             that is used as the first front-end stage.
         use_output_layer: Whether to create the final
             ``attention_dim -> odim`` linear layer.
         pos_enc_class: Callable invoked as
             ``pos_enc_class(attention_dim, positional_dropout_rate)``.
         normalize_before: Forwarded to DecoderLayer; when true an extra
             ``after_norm`` LayerNorm is also created.
         concat_after: Forwarded to DecoderLayer.

     Raises:
         NotImplementedError: If ``input_layer`` is none of the supported
             kinds.
     """
     torch.nn.Module.__init__(self)
     if input_layer == "embed":
         self.embed = torch.nn.Sequential(
             torch.nn.Embedding(odim, attention_dim),
             pos_enc_class(attention_dim, positional_dropout_rate)
         )
     elif input_layer == "linear":
         self.embed = torch.nn.Sequential(
             torch.nn.Linear(odim, attention_dim),
             torch.nn.LayerNorm(attention_dim),
             torch.nn.Dropout(dropout_rate),
             torch.nn.ReLU(),
             pos_enc_class(attention_dim, positional_dropout_rate)
         )
     elif isinstance(input_layer, torch.nn.Module):
         # A caller-supplied module replaces the embedding; positional
         # encoding is still appended after it.
         self.embed = torch.nn.Sequential(
             input_layer,
             pos_enc_class(attention_dim, positional_dropout_rate)
         )
     else:
         # The message lists every accepted kind ("linear" was previously
         # missing from it).
         raise NotImplementedError(
             "only `embed`, `linear` or torch.nn.Module is supported.")
     self.normalize_before = normalize_before
     self.decoders = repeat(
         num_blocks,
         lambda: DecoderLayer(
             attention_dim,
             MultiHeadedAttention(attention_heads, attention_dim, self_attention_dropout_rate),
             MultiHeadedAttention(attention_heads, attention_dim, src_attention_dropout_rate),
             PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
             dropout_rate,
             normalize_before,
             concat_after
         )
     )
     # ``after_norm`` only exists in the pre-norm configuration.
     if self.normalize_before:
         self.after_norm = LayerNorm(attention_dim)
     if use_output_layer:
         self.output_layer = torch.nn.Linear(attention_dim, odim)
     else:
         self.output_layer = None
Exemplo n.º 4
0
    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        pos_enc_class=PositionalEncoding,
        normalize_before: bool = True,
        concat_after: bool = False,
    ):
        """Initialise the parent decoder, then build the attention block stack.

        Args:
            vocab_size: Forwarded to the parent initialiser.
            encoder_output_size: Forwarded to the parent initialiser and used
                as the model dimension of the blocks built here.
            attention_heads: First argument of both MultiHeadedAttention
                modules in each block.
            linear_units: Second argument of PositionwiseFeedForward.
            num_blocks: Number of DecoderLayer blocks requested from
                ``repeat``.
            dropout_rate: Forwarded to the parent initialiser, to
                PositionwiseFeedForward and to DecoderLayer.
            positional_dropout_rate: Forwarded to the parent initialiser.
            self_attention_dropout_rate: Dropout of the first attention
                module given to DecoderLayer.
            src_attention_dropout_rate: Dropout of the second attention
                module given to DecoderLayer.
            input_layer: Forwarded to the parent initialiser.
            use_output_layer: Forwarded to the parent initialiser.
            pos_enc_class: Forwarded to the parent initialiser.
            normalize_before: Forwarded to the parent initialiser and to
                DecoderLayer.
            concat_after: Forwarded to DecoderLayer.
        """
        assert check_argument_types()
        super().__init__(
            vocab_size=vocab_size,
            encoder_output_size=encoder_output_size,
            dropout_rate=dropout_rate,
            positional_dropout_rate=positional_dropout_rate,
            input_layer=input_layer,
            use_output_layer=use_output_layer,
            pos_enc_class=pos_enc_class,
            normalize_before=normalize_before,
        )

        attention_dim = encoder_output_size

        def make_block(_lnum):
            # ``repeat`` passes the block index; every block is configured
            # identically, so the index is ignored.
            self_attn = MultiHeadedAttention(
                attention_heads, attention_dim, self_attention_dropout_rate)
            src_attn = MultiHeadedAttention(
                attention_heads, attention_dim, src_attention_dropout_rate)
            feed_forward = PositionwiseFeedForward(
                attention_dim, linear_units, dropout_rate)
            return DecoderLayer(
                attention_dim,
                self_attn,
                src_attn,
                feed_forward,
                dropout_rate,
                normalize_before,
                concat_after,
            )

        self.decoders = repeat(num_blocks, make_block)
Exemplo n.º 5
0
 def __init__(self, odim, args):
     """Build embedding, decoder stack, output norm and output projection.

     Args:
         odim: Number of embedding entries and output size of the final
             linear layer.
         args: Object providing ``adim``, ``dropout_rate``, ``dlayers``,
             ``aheads``, ``transformer_attn_dropout_rate`` and ``dunits``.
     """
     super(Decoder, self).__init__()

     token_embedding = torch.nn.Embedding(odim, args.adim)
     position_encoding = PositionalEncoding(args.adim, args.dropout_rate)
     self.embed = torch.nn.Sequential(token_embedding, position_encoding)

     def make_block():
         # Both attention modules share the same head count and dropout
         # setting; they are still two separate module instances.
         first_attn = MultiHeadedAttention(
             args.aheads, args.adim, args.transformer_attn_dropout_rate)
         second_attn = MultiHeadedAttention(
             args.aheads, args.adim, args.transformer_attn_dropout_rate)
         feed_forward = PositionwiseFeedForward(
             args.adim, args.dunits, args.dropout_rate)
         return DecoderLayer(
             args.adim,
             first_attn,
             second_attn,
             feed_forward,
             args.dropout_rate,
         )

     self.decoders = repeat(args.dlayers, make_block)
     self.output_norm = LayerNorm(args.adim)
     self.output_layer = torch.nn.Linear(args.adim, odim)
Exemplo n.º 6
0
    def __init__(
        self,
        odim,
        selfattention_layer_type="selfattn",
        attention_dim=256,
        attention_heads=4,
        conv_wshare=4,
        conv_kernel_length=11,
        conv_usebias=False,
        linear_units=2048,
        num_blocks=6,
        dropout_rate=0.1,
        positional_dropout_rate=0.1,
        self_attention_dropout_rate=0.0,
        src_attention_dropout_rate=0.0,
        input_layer="embed",
        use_output_layer=True,
        pos_enc_class=PositionalEncoding,
        normalize_before=True,
        concat_after=False,
    ):
        """Construct a Decoder object.

        Args:
            odim: Number of embedding entries for ``"embed"``, input feature
                size for ``"linear"``, and output size of the optional output
                layer.
            selfattention_layer_type: One of ``"selfattn"``, ``"lightconv"``,
                ``"lightconv2d"``, ``"dynamicconv"`` or ``"dynamicconv2d"``;
                selects the first module given to each DecoderLayer.
            attention_dim: Model dimension shared by every sub-module.
            attention_heads: Head count of the MultiHeadedAttention modules.
            conv_wshare: First argument of the convolution module (used only
                by the convolution layer types).
            conv_kernel_length: Kernel sizes for the convolution layer types:
                either a single int applied to every block, or a string of
                ``"_"``-separated ints with one entry per block.
            conv_usebias: Last argument of the convolution module.
            linear_units: Second argument of PositionwiseFeedForward.
            num_blocks: Number of DecoderLayer blocks requested from
                ``repeat``.
            dropout_rate: Dropout used by the ``"linear"`` front-end, by
                PositionwiseFeedForward and by DecoderLayer.
            positional_dropout_rate: Second argument of ``pos_enc_class``.
            self_attention_dropout_rate: Dropout of the self-attention or
                convolution module.
            src_attention_dropout_rate: Dropout of the second attention
                module given to DecoderLayer.
            input_layer: ``"embed"``, ``"linear"`` or a ``torch.nn.Module``
                used as the first front-end stage.
            use_output_layer: Whether to create the final
                ``attention_dim -> odim`` linear layer.
            pos_enc_class: Callable invoked as
                ``pos_enc_class(attention_dim, positional_dropout_rate)``.
            normalize_before: Forwarded to DecoderLayer; when true an extra
                ``after_norm`` LayerNorm is also created.
            concat_after: Forwarded to DecoderLayer.

        Raises:
            NotImplementedError: If ``input_layer`` or
                ``selfattention_layer_type`` is not supported.
            IndexError: If a ``conv_kernel_length`` string has fewer entries
                than ``num_blocks``.
        """
        torch.nn.Module.__init__(self)
        self._register_load_state_dict_pre_hook(_pre_hook)
        if input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(odim, attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(odim, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif isinstance(input_layer, torch.nn.Module):
            self.embed = torch.nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate))
        else:
            # The message lists every accepted kind ("linear" was previously
            # missing from it).
            raise NotImplementedError(
                "only `embed`, `linear` or torch.nn.Module is supported.")
        self.normalize_before = normalize_before

        # Convolution layer types share one argument layout, so they are
        # described by a table: type name -> (module class, log message).
        conv_layer_types = {
            "lightconv": (
                LightweightConvolution,
                "decoder self-attention layer type = lightweight convolution",
            ),
            "lightconv2d": (
                LightweightConvolution2D,
                "decoder self-attention layer "
                "type = lightweight convolution 2-dimentional",
            ),
            "dynamicconv": (
                DynamicConvolution,
                "decoder self-attention layer type = dynamic convolution",
            ),
            "dynamicconv2d": (
                DynamicConvolution2D,
                "decoder self-attention layer type = dynamic convolution 2-dimentional",
            ),
        }

        # self-attention module definition
        if selfattention_layer_type == "selfattn":
            logging.info("decoder self-attention layer type = self-attention")
            decoder_selfattn_layer = MultiHeadedAttention
            decoder_selfattn_layer_args = [(
                attention_heads,
                attention_dim,
                self_attention_dropout_rate,
            )] * num_blocks
        elif selfattention_layer_type in conv_layer_types:
            decoder_selfattn_layer, log_message = conv_layer_types[
                selfattention_layer_type]
            logging.info(log_message)
            if isinstance(conv_kernel_length, int):
                # The signature default is a plain int, which has no
                # ``split``; use the same kernel size for every block.
                kernel_lengths = [conv_kernel_length] * num_blocks
            else:
                kernel_fields = conv_kernel_length.split("_")
                kernel_lengths = [
                    int(kernel_fields[lnum]) for lnum in range(num_blocks)
                ]
            decoder_selfattn_layer_args = [(
                conv_wshare,
                attention_dim,
                self_attention_dropout_rate,
                kernel_length,
                True,
                conv_usebias,
            ) for kernel_length in kernel_lengths]
        else:
            # Fail here with a clear message instead of a NameError when the
            # first block is built.
            raise NotImplementedError(
                "unsupported selfattention_layer_type: {}".format(
                    selfattention_layer_type))

        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                decoder_selfattn_layer(*decoder_selfattn_layer_args[lnum]),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
        self.selfattention_layer_type = selfattention_layer_type
        # ``after_norm`` only exists in the pre-norm configuration.
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
        if use_output_layer:
            self.output_layer = torch.nn.Linear(attention_dim, odim)
        else:
            self.output_layer = None
Exemplo n.º 7
0
 def __init__(
     self,
     odim,
     selfattention_layer_type="selfattn",
     attention_dim=256,
     attention_heads=4,
     conv_wshare=4,
     conv_kernel_length=11,
     conv_usebias=False,
     linear_units=2048,
     num_blocks=6,
     dropout_rate=0.1,
     positional_dropout_rate=0.1,
     self_attention_dropout_rate=0.0,
     src_attention_dropout_rate=0.0,
     input_layer="embed",
     use_output_layer=True,
     pos_enc_class=PositionalEncoding,
     normalize_before=True,
     concat_after=False,
     attention_type="self_attn",
     max_attn_span=[None],
     span_init=0,
     span_ratio=0.5,
     ratio_adaptive=False
 ):
     """Construct a Decoder object.

     Args:
         odim: Number of embedding entries for ``"embed"``, input feature
             size for ``"linear"``, and output size of the optional output
             layer.
         selfattention_layer_type: One of ``"selfattn"``, ``"lightconv"``,
             ``"lightconv2d"``, ``"dynamicconv"`` or ``"dynamicconv2d"``;
             selects the first module given to each DecoderLayer.
         attention_dim: Model dimension shared by every sub-module.
         attention_heads: Head count of the attention modules.
         conv_wshare: First argument of the convolution module (used only
             by the convolution layer types).
         conv_kernel_length: Passed unchanged, together with the block
             index, to the convolution module.
         conv_usebias: ``use_bias`` of the convolution module.
         linear_units: Second argument of PositionwiseFeedForward.
         num_blocks: Number of DecoderLayer blocks requested from ``repeat``.
         dropout_rate: Dropout used by the ``"linear"`` front-end, by
             PositionwiseFeedForward and by DecoderLayer.
         positional_dropout_rate: Second argument of ``pos_enc_class``.
         self_attention_dropout_rate: Dropout of the self-attention or
             convolution module.
         src_attention_dropout_rate: Dropout of the second attention module
             given to DecoderLayer.
         input_layer: ``"embed"``, ``"linear"`` or a ``torch.nn.Module``
             used as the first front-end stage.
         use_output_layer: Whether to create the final
             ``attention_dim -> odim`` linear layer.
         pos_enc_class: Callable invoked as
             ``pos_enc_class(attention_dim, positional_dropout_rate)``.
         normalize_before: Forwarded to DecoderLayer; when true an extra
             ``after_norm`` LayerNorm is also created.
         concat_after: Forwarded to DecoderLayer.
         attention_type: Forwarded to ``multi_headed_attention`` (used only
             for ``"selfattn"``).
         max_attn_span: Sequence of per-block ``max_span`` values; blocks
             beyond its length reuse the last entry. The default list is
             only read, never mutated.
         span_init: Forwarded to ``multi_headed_attention``.
         span_ratio: Forwarded to ``multi_headed_attention``.
         ratio_adaptive: Forwarded to ``multi_headed_attention``.

     Raises:
         NotImplementedError: If ``input_layer`` or
             ``selfattention_layer_type`` is not supported.
     """
     torch.nn.Module.__init__(self)
     self._register_load_state_dict_pre_hook(_pre_hook)
     if input_layer == "embed":
         self.embed = torch.nn.Sequential(
             torch.nn.Embedding(odim, attention_dim),
             pos_enc_class(attention_dim, positional_dropout_rate),
         )
     elif input_layer == "linear":
         self.embed = torch.nn.Sequential(
             torch.nn.Linear(odim, attention_dim),
             torch.nn.LayerNorm(attention_dim),
             torch.nn.Dropout(dropout_rate),
             torch.nn.ReLU(),
             pos_enc_class(attention_dim, positional_dropout_rate),
         )
     elif isinstance(input_layer, torch.nn.Module):
         self.embed = torch.nn.Sequential(
             input_layer, pos_enc_class(attention_dim, positional_dropout_rate)
         )
     else:
         # The message lists every accepted kind ("linear" was previously
         # missing from it).
         raise NotImplementedError(
             "only `embed`, `linear` or torch.nn.Module is supported."
         )
     self.normalize_before = normalize_before

     # The four convolution variants differ only in module class and log
     # line, so they are described by a table: name -> (class, message).
     conv_layer_types = {
         "lightconv": (
             LightweightConvolution,
             "decoder self-attention layer type = lightweight convolution",
         ),
         "lightconv2d": (
             LightweightConvolution2D,
             "decoder self-attention layer "
             "type = lightweight convolution 2-dimentional",
         ),
         "dynamicconv": (
             DynamicConvolution,
             "decoder self-attention layer type = dynamic convolution",
         ),
         "dynamicconv2d": (
             DynamicConvolution2D,
             "decoder self-attention layer type = dynamic convolution 2-dimentional",
         ),
     }

     if selfattention_layer_type == "selfattn":
         logging.info("decoder self-attention layer type = self-attention")

         def build_self_attn(lnum):
             # Clamp the block index so a short ``max_attn_span`` reuses
             # its last entry for the remaining blocks.
             span_index = min(len(max_attn_span) - 1, lnum)
             return multi_headed_attention(
                 attention_heads,
                 attention_dim,
                 self_attention_dropout_rate,
                 attention_type,
                 max_span=max_attn_span[span_index],
                 span_init=span_init,
                 span_ratio=span_ratio,
                 ratio_adaptive=ratio_adaptive,
                 causal_flag=True,
             )
     elif selfattention_layer_type in conv_layer_types:
         conv_class, log_message = conv_layer_types[selfattention_layer_type]
         logging.info(log_message)

         def build_self_attn(lnum):
             return conv_class(
                 conv_wshare,
                 attention_dim,
                 self_attention_dropout_rate,
                 conv_kernel_length,
                 lnum,
                 use_kernel_mask=True,
                 use_bias=conv_usebias,
             )
     else:
         # Previously an unknown type fell through and left ``self.decoders``
         # undefined; fail at construction time instead.
         raise NotImplementedError(
             "unsupported selfattention_layer_type: {}".format(
                 selfattention_layer_type
             )
         )

     self.decoders = repeat(
         num_blocks,
         lambda lnum: DecoderLayer(
             attention_dim,
             build_self_attn(lnum),
             MultiHeadedAttention(
                 attention_heads, attention_dim, src_attention_dropout_rate
             ),
             PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
             dropout_rate,
             normalize_before,
             concat_after,
         ),
     )
     self.selfattention_layer_type = selfattention_layer_type
     # ``after_norm`` only exists in the pre-norm configuration.
     if self.normalize_before:
         self.after_norm = LayerNorm(attention_dim)
     if use_output_layer:
         self.output_layer = torch.nn.Linear(attention_dim, odim)
     else:
         self.output_layer = None
    def __init__(self,
                 odim,
                 attention_dim=256,
                 attention_heads=4,
                 linear_units=2048,
                 num_blocks=6,
                 dropout_rate=0.1,
                 positional_dropout_rate=0.1,
                 self_attention_dropout_rate=0.0,
                 src_attention_dropout_rate=0.0,
                 input_layer="embed",
                 use_output_layer=True,
                 pos_enc_class=PositionalEncoding,
                 normalize_before=True,
                 concat_after=False,
                 cross_operator=None,
                 cross_shared=False,
                 cross_weight_learnable=False,
                 cross_weight=0.0):
        """Construct a Decoder object with optional cross attention modules.

        Args:
            odim: Number of embedding entries for ``"embed"``, input feature
                size for ``"linear"``, and output size of the optional output
                layer.
            attention_dim: Model dimension shared by every sub-module.
            attention_heads: Head count of every MultiHeadedAttention module.
            linear_units: Second argument of PositionwiseFeedForward.
            num_blocks: Number of DecoderLayer blocks requested from
                ``repeat``.
            dropout_rate: Dropout used by the ``"linear"`` front-end, by
                PositionwiseFeedForward and by DecoderLayer.
            positional_dropout_rate: Second argument of ``pos_enc_class``.
            self_attention_dropout_rate: Dropout of the first attention
                module and of both optional cross attention modules.
            src_attention_dropout_rate: Dropout of the second attention
                module given to DecoderLayer.
            input_layer: ``"embed"``, ``"linear"`` or a ``torch.nn.Module``
                used as the first front-end stage.
            use_output_layer: Whether to create the final
                ``attention_dim -> odim`` linear layer.
            pos_enc_class: Callable invoked as
                ``pos_enc_class(attention_dim, positional_dropout_rate)``.
            normalize_before: Forwarded to DecoderLayer; when true an extra
                ``after_norm`` LayerNorm is also created.
            concat_after: Forwarded to DecoderLayer.
            cross_operator: Falsy to disable cross attention. Otherwise a
                string that must contain ``"concat"`` or ``"sum"`` and may
                contain ``"src_"`` and/or ``"self_"`` to enable the
                corresponding cross attention module. DecoderLayer receives
                the normalised value ``"concat"`` or ``"sum"``.
            cross_shared: Forwarded to DecoderLayer; it does not change which
                modules are built here.
            cross_weight_learnable: Forwarded to DecoderLayer.
            cross_weight: Forwarded to DecoderLayer.

        Raises:
            NotImplementedError: If ``input_layer`` is not supported, or if a
                truthy ``cross_operator`` contains neither ``"concat"`` nor
                ``"sum"``.
        """
        torch.nn.Module.__init__(self)
        if input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(odim, attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate))
        elif input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(odim, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate), torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate))
        elif isinstance(input_layer, torch.nn.Module):
            self.embed = torch.nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate))
        else:
            # The message lists every accepted kind ("linear" was previously
            # missing from it).
            raise NotImplementedError(
                "only `embed`, `linear` or torch.nn.Module is supported.")
        self.normalize_before = normalize_before

        # Flags deciding whether each block gets its own cross attention
        # modules; the modules themselves are created per block below.
        use_cross_self_attn = False
        use_cross_src_attn = False
        if cross_operator:
            use_cross_src_attn = 'src_' in cross_operator
            # The shared and non-shared cases build the same module here, so
            # ``cross_shared`` is not consulted.
            use_cross_self_attn = 'self_' in cross_operator
            if 'concat' in cross_operator:
                cross_operator = 'concat'
            elif 'sum' in cross_operator:
                cross_operator = 'sum'
            else:
                raise NotImplementedError(
                    "cross_operator must contain 'concat' or 'sum': "
                    "{}".format(cross_operator))

        # NOTE(review): the cross *source* attention module is built with
        # self_attention_dropout_rate, not src_attention_dropout_rate;
        # kept as in the original -- confirm this is intended.
        self.decoders = repeat(
            num_blocks, lambda: DecoderLayer(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     self_attention_dropout_rate),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
                cross_self_attn=MultiHeadedAttention(
                    attention_heads, attention_dim, self_attention_dropout_rate
                ) if use_cross_self_attn else None,
                cross_src_attn=MultiHeadedAttention(
                    attention_heads, attention_dim, self_attention_dropout_rate
                ) if use_cross_src_attn else None,
                cross_operator=cross_operator,
                cross_shared=cross_shared,
                cross_weight_learnable=cross_weight_learnable,
                cross_weight=cross_weight))

        # ``after_norm`` only exists in the pre-norm configuration.
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
        if use_output_layer:
            self.output_layer = torch.nn.Linear(attention_dim, odim)
        else:
            self.output_layer = None