Example #1
 def __init__(self,
              idim,
              time_len=8,
              mem_len=0,
              ext_len=0,
              future_len=0,
              attention_type="memory",
              attention_dim=256,
              attention_heads=4,
              linear_units=2048,
              num_blocks=6,
              dropout_rate=0.1,
              positional_dropout_rate=0.1,
              attention_dropout_rate=0.0,
              input_layer="conv2d",
              pos_enc_class=PositionalEncoding,
              normalize_before=True,
              concat_after=False):
     super(Encoder, self).__init__()
     self.idim = idim
     self.time_len = time_len
     self.future_len = future_len
     self.attention_dim = attention_dim
     self.attention_heads = attention_heads
     self.linear_units = linear_units
     self.dropout_rate = dropout_rate
     self.input_layer = input_layer
     self.normalize_before = normalize_before
     self.concat_after = concat_after
     self.attention_type = attention_type
     self.positional_dropout_rate = positional_dropout_rate
     self.pos_enc_class = pos_enc_class
     self._generateInputLayer()
     if attention_type == "memory":
         self.encoders = repeat(
             num_blocks, lambda:
             EncoderLayerXL(n_head=attention_heads,
                            d_model=attention_dim,
                            d_head=attention_dim // attention_heads,
                            ext_len=ext_len,
                            mem_len=mem_len,
                            future_len=future_len,
                            dropout=dropout_rate,
                            dropatt=attention_dropout_rate,
                            pre_lnorm=normalize_before,
                            pos_ff=PositionwiseFeedForward(
                                attention_dim, linear_units, dropout_rate)))
     elif attention_type == "traditional":
         self.encoders = repeat(
             num_blocks, lambda: EncoderLayerTD(
                 attention_dim,
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      attention_dropout_rate),
                 PositionwiseFeedForward(attention_dim, linear_units,
                                         dropout_rate), dropout_rate,
                 normalize_before, concat_after))
     else:
          raise ValueError("only memory or traditional can be used")
     if self.normalize_before:
         self.after_norm = LayerNorm(attention_dim)
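Every example in this collection stacks its layers with ESPnet's repeat helper. As a reference for reading the snippets below, here is a minimal sketch of what such a helper might look like, assuming a MultiSequential container that threads the (x, mask)-style argument tuples through its children; the arity probing is illustrative only, added because some call sites here pass a zero-argument factory (lambda: ...) while others pass a factory taking the layer index (lambda lnum: ...).

import inspect

import torch


class MultiSequential(torch.nn.Sequential):
    """Sequential container that forwards several arguments through each layer."""

    def forward(self, *args):
        for module in self:
            args = module(*args)
        return args


def repeat(num_layers, layer_factory):
    """Sketch: build num_layers modules from layer_factory and chain them."""
    # Accept both zero-argument factories and factories expecting the layer index.
    if len(inspect.signature(layer_factory).parameters) == 0:
        layers = [layer_factory() for _ in range(num_layers)]
    else:
        layers = [layer_factory(lnum) for lnum in range(num_layers)]
    return MultiSequential(*layers)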
Example #2
def build_conformer_block(
    block: Dict[str, Any],
    self_attn_class: str,
    pw_layer_type: str,
    pw_activation_type: str,
    conv_mod_activation_type: str,
) -> ConformerEncoderLayer:
    """Build function for conformer block.

    Args:
        block: Conformer block parameters.
        self_attn_class: Self-attention module class.
        pw_layer_type: Positionwise layer type.
        pw_activation_type: Positionwise activation type.
        conv_mod_activation_type: Convolutional module activation type.

    Returns:
        : Function to create conformer (encoder) block.

    """
    d_hidden = block["d_hidden"]
    d_ff = block["d_ff"]

    dropout_rate = block.get("dropout-rate", 0.0)
    pos_dropout_rate = block.get("pos-dropout-rate", 0.0)
    att_dropout_rate = block.get("att-dropout-rate", 0.0)

    if pw_layer_type == "linear":
        pw_layer = PositionwiseFeedForward(
            d_hidden,
            d_ff,
            pos_dropout_rate,
            get_activation(pw_activation_type),
        )
    else:
        raise NotImplementedError("Conformer block only supports linear yet.")

    macaron_net = (PositionwiseFeedForward(
        d_hidden,
        d_ff,
        pos_dropout_rate,
        get_activation(pw_activation_type),
    ) if block["macaron_style"] else None)

    conv_mod = (ConvolutionModule(
        d_hidden,
        block["conv_mod_kernel"],
        get_activation(conv_mod_activation_type),
    ) if block["use_conv_mod"] else None)

    return lambda: ConformerEncoderLayer(
        d_hidden,
        self_attn_class(block["heads"], d_hidden, att_dropout_rate),
        pw_layer,
        macaron_net,
        conv_mod,
        dropout_rate,
    )
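A minimal usage sketch for build_conformer_block above. The dictionary keys mirror the lookups inside the function; the attention class and the activation names are assumptions chosen for illustration, not something the snippet itself prescribes.

# Hypothetical block description; keys follow the lookups in build_conformer_block.
block_conf = {
    "d_hidden": 256,
    "d_ff": 1024,
    "heads": 4,
    "macaron_style": True,
    "use_conv_mod": True,
    "conv_mod_kernel": 31,
    "dropout-rate": 0.1,
    "att-dropout-rate": 0.1,
}

build_fn = build_conformer_block(
    block_conf,
    self_attn_class=RelPositionMultiHeadedAttention,  # assumed attention class
    pw_layer_type="linear",
    pw_activation_type="swish",        # assumed activation name
    conv_mod_activation_type="swish",  # assumed activation name
)

conformer_layer = build_fn()  # one ConformerEncoderLayer instance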
Example #3
 def _generateEncoderLayer(self):
     subsize = 1
     if self.input_layer == "conv2d":
         subsize = 4
         subsample = None
     else:
         if self.subtype == "max":
             subsample = PoolSubsampling()
         elif self.subtype == "average":
             subsample = PoolSubsampling(type="average")
         else:
             subsample = NormalSubsamplingPos()
     for i in range(self.num_blocks):
         if self.input_layer != "conv2d":
             for sp in self.subpos:
                 if sp == i:
                     self.encoders.append(subsample)
                     subsize *= 2
         self.encoders.append(
             EncoderLayer(n_head=self.attention_heads,
                          d_model=self.attention_dim,
                          d_head=self.attention_dim // self.attention_heads,
                          att_type=self.att_type,
                          ext_len=self.hop_len // subsize,
                          mem_len=self.mem_len // subsize,
                          tgt_len=self.center_len,
                          future_len=self.right_len,
                          rel_pos=self.rel_pos,
                          dropout=self.dropout_rate,
                          dropatt=self.attention_dropout_rate,
                          pre_lnorm=self.normalize_before,
                          pos_ff=PositionwiseFeedForward(
                              self.attention_dim, self.linear_units,
                              self.dropout_rate)))
Example #4
    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        pos_enc_class=PositionalEncoding,
        normalize_before: bool = True,
        concat_after: bool = False,
    ):
        assert check_argument_types()
        super().__init__()
        attention_dim = encoder_output_size

        if input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(vocab_size, attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(vocab_size, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        else:
            raise ValueError(
                f"only 'embed' or 'linear' is supported: {input_layer}")

        self.normalize_before = normalize_before
        self.decoders = repeat(
            num_blocks,
            lambda: DecoderLayer(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     self_attention_dropout_rate),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
        if use_output_layer:
            self.output_layer = torch.nn.Linear(attention_dim, vocab_size)
        else:
            self.output_layer = None
Example #5
    def __init__(
        self,
        odim,
        jdim,
        attention_dim=512,
        attention_heads=4,
        linear_units=2048,
        num_blocks=6,
        dropout_rate=0.1,
        positional_dropout_rate=0.0,
        attention_dropout_rate=0.0,
        input_layer="embed",
        pos_enc_class=PositionalEncoding,
        blank=0,
    ):
        """Construct a Decoder object for transformer-transducer models."""
        torch.nn.Module.__init__(self)

        if input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(odim, attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(odim, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif isinstance(input_layer, torch.nn.Module):
            self.embed = torch.nn.Sequential(
                input_layer, pos_enc_class(attention_dim, positional_dropout_rate)
            )
        else:
            raise NotImplementedError("only `embed` or torch.nn.Module is supported.")

        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                MultiHeadedAttention(
                    attention_heads, attention_dim, attention_dropout_rate
                ),
                PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
                dropout_rate,
            ),
        )

        self.after_norm = LayerNorm(attention_dim)

        self.lin_enc = torch.nn.Linear(attention_dim, jdim)
        self.lin_dec = torch.nn.Linear(attention_dim, jdim, bias=False)
        self.lin_out = torch.nn.Linear(jdim, odim)

        self.attention_dim = attention_dim
        self.odim = odim

        self.blank = blank
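The lin_enc, lin_dec, and lin_out projections created above are typically combined in a transducer joint network. The forward pass is not part of this snippet, so the following is only a sketch of how such a joint computation usually looks; the tanh activation and the broadcasting shapes are assumptions.

import torch


def joint(self, h_enc: torch.Tensor, h_dec: torch.Tensor) -> torch.Tensor:
    """Sketch of a transducer joint network using the projections above.

    h_enc: encoder states of shape (batch, T, attention_dim).
    h_dec: prediction-network states of shape (batch, U, attention_dim).
    Returns logits of shape (batch, T, U, odim).
    """
    # Broadcast encoder frames against prediction states in the joint space.
    z = torch.tanh(self.lin_enc(h_enc).unsqueeze(2) + self.lin_dec(h_dec).unsqueeze(1))
    return self.lin_out(z)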
Example #6
    def __init__(self, idim, args):
        super(Encoder, self).__init__()
        if args.transformer_input_layer == "linear":
            self.input_layer = torch.nn.Sequential(
                torch.nn.Linear(idim, args.adim),
                torch.nn.LayerNorm(args.adim),
                torch.nn.Dropout(args.dropout_rate), torch.nn.ReLU(),
                PositionalEncoding(args.adim, args.dropout_rate))
        elif args.transformer_input_layer == "conv2d":
            self.input_layer = Conv2dSubsampling(idim, args.adim,
                                                 args.dropout_rate)
        elif args.transformer_input_layer == "embed":
            self.input_layer = torch.nn.Sequential(
                torch.nn.Embedding(idim, args.adim),
                PositionalEncoding(args.adim, args.dropout_rate))
        else:
            raise ValueError("unknown input_layer: " +
                             args.transformer_input_layer)

        self.encoders = repeat(
            args.elayers, lambda: EncoderLayer(
                args.adim,
                MultiHeadedAttention(args.aheads, args.adim,
                                     args.transformer_attn_dropout_rate),
                PositionwiseFeedForward(args.adim, args.eunits,
                                        args.dropout_rate),
                args.dropout_rate))
        self.norm = LayerNorm(args.adim)
Example #7
 def __init__(self,
              odim,
              attention_dim=256,
              attention_heads=4,
              linear_units=2048,
              num_blocks=6,
              dropout_rate=0.1,
              positional_dropout_rate=0.1,
              self_attention_dropout_rate=0.0,
              src_attention_dropout_rate=0.0,
              input_layer="embed",
              use_output_layer=True,
              pos_enc_class=PositionalEncoding,
              normalize_before=True,
              concat_after=False,
              moe_att_mode='linear'):
     """Construct an Decoder object."""
     torch.nn.Module.__init__(self)
     if input_layer == "embed":
         self.embed = torch.nn.Sequential(
             torch.nn.Embedding(odim, attention_dim),
             pos_enc_class(attention_dim, positional_dropout_rate))
     elif input_layer == "linear":
         self.embed = torch.nn.Sequential(
             torch.nn.Linear(odim, attention_dim),
             torch.nn.LayerNorm(attention_dim),
             torch.nn.Dropout(dropout_rate), torch.nn.ReLU(),
             pos_enc_class(attention_dim, positional_dropout_rate))
     elif isinstance(input_layer, torch.nn.Module):
         self.embed = torch.nn.Sequential(
             input_layer,
             pos_enc_class(attention_dim, positional_dropout_rate))
     else:
         raise NotImplementedError(
             "only `embed` or torch.nn.Module is supported.")
     self.normalize_before = normalize_before
     self.decoders = repeat(
         num_blocks, lambda: HANDecoderLayer(
             attention_dim,
             MultiHeadedAttention(attention_heads, attention_dim,
                                  self_attention_dropout_rate),
             MultiHeadedAttention(attention_heads, attention_dim,
                                  src_attention_dropout_rate),
             MultiHeadedAttention(attention_heads, attention_dim,
                                  src_attention_dropout_rate),
             PositionwiseFeedForward(attention_dim, linear_units,
                                     dropout_rate),
             dropout_rate=dropout_rate,
             moe_att_mode=moe_att_mode,
             normalize_before=normalize_before,
             concat_after=concat_after,
         ))
     if self.normalize_before:
         self.after_norm = LayerNorm(attention_dim)
     if use_output_layer:
         self.output_layer = torch.nn.Linear(attention_dim, odim)
     else:
         self.output_layer = None
Example #8
    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        pos_enc_class=PositionalEncoding,
        normalize_before: bool = True,
        concat_after: bool = False,
        conv_wshare: int = 4,
        conv_kernel_length: Sequence[int] = (11, 11, 11, 11, 11, 11),
        conv_usebias: bool = False,
    ):
        assert check_argument_types()
        if len(conv_kernel_length) != num_blocks:
            raise ValueError(
                "conv_kernel_length must have equal number of values to num_blocks: "
                f"{len(conv_kernel_length)} != {num_blocks}")
        super().__init__(
            vocab_size=vocab_size,
            encoder_output_size=encoder_output_size,
            dropout_rate=dropout_rate,
            positional_dropout_rate=positional_dropout_rate,
            input_layer=input_layer,
            use_output_layer=use_output_layer,
            pos_enc_class=pos_enc_class,
            normalize_before=normalize_before,
        )
        attention_dim = encoder_output_size

        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                DynamicConvolution2D(
                    wshare=conv_wshare,
                    n_feat=attention_dim,
                    dropout_rate=self_attention_dropout_rate,
                    kernel_size=conv_kernel_length[lnum],
                    use_kernel_mask=True,
                    use_bias=conv_usebias,
                ),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
Example #9
    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        pos_enc_class=PositionalEncoding,
        normalize_before: bool = True,
        concat_after: bool = False,
    ):
        assert check_argument_types()
        super().__init__(
            vocab_size=vocab_size,
            encoder_output_size=encoder_output_size,
            dropout_rate=dropout_rate,
            positional_dropout_rate=positional_dropout_rate,
            input_layer=input_layer,
            use_output_layer=use_output_layer,
            pos_enc_class=pos_enc_class,
            normalize_before=normalize_before,
        )

        attention_dim = encoder_output_size
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     self_attention_dropout_rate),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
Example #10
 def __init__(self, odim, args):
     super(Decoder, self).__init__()
     self.embed = torch.nn.Sequential(
         torch.nn.Embedding(odim, args.adim),
         PositionalEncoding(args.adim, args.dropout_rate)
     )
     self.decoders = repeat(
         args.dlayers,
         lambda: DecoderLayer(
             args.adim,
             MultiHeadedAttention(args.aheads, args.adim, args.transformer_attn_dropout_rate),
             MultiHeadedAttention(args.aheads, args.adim, args.transformer_attn_dropout_rate),
             PositionwiseFeedForward(args.adim, args.dunits, args.dropout_rate),
             args.dropout_rate
         )
     )
     self.output_norm = LayerNorm(args.adim)
     self.output_layer = torch.nn.Linear(args.adim, odim)
Example #11
 def __init__(self,
              idim,
              attention_dim=256,
              attention_heads=4,
              linear_units=2048,
              num_blocks=6,
              dropout_rate=0.1,
              positional_dropout_rate=0.1,
              attention_dropout_rate=0.0,
              input_layer="conv2d",
              pos_enc_class=PositionalEncoding,
              normalize_before=True,
              concat_after=False):
     super(Encoder, self).__init__()
     if input_layer == "linear":
         self.embed = torch.nn.Sequential(
             torch.nn.Linear(idim, attention_dim),
             torch.nn.LayerNorm(attention_dim),
             torch.nn.Dropout(dropout_rate), torch.nn.ReLU(),
             pos_enc_class(attention_dim, positional_dropout_rate))
     elif input_layer == "conv2d":
         self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
     elif input_layer == "embed":
         self.embed = torch.nn.Sequential(
             torch.nn.Embedding(idim, attention_dim),
             pos_enc_class(attention_dim, positional_dropout_rate))
     elif isinstance(input_layer, torch.nn.Module):
         self.embed = torch.nn.Sequential(
             input_layer,
             pos_enc_class(attention_dim, positional_dropout_rate),
         )
     else:
         raise ValueError("unknown input_layer: " + input_layer)
     self.normalize_before = normalize_before
     self.encoders = repeat(
         num_blocks, lambda: EncoderLayer(
             attention_dim,
             MultiHeadedAttention(attention_heads, attention_dim,
                                  attention_dropout_rate),
             PositionwiseFeedForward(attention_dim, linear_units,
                                     dropout_rate), dropout_rate,
             normalize_before, concat_after))
     if self.normalize_before:
         self.after_norm = LayerNorm(attention_dim)
Example #12
def build_transformer_block(
    net_part: str,
    block: Dict[str, Any],
    pw_layer_type: str,
    pw_activation_type: str,
) -> Union[EncoderLayer, TransformerDecoderLayer]:
    """Build function for transformer block.

    Args:
        net_part: Network part, either 'encoder' or 'decoder'.
        block: Transformer block parameters.
        pw_layer_type: Positionwise layer type.
        pw_activation_type: Positionwise activation type.

    Returns:
        : Function to create transformer (encoder or decoder) block.

    """
    d_hidden = block["d_hidden"]

    dropout_rate = block.get("dropout-rate", 0.0)
    pos_dropout_rate = block.get("pos-dropout-rate", 0.0)
    att_dropout_rate = block.get("att-dropout-rate", 0.0)

    if pw_layer_type != "linear":
        raise NotImplementedError(
            "Transformer block only supports linear pointwise layer.")

    if net_part == "encoder":
        transformer_layer_class = EncoderLayer
    elif net_part == "decoder":
        transformer_layer_class = TransformerDecoderLayer
    else:
        raise ValueError(f"unknown net_part: {net_part}")

    return lambda: transformer_layer_class(
        d_hidden,
        MultiHeadedAttention(block["heads"], d_hidden, att_dropout_rate),
        PositionwiseFeedForward(
            d_hidden,
            block["d_ff"],
            pos_dropout_rate,
            get_activation(pw_activation_type),
        ),
        dropout_rate,
    )
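As with the conformer builder, a short usage sketch; the block dictionary and the activation name are illustrative assumptions, not values prescribed by the snippet.

# Hypothetical encoder block description; keys follow the lookups above.
block_conf = {
    "d_hidden": 256,
    "d_ff": 2048,
    "heads": 4,
    "dropout-rate": 0.1,
    "att-dropout-rate": 0.1,
}

build_fn = build_transformer_block(
    net_part="encoder",
    block=block_conf,
    pw_layer_type="linear",
    pw_activation_type="relu",  # assumed activation name
)

encoder_layer = build_fn()  # one EncoderLayer with the configured attention and FFN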
Example #13
    def __init__(self, idim, odim, args, ignore_id=-1, blank_id=0):
        """Construct an E2E object for transducer model."""
        torch.nn.Module.__init__(self)

        if "transformer" in args.etype:
            if args.enc_block_arch is None:
                raise ValueError(
                    "Transformer-based blocks in transducer mode should be"
                    "defined individually in the YAML file."
                    "See egs/vivos/asr1/conf/transducer/* for more info.")

            self.subsample = get_subsample(args,
                                           mode="asr",
                                           arch="transformer")
            # 2. use transformer to joint feature maps
            # transformer without positional encoding

            self.clayers = repeat(
                2,
                lambda lnum: EncoderLayer(
                    16,
                    MultiHeadedAttention(4, 16, 0.1),
                    PositionwiseFeedForward(16, 2048, 0.1),
                    dropout_rate=0.1,
                    normalize_before=True,
                    concat_after=False,
                ),
            )

            self.conv = torch.nn.Sequential(
                torch.nn.Conv2d(1, 32, kernel_size=(3, 5), stride=(1, 2)),
                torch.nn.ReLU(),
                torch.nn.Conv2d(32, 32, kernel_size=(3, 7), stride=(2, 2)),
                torch.nn.ReLU())

            self.encoder = Encoder(
                idim,
                args.enc_block_arch,
                input_layer=args.transformer_enc_input_layer,
                repeat_block=args.enc_block_repeat,
                self_attn_type=args.transformer_enc_self_attn_type,
                positional_encoding_type=args.transformer_enc_positional_encoding_type,
                positionwise_activation_type=args.transformer_enc_pw_activation_type,
                conv_mod_activation_type=args.transformer_enc_conv_mod_activation_type,
            )
            encoder_out = self.encoder.enc_out
            args.eprojs = self.encoder.enc_out

            self.most_dom_list = args.enc_block_arch[:]
        else:
            self.subsample = get_subsample(args, mode="asr", arch="rnn-t")

            self.enc = encoder_for(args, idim, self.subsample)

            encoder_out = args.eprojs

        if "transformer" in args.dtype:
            if args.dec_block_arch is None:
                raise ValueError(
                    "Transformer-based blocks in transducer mode should be"
                    "defined individually in the YAML file."
                    "See egs/vivos/asr1/conf/transducer/* for more info.")

            self.decoder = DecoderTT(
                odim,
                encoder_out,
                args.joint_dim,
                args.dec_block_arch,
                input_layer=args.transformer_dec_input_layer,
                repeat_block=args.dec_block_repeat,
                joint_activation_type=args.joint_activation_type,
                positionwise_activation_type=args.transformer_dec_pw_activation_type,
                dropout_rate_embed=args.dropout_rate_embed_decoder,
            )

            if "transformer" in args.etype:
                self.most_dom_list += args.dec_block_arch[:]
            else:
                self.most_dom_list = args.dec_block_arch[:]
        else:
            if args.rnnt_mode == "rnnt-att":
                self.att = att_for(args)

                self.dec = DecoderRNNTAtt(
                    args.eprojs,
                    odim,
                    args.dtype,
                    args.dlayers,
                    args.dunits,
                    blank_id,
                    self.att,
                    args.dec_embed_dim,
                    args.joint_dim,
                    args.joint_activation_type,
                    args.dropout_rate_decoder,
                    args.dropout_rate_embed_decoder,
                )
            else:
                self.dec = DecoderRNNT(
                    args.eprojs,
                    odim,
                    args.dtype,
                    args.dlayers,
                    args.dunits,
                    blank_id,
                    args.dec_embed_dim,
                    args.joint_dim,
                    args.joint_activation_type,
                    args.dropout_rate_decoder,
                    args.dropout_rate_embed_decoder,
                )

        if hasattr(self, "most_dom_list"):
            self.most_dom_dim = sorted(
                Counter(d["d_hidden"] for d in self.most_dom_list
                        if "d_hidden" in d).most_common(),
                key=lambda x: x[1],
                reverse=True,
            )[0][0]

        self.etype = args.etype
        self.dtype = args.dtype
        self.rnnt_mode = args.rnnt_mode

        self.sos = odim - 1
        self.eos = odim - 1
        self.blank_id = blank_id
        self.ignore_id = ignore_id

        self.space = args.sym_space
        self.blank = args.sym_blank

        self.odim = odim

        self.reporter = Reporter()

        self.criterion = TransLoss(args.trans_type, self.blank_id)

        self.default_parameters(args)

        if args.report_cer or args.report_wer:
            from espnet.nets.e2e_asr_common import ErrorCalculatorTransducer

            if self.dtype == "transformer":
                decoder = self.decoder
            else:
                decoder = self.dec

            self.error_calculator = ErrorCalculatorTransducer(
                decoder,
                args.char_list,
                args.sym_space,
                args.sym_blank,
                args.report_cer,
                args.report_wer,
            )
        else:
            self.error_calculator = None

        self.loss = None
        self.rnnlm = None
Example #14
 def __init__(
     self,
     odim,
     selfattention_layer_type="selfattn",
     attention_dim=256,
     attention_heads=4,
     conv_wshare=4,
     conv_kernel_length=11,
     conv_usebias=False,
     linear_units=2048,
     num_blocks=6,
     dropout_rate=0.1,
     positional_dropout_rate=0.1,
     self_attention_dropout_rate=0.0,
     src_attention_dropout_rate=0.0,
     input_layer="embed",
     use_output_layer=True,
     pos_enc_class=PositionalEncoding,
     normalize_before=True,
     concat_after=False,
 ):
     """Construct an Decoder object."""
     torch.nn.Module.__init__(self)
     self._register_load_state_dict_pre_hook(_pre_hook)
     if input_layer == "embed":
         self.embed = torch.nn.Sequential(
             torch.nn.Embedding(odim, attention_dim),
             pos_enc_class(attention_dim, positional_dropout_rate),
         )
     elif input_layer == "linear":
         self.embed = torch.nn.Sequential(
             torch.nn.Linear(odim, attention_dim),
             torch.nn.LayerNorm(attention_dim),
             torch.nn.Dropout(dropout_rate),
             torch.nn.ReLU(),
             pos_enc_class(attention_dim, positional_dropout_rate),
         )
     elif isinstance(input_layer, torch.nn.Module):
         self.embed = torch.nn.Sequential(
             input_layer,
             pos_enc_class(attention_dim, positional_dropout_rate))
     else:
         raise NotImplementedError(
             "only `embed` or torch.nn.Module is supported.")
     self.normalize_before = normalize_before
     if selfattention_layer_type == "selfattn":
         logging.info("decoder self-attention layer type = self-attention")
         self.decoders = repeat(
             num_blocks,
             lambda lnum: DecoderLayer(
                 attention_dim,
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      self_attention_dropout_rate),
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      src_attention_dropout_rate),
                 PositionwiseFeedForward(attention_dim, linear_units,
                                         dropout_rate),
                 PositionwiseFeedForward(attention_dim, linear_units,
                                         dropout_rate),
                 dropout_rate,
                 normalize_before,
                 concat_after,
             ),
         )
     elif selfattention_layer_type == "lightconv":
         logging.info(
             "decoder self-attention layer type = lightweight convolution")
         self.decoders = repeat(
             num_blocks,
             lambda lnum: DecoderLayer(
                 attention_dim,
                 LightweightConvolution(
                     conv_wshare,
                     attention_dim,
                     self_attention_dropout_rate,
                     conv_kernel_length,
                     lnum,
                     use_kernel_mask=True,
                     use_bias=conv_usebias,
                 ),
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      src_attention_dropout_rate),
                 PositionwiseFeedForward(attention_dim, linear_units,
                                         dropout_rate),
                 dropout_rate,
                 normalize_before,
                 concat_after,
             ),
         )
     elif selfattention_layer_type == "lightconv2d":
         logging.info("decoder self-attention layer "
                      "type = lightweight convolution 2-dimentional")
         self.decoders = repeat(
             num_blocks,
             lambda lnum: DecoderLayer(
                 attention_dim,
                 LightweightConvolution2D(
                     conv_wshare,
                     attention_dim,
                     self_attention_dropout_rate,
                     conv_kernel_length,
                     lnum,
                     use_kernel_mask=True,
                     use_bias=conv_usebias,
                 ),
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      src_attention_dropout_rate),
                 PositionwiseFeedForward(attention_dim, linear_units,
                                         dropout_rate),
                 dropout_rate,
                 normalize_before,
                 concat_after,
             ),
         )
     elif selfattention_layer_type == "dynamicconv":
         logging.info(
             "decoder self-attention layer type = dynamic convolution")
         self.decoders = repeat(
             num_blocks,
             lambda lnum: DecoderLayer(
                 attention_dim,
                 DynamicConvolution(
                     conv_wshare,
                     attention_dim,
                     self_attention_dropout_rate,
                     conv_kernel_length,
                     lnum,
                     use_kernel_mask=True,
                     use_bias=conv_usebias,
                 ),
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      src_attention_dropout_rate),
                 PositionwiseFeedForward(attention_dim, linear_units,
                                         dropout_rate),
                 dropout_rate,
                 normalize_before,
                 concat_after,
             ),
         )
     elif selfattention_layer_type == "dynamicconv2d":
         logging.info(
             "decoder self-attention layer type = dynamic convolution 2-dimentional"
         )
         self.decoders = repeat(
             num_blocks,
             lambda lnum: DecoderLayer(
                 attention_dim,
                 DynamicConvolution2D(
                     conv_wshare,
                     attention_dim,
                     self_attention_dropout_rate,
                     conv_kernel_length,
                     lnum,
                     use_kernel_mask=True,
                     use_bias=conv_usebias,
                 ),
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      src_attention_dropout_rate),
                 PositionwiseFeedForward(attention_dim, linear_units,
                                         dropout_rate),
                 dropout_rate,
                 normalize_before,
                 concat_after,
             ),
         )
      else:
          raise NotImplementedError("unknown selfattention_layer_type: " +
                                    selfattention_layer_type)
      self.selfattention_layer_type = selfattention_layer_type
     if self.normalize_before:
         self.after_norm = LayerNorm(attention_dim)
     if use_output_layer:
         self.output_layer = torch.nn.Linear(attention_dim, odim)
     else:
         self.output_layer = None
Example #15
 def __init__(
     self,
     languages,
     odim_dict,
     selfattention_layer_type="selfattn",
     attention_dim=256,
     attention_heads=4,
     conv_wshare=4,
     conv_kernel_length=11,
     conv_usebias=False,
     linear_units=2048,
     num_blocks=6,
     dropout_rate=0.1,
     positional_dropout_rate=0.1,
     self_attention_dropout_rate=0.0,
     src_attention_dropout_rate=0.0,
     input_layer="embed",
     use_output_layer=True,
     pos_enc_class=PositionalEncoding,
     normalize_before=True,
     concat_after=False,
     sim_adapter=False,
     shared_adapter=False,
     use_adapters=True,
     fusion_languages=None,
 ):
     super().__init__(1, selfattention_layer_type, attention_dim,
                      attention_heads, conv_wshare, conv_kernel_length,
                      conv_usebias, linear_units, num_blocks, dropout_rate,
                      positional_dropout_rate, self_attention_dropout_rate,
                      src_attention_dropout_rate, input_layer,
                      use_output_layer, pos_enc_class, normalize_before,
                      concat_after)
     if input_layer == "embed":
         self.embed = torch.nn.ModuleDict()
         for lang in odim_dict.keys():
             self.embed[lang] = torch.nn.Sequential(
                 torch.nn.Embedding(odim_dict[lang], attention_dim),
                 pos_enc_class(attention_dim, positional_dropout_rate),
             )
     else:
         raise NotImplementedError("only support embed embedding layer")
     assert self_attention_dropout_rate == src_attention_dropout_rate
     if selfattention_layer_type == "selfattn":
         logging.info("decoder self-attention layer type = self-attention")
         self.decoders = repeat(
             num_blocks,
             lambda lnum: AdaptiveDecoderLayer(
                 languages,
                 attention_dim,
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      self_attention_dropout_rate),
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      src_attention_dropout_rate),
                 PositionwiseFeedForward(attention_dim, linear_units,
                                         dropout_rate),
                 dropout_rate,
                 normalize_before,
                 concat_after,
                 torch.nn.ModuleDict({
                     "_".join(sorted(fusion_languages)):
                     SimAdapter(attention_dim, self_attention_dropout_rate,
                                fusion_languages)
                 }) if sim_adapter else None,
                 shared_adapter,
                 use_adapters,
             ),
         )
     else:
          raise NotImplementedError(
              "only the self-attention decoder layer is supported")
     if use_output_layer:
         self.output_layer = torch.nn.ModuleDict()
         for lang in odim_dict.keys():
             self.output_layer[lang] = torch.nn.Linear(
                 attention_dim, odim_dict[lang])
     else:
         self.output_layer = None
Example #16
def build_conformer_block(
    configuration: Dict[str, Any],
    main_params: Dict[str, Any],
) -> Conformer:
    """Build Conformer block.

    Args:
        configuration: Conformer block configuration.
        main_params: Encoder main parameters.

    Returns:
        : Conformer block function.

    """
    hidden_size = configuration["hidden_size"]
    linear_size = configuration["linear_size"]

    pos_wise_args = (
        hidden_size,
        linear_size,
        configuration.get("pos_wise_dropout_rate", 0.0),
        main_params["pos_wise_act"],
    )

    conv_mod_norm_class, conv_mod_norm_args = get_normalization(
        main_params["conv_mod_norm_type"],
        eps=configuration.get("conv_mod_norm_eps"),
        momentum=configuration.get("conv_mod_norm_momentum"),
        partial=configuration.get("conv_mod_norm_partial"),
    )

    conv_mod_args = (
        hidden_size,
        configuration["conv_mod_kernel_size"],
        main_params["conv_mod_act"],
        conv_mod_norm_class,
        conv_mod_norm_args,
        main_params["dynamic_chunk_training"],
    )

    mult_att_args = (
        configuration.get("heads", 4),
        hidden_size,
        configuration.get("att_dropout_rate", 0.0),
        main_params["simplified_att_score"],
    )

    norm_class, norm_args = get_normalization(
        main_params["norm_type"],
        eps=configuration.get("norm_eps"),
        partial=configuration.get("norm_partial"),
    )

    return lambda: Conformer(
        hidden_size,
        RelPositionMultiHeadedAttention(*mult_att_args),
        PositionwiseFeedForward(*pos_wise_args),
        PositionwiseFeedForward(*pos_wise_args),
        ConformerConvolution(*conv_mod_args),
        norm_class=norm_class,
        norm_args=norm_args,
        dropout_rate=configuration.get("dropout_rate", 0.0),
    )
Example #17
    def __init__(self,
                 odim,
                 attention_dim=256,
                 attention_heads=4,
                 linear_units=2048,
                 num_blocks=6,
                 dropout_rate=0.1,
                 positional_dropout_rate=0.1,
                 self_attention_dropout_rate=0.0,
                 src_attention_dropout_rate=0.0,
                 input_layer="embed",
                 use_output_layer=True,
                 pos_enc_class=PositionalEncoding,
                 normalize_before=True,
                 concat_after=False,
                 cross_operator=None,
                 cross_shared=False,
                 cross_weight_learnable=False,
                 cross_weight=0.0):
        """Construct an Decoder object."""
        torch.nn.Module.__init__(self)
        if input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(odim, attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate))
        elif input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(odim, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate), torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate))
        elif isinstance(input_layer, torch.nn.Module):
            self.embed = torch.nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate))
        else:
            raise NotImplementedError(
                "only `embed` or torch.nn.Module is supported.")
        self.normalize_before = normalize_before

        cross_self_attn = None
        cross_src_attn = None
        if cross_operator:
            if 'src_' in cross_operator:
                # cross_src_attn = MultiHeadedAttention(attention_heads, attention_dim, self_attention_dropout_rate)
                cross_src_attn = True

            if 'self_' in cross_operator:
                if cross_shared and cross_src_attn is not None:
                    # cross_self_attn = cross_src_attn
                    cross_self_attn = True  # TODO: backward compatibility for shared self and source
                else:
                    # cross_self_attn = MultiHeadedAttention(attention_heads, attention_dim, self_attention_dropout_rate)
                    cross_self_attn = True
            if 'concat' in cross_operator:
                cross_operator = 'concat'
            elif 'sum' in cross_operator:
                cross_operator = 'sum'
            else:
                raise NotImplementedError

        self.decoders = repeat(
            num_blocks, lambda: DecoderLayer(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     self_attention_dropout_rate),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
                cross_self_attn=MultiHeadedAttention(
                    attention_heads, attention_dim, self_attention_dropout_rate
                ) if cross_self_attn else None,
                cross_src_attn=MultiHeadedAttention(
                    attention_heads, attention_dim, self_attention_dropout_rate
                ) if cross_src_attn else None,
                cross_operator=cross_operator,
                cross_shared=cross_shared,
                cross_weight_learnable=cross_weight_learnable,
                cross_weight=cross_weight))

        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
        if use_output_layer:
            self.output_layer = torch.nn.Linear(attention_dim, odim)
        else:
            self.output_layer = None
Example #18
    def __init__(self, idim, odim, args, ignore_id=-1):
        """Construct an E2E object.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
        """
        torch.nn.Module.__init__(self)
        if args.transformer_attn_dropout_rate is None:
            args.transformer_attn_dropout_rate = args.dropout_rate
        self.cn_encoder = Encoder(
            idim=idim,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            linear_units=args.eunits,
            num_blocks=args.elayers,
            input_layer=args.transformer_input_layer,
            dropout_rate=args.dropout_rate,
            positional_dropout_rate=args.dropout_rate,
            attention_dropout_rate=args.transformer_attn_dropout_rate)
        self.en_encoder = Encoder(
            idim=idim,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            linear_units=args.eunits,
            num_blocks=args.elayers,
            input_layer=args.transformer_input_layer,
            dropout_rate=args.dropout_rate,
            positional_dropout_rate=args.dropout_rate,
            attention_dropout_rate=args.transformer_attn_dropout_rate)
        # gated add module
        self.vectorize_lambda = args.vectorize_lambda
        lambda_dim = args.adim if self.vectorize_lambda else 1
        self.aggregation_module = torch.nn.Sequential(
            torch.nn.Linear(2 * args.adim, lambda_dim), torch.nn.Sigmoid())
        self.additional_encoder_layer = EncoderLayer(
            args.adim,
            MultiHeadedAttention(args.aheads, args.adim,
                                 args.transformer_attn_dropout_rate),
            PositionwiseFeedForward(args.adim, args.eunits, args.dropout_rate),
            args.dropout_rate,
            normalize_before=True,
            concat_after=False)
        self.additional_after_norm = LayerNorm(args.adim)
        self.decoder = Decoder(
            odim=odim,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            linear_units=args.dunits,
            num_blocks=args.dlayers,
            dropout_rate=args.dropout_rate,
            positional_dropout_rate=args.dropout_rate,
            self_attention_dropout_rate=args.transformer_attn_dropout_rate,
            src_attention_dropout_rate=args.transformer_attn_dropout_rate)
        self.sos = odim - 1
        self.eos = odim - 1
        self.odim = odim
        self.ignore_id = ignore_id
        self.subsample = [1]
        self.reporter = Reporter()

        # self.lsm_weight = a
        self.criterion = LabelSmoothingLoss(
            self.odim, self.ignore_id, args.lsm_weight,
            args.transformer_length_normalized_loss)
        # self.verbose = args.verbose
        self.adim = args.adim
        self.mtlalpha = args.mtlalpha
        if args.mtlalpha > 0.0:
            self.ctc = CTC(odim,
                           args.adim,
                           args.dropout_rate,
                           ctc_type=args.ctc_type,
                           reduce=True)
        else:
            self.ctc = None

        if args.report_cer or args.report_wer:
            from espnet.nets.e2e_asr_common import ErrorCalculator
            self.error_calculator = ErrorCalculator(args.char_list,
                                                    args.sym_space,
                                                    args.sym_blank,
                                                    args.report_cer,
                                                    args.report_wer)
        else:
            self.error_calculator = None
        self.rnnlm = None

        # yzl23 config
        self.remove_blank_in_ctc_mode = True
        self.reset_parameters(args)  # reset params at the last

        logging.warning(
            "Model total size: {}M, requires_grad size: {}M".format(
                self.count_parameters(),
                self.count_parameters(requires_grad=True)))
Example #19
    def __init__(self,
                 idim,
                 center_len=8,
                 left_len=0,
                 hop_len=0,
                 right_len=0,
                 abs_pos=1,
                 rel_pos=0,
                 use_mem=1,
                 att_type="mta",
                 attention_dim=256,
                 attention_heads=4,
                 linear_units=2048,
                 num_blocks=6,
                 dropout_rate=0.1,
                 positional_dropout_rate=0.1,
                 attention_dropout_rate=0.0,
                 input_layer="conv2d",
                 pos_enc_class=PositionalEncoding,
                 normalize_before=True,
                 concat_after=False):
        super(Encoder, self).__init__()
        self.idim = idim
        self.center_len = center_len
        self.use_mem = use_mem != 0
        self.left_len = left_len
        if self.use_mem:
            self.mem_len = left_len
        else:
            self.mem_len = 0
        self.hop_len = hop_len
        self.right_len = right_len
        self.abs_pos = abs_pos != 0
        self.rel_pos = rel_pos != 0
        self.attention_dim = attention_dim
        self.attention_heads = attention_heads
        self.linear_units = linear_units
        self.dropout_rate = dropout_rate
        self.input_layer = input_layer
        self.normalize_before = normalize_before
        self.concat_after = concat_after
        self.positional_dropout_rate = positional_dropout_rate
        self.pos_enc_class = pos_enc_class
        self._generateInputLayer()

        self.encoders = repeat(
            num_blocks, lambda:
            EncoderLayer(n_head=attention_heads,
                         d_model=attention_dim,
                         d_head=attention_dim // attention_heads,
                         att_type=att_type,
                         ext_len=hop_len // 4,
                         mem_len=self.mem_len // 4,
                         tgt_len=center_len,
                         future_len=right_len,
                         rel_pos=rel_pos,
                         dropout=dropout_rate,
                         dropatt=attention_dropout_rate,
                         pre_lnorm=normalize_before,
                         pos_ff=PositionwiseFeedForward(
                             attention_dim, linear_units, dropout_rate)))

        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
Example #20
    def __init__(self,
                 odim,
                 attention_dim=256,
                 attention_heads=4,
                 linear_units=2048,
                 num_blocks=6,
                 dropout_rate=0.1,
                 positional_dropout_rate=0.1,
                 self_attention_dropout_rate=0.0,
                 src_attention_dropout_rate=0.0,
                 input_layer="embed",
                 use_output_layer=True,
                 pos_enc_class=PositionalEncoding,
                 normalize_before=True,
                 concat_after=False,
                 cross_operator=None,
                 cross_weight_learnable=False,
                 cross_weight=0.0,
                 cross_self=False,
                 cross_src=False,
                 cross_to_asr=True,
                 cross_to_st=True):
        """Construct an Decoder object."""
        torch.nn.Module.__init__(self)
        if input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(odim, attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate))
            self.embed_asr = torch.nn.Sequential(
                torch.nn.Embedding(odim, attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate))
        elif input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(odim, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate), torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate))
            self.embed_asr = torch.nn.Sequential(
                torch.nn.Linear(odim, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate), torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate))
        elif isinstance(input_layer, torch.nn.Module):
            self.embed = torch.nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate))
            self.embed_asr = torch.nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate))
        else:
            raise NotImplementedError(
                "only `embed` or torch.nn.Module is supported.")
        self.normalize_before = normalize_before

        self.dual_decoders = repeat(
            num_blocks, lambda: DualDecoderLayer(
                attention_dim,
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     self_attention_dropout_rate),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     self_attention_dropout_rate),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                cross_self_attn=MultiHeadedAttention(
                    attention_heads, attention_dim, self_attention_dropout_rate
                ) if (cross_self and cross_to_st) else None,
                cross_self_attn_asr=MultiHeadedAttention(
                    attention_heads, attention_dim, self_attention_dropout_rate
                ) if (cross_self and cross_to_asr) else None,
                cross_src_attn=MultiHeadedAttention(
                    attention_heads, attention_dim, self_attention_dropout_rate
                ) if (cross_src and cross_to_st) else None,
                cross_src_attn_asr=MultiHeadedAttention(
                    attention_heads, attention_dim, self_attention_dropout_rate
                ) if (cross_src and cross_to_asr) else None,
                dropout_rate=dropout_rate,
                normalize_before=normalize_before,
                concat_after=concat_after,
                cross_operator=cross_operator,
                cross_weight_learnable=cross_weight_learnable,
                cross_weight=cross_weight,
                cross_to_asr=cross_to_asr,
                cross_to_st=cross_to_st))
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
            self.after_norm_asr = LayerNorm(attention_dim)
        if use_output_layer:
            self.output_layer = torch.nn.Linear(attention_dim, odim)
            self.output_layer_asr = torch.nn.Linear(attention_dim, odim)
        else:
            self.output_layer = None
            self.output_layer_asr = None