def __init__(self, idim, time_len=8, mem_len=0, ext_len=0, future_len=0,
             attention_type="memory", attention_dim=256, attention_heads=4,
             linear_units=2048, num_blocks=6, dropout_rate=0.1,
             positional_dropout_rate=0.1, attention_dropout_rate=0.0,
             input_layer="conv2d", pos_enc_class=PositionalEncoding,
             normalize_before=True, concat_after=False):
    """Construct an Encoder with memory-based or traditional attention blocks.

    Args:
        idim: Input feature dimension.
        time_len: Segment length for the memory attention variant.
        mem_len: Memory length passed to each ``EncoderLayerXL``.
        ext_len: Extended context length passed to each ``EncoderLayerXL``.
        future_len: Number of look-ahead frames.
        attention_type: "memory" (EncoderLayerXL, Transformer-XL style) or
            "traditional" (EncoderLayerTD).
        attention_dim: Model/attention hidden dimension.
        attention_heads: Number of attention heads.
        linear_units: Position-wise feed-forward inner dimension.
        num_blocks: Number of encoder blocks.
        dropout_rate: General dropout rate.
        positional_dropout_rate: Dropout rate for positional encoding.
        attention_dropout_rate: Dropout rate inside attention.
        input_layer: Input front-end type (consumed by _generateInputLayer).
        pos_enc_class: Positional-encoding class.
        normalize_before: Use pre-layernorm when True.
        concat_after: Concatenate attention input/output (traditional only).

    Raises:
        ValueError: If ``attention_type`` is neither "memory" nor
            "traditional".
    """
    super(Encoder, self).__init__()
    self.idim = idim
    self.time_len = time_len
    self.future_len = future_len
    self.attention_dim = attention_dim
    self.attention_heads = attention_heads
    self.linear_units = linear_units
    self.dropout_rate = dropout_rate
    self.input_layer = input_layer
    self.normalize_before = normalize_before
    self.concat_after = concat_after
    self.attention_type = attention_type
    self.positional_dropout_rate = positional_dropout_rate
    self.pos_enc_class = pos_enc_class
    # Build the input front-end from the attributes assigned above.
    self._generateInputLayer()
    if attention_type == "memory":
        self.encoders = repeat(
            num_blocks,
            lambda: EncoderLayerXL(
                n_head=attention_heads,
                d_model=attention_dim,
                d_head=attention_dim // attention_heads,
                ext_len=ext_len,
                mem_len=mem_len,
                future_len=future_len,
                dropout=dropout_rate,
                dropatt=attention_dropout_rate,
                pre_lnorm=normalize_before,
                pos_ff=PositionwiseFeedForward(
                    attention_dim, linear_units, dropout_rate)))
    elif attention_type == "traditional":
        self.encoders = repeat(
            num_blocks,
            lambda: EncoderLayerTD(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate, normalize_before, concat_after))
    else:
        # BUG FIX: the original built the ValueError but never raised it,
        # so an invalid attention_type silently left self.encoders unset.
        raise ValueError("only memory or traditional can be used")
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
def build_conformer_block(
    block: Dict[str, Any],
    self_attn_class: str,
    pw_layer_type: str,
    pw_activation_type: str,
    conv_mod_activation_type: str,
) -> ConformerEncoderLayer:
    """Build function for conformer block.

    Args:
        block: Conformer block parameters.
        self_attn_class: Self-attention module class; instantiated as
            ``self_attn_class(heads, d_hidden, att_dropout_rate)``.
        pw_layer_type: Positionwise layer type (only "linear" is supported).
        pw_activation_type: Positionwise activation type.
        conv_mod_activation_type: Convolutional module activation type.

    Returns:
        : Function to create conformer (encoder) block.

    Raises:
        NotImplementedError: If ``pw_layer_type`` is not "linear".
    """
    d_hidden = block["d_hidden"]
    d_ff = block["d_ff"]
    dropout_rate = block.get("dropout-rate", 0.0)
    pos_dropout_rate = block.get("pos-dropout-rate", 0.0)
    att_dropout_rate = block.get("att-dropout-rate", 0.0)
    if pw_layer_type == "linear":
        pw_layer = PositionwiseFeedForward(
            d_hidden,
            d_ff,
            pos_dropout_rate,
            get_activation(pw_activation_type),
        )
    else:
        raise NotImplementedError("Conformer block only supports linear yet.")
    # Optional macaron-style second feed-forward module.
    macaron_net = (PositionwiseFeedForward(
        d_hidden,
        d_ff,
        pos_dropout_rate,
        get_activation(pw_activation_type),
    ) if block["macaron_style"] else None)
    # Optional convolution module.
    conv_mod = (ConvolutionModule(
        d_hidden,
        block["conv_mod_kernel"],
        get_activation(conv_mod_activation_type),
    ) if block["use_conv_mod"] else None)
    return lambda: ConformerEncoderLayer(
        d_hidden,
        self_attn_class(block["heads"], d_hidden, att_dropout_rate),
        pw_layer,
        macaron_net,
        conv_mod,
        dropout_rate,
    )
def _generateEncoderLayer(self):
    """Populate ``self.encoders`` with subsampling modules interleaved
    with memory-attention encoder layers.

    Context/memory lengths passed to each layer are divided by the
    accumulated subsampling factor so they stay expressed in frames at
    the current frame rate.
    """
    # Accumulated time-subsampling factor applied so far.
    subsize = 1
    if self.input_layer == "conv2d":
        # The conv2d front-end already subsamples by 4; no pooling layers
        # are inserted between blocks.
        subsize = 4
        subsample = None
    else:
        if self.subtype == "max":
            subsample = PoolSubsampling()
        elif self.subtype == "average":
            subsample = PoolSubsampling(type="average")
        else:
            subsample = NormalSubsamplingPos()
    for i in range(self.num_blocks):
        if self.input_layer != "conv2d":
            # Insert a subsampling module before block i when i appears in
            # self.subpos; each insertion halves the frame rate.
            # NOTE(review): the SAME module instance is appended for every
            # matching position (shared parameters, if any) — confirm this
            # is intended.
            for sp in self.subpos:
                if sp == i:
                    self.encoders.append(subsample)
                    subsize *= 2
        self.encoders.append(
            EncoderLayer(n_head=self.attention_heads,
                         d_model=self.attention_dim,
                         d_head=self.attention_dim // self.attention_heads,
                         att_type=self.att_type,
                         # hop/memory lengths scaled to current frame rate
                         ext_len=self.hop_len // subsize,
                         mem_len=self.mem_len // subsize,
                         tgt_len=self.center_len,
                         future_len=self.right_len,
                         rel_pos=self.rel_pos,
                         dropout=self.dropout_rate,
                         dropatt=self.attention_dropout_rate,
                         pre_lnorm=self.normalize_before,
                         pos_ff=PositionwiseFeedForward(
                             self.attention_dim, self.linear_units,
                             self.dropout_rate)))
def __init__(
    self,
    vocab_size: int,
    encoder_output_size: int,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    self_attention_dropout_rate: float = 0.0,
    src_attention_dropout_rate: float = 0.0,
    input_layer: str = "embed",
    use_output_layer: bool = True,
    pos_enc_class=PositionalEncoding,
    normalize_before: bool = True,
    concat_after: bool = False,
):
    """Construct a Transformer decoder.

    The token front-end is chosen by ``input_layer`` ("embed" or
    "linear"); each of the ``num_blocks`` layers combines self-attention,
    source attention, and a position-wise feed-forward network.

    Raises:
        ValueError: If ``input_layer`` is neither "embed" nor "linear".
    """
    assert check_argument_types()
    super().__init__()
    adim = encoder_output_size

    if input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(vocab_size, adim),
            pos_enc_class(adim, positional_dropout_rate),
        )
    elif input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(vocab_size, adim),
            torch.nn.LayerNorm(adim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(adim, positional_dropout_rate),
        )
    else:
        raise ValueError(
            f"only 'embed' or 'linear' is supported: {input_layer}")

    self.normalize_before = normalize_before

    def _build_layer(lnum):
        # One decoder block: self-attn, source-attn, then feed-forward.
        return DecoderLayer(
            adim,
            MultiHeadedAttention(
                attention_heads, adim, self_attention_dropout_rate),
            MultiHeadedAttention(
                attention_heads, adim, src_attention_dropout_rate),
            PositionwiseFeedForward(adim, linear_units, dropout_rate),
            dropout_rate,
            normalize_before,
            concat_after,
        )

    self.decoders = repeat(num_blocks, _build_layer)
    if self.normalize_before:
        self.after_norm = LayerNorm(adim)
    self.output_layer = (
        torch.nn.Linear(adim, vocab_size) if use_output_layer else None)
def __init__(
    self,
    odim,
    jdim,
    attention_dim=512,
    attention_heads=4,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    positional_dropout_rate=0.0,
    attention_dropout_rate=0.0,
    input_layer="embed",
    pos_enc_class=PositionalEncoding,
    blank=0,
):
    """Construct a Decoder object for transformer-transducer models.

    Args:
        odim: Output (vocabulary) dimension.
        jdim: Joint-space dimension for the transducer joint network.
        attention_dim: Decoder hidden dimension.
        attention_heads: Number of attention heads.
        linear_units: Feed-forward inner dimension.
        num_blocks: Number of decoder blocks.
        dropout_rate: General dropout rate.
        positional_dropout_rate: Dropout after positional encoding.
        attention_dropout_rate: Attention dropout rate.
        input_layer: "embed", "linear", or a torch.nn.Module front-end.
        pos_enc_class: Positional-encoding class.
        blank: Blank symbol id.

    Raises:
        NotImplementedError: For unsupported ``input_layer`` values.
    """
    torch.nn.Module.__init__(self)
    if input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(odim, attention_dim),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(odim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer, pos_enc_class(attention_dim, positional_dropout_rate)
        )
    else:
        raise NotImplementedError("only `embed` or torch.nn.Module is supported.")
    self.decoders = repeat(
        num_blocks,
        # lnum (layer index) is unused; every block is built identically.
        # NOTE: transducer decoder layers carry only self-attention —
        # there is no encoder-decoder source attention here.
        lambda lnum: DecoderLayer(
            attention_dim,
            MultiHeadedAttention(
                attention_heads, attention_dim, attention_dropout_rate
            ),
            PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
            dropout_rate,
        ),
    )
    self.after_norm = LayerNorm(attention_dim)
    # Joint network: project encoder and decoder states into jdim, then
    # map the joint space to the vocabulary.
    self.lin_enc = torch.nn.Linear(attention_dim, jdim)
    self.lin_dec = torch.nn.Linear(attention_dim, jdim, bias=False)
    self.lin_out = torch.nn.Linear(jdim, odim)
    self.attention_dim = attention_dim
    self.odim = odim
    self.blank = blank
def __init__(self, idim, args):
    """Construct a Transformer encoder from argparse-style ``args``.

    ``args.transformer_input_layer`` selects the front-end: "linear",
    "conv2d" (Conv2dSubsampling), or "embed".

    Raises:
        ValueError: For an unrecognized input-layer name.
    """
    super(Encoder, self).__init__()
    layer_kind = args.transformer_input_layer
    if layer_kind == "conv2d":
        self.input_layer = Conv2dSubsampling(idim, args.adim,
                                             args.dropout_rate)
    elif layer_kind == "linear":
        self.input_layer = torch.nn.Sequential(
            torch.nn.Linear(idim, args.adim),
            torch.nn.LayerNorm(args.adim),
            torch.nn.Dropout(args.dropout_rate),
            torch.nn.ReLU(),
            PositionalEncoding(args.adim, args.dropout_rate),
        )
    elif layer_kind == "embed":
        self.input_layer = torch.nn.Sequential(
            torch.nn.Embedding(idim, args.adim),
            PositionalEncoding(args.adim, args.dropout_rate),
        )
    else:
        raise ValueError("unknown input_layer: " + args.transformer_input_layer)

    def _make_block():
        # One encoder layer: multi-head self-attention + feed-forward.
        return EncoderLayer(
            args.adim,
            MultiHeadedAttention(args.aheads, args.adim,
                                 args.transformer_attn_dropout_rate),
            PositionwiseFeedForward(args.adim, args.eunits,
                                    args.dropout_rate),
            args.dropout_rate,
        )

    self.encoders = repeat(args.elayers, _make_block)
    self.norm = LayerNorm(args.adim)
def __init__(self, odim, attention_dim=256, attention_heads=4,
             linear_units=2048, num_blocks=6, dropout_rate=0.1,
             positional_dropout_rate=0.1, self_attention_dropout_rate=0.0,
             src_attention_dropout_rate=0.0, input_layer="embed",
             use_output_layer=True, pos_enc_class=PositionalEncoding,
             normalize_before=True, concat_after=False,
             moe_att_mode='linear'):
    """Construct an Decoder object.

    Builds a decoder whose layers (HANDecoderLayer) carry one
    self-attention module and TWO source-attention modules, combined
    according to ``moe_att_mode`` — presumably attending over two
    encoder streams (hierarchical attention); verify against
    HANDecoderLayer.

    Raises:
        NotImplementedError: For unsupported ``input_layer`` values.
    """
    torch.nn.Module.__init__(self)
    if input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(odim, attention_dim),
            pos_enc_class(attention_dim, positional_dropout_rate))
    elif input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(odim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate))
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(attention_dim, positional_dropout_rate))
    else:
        raise NotImplementedError(
            "only `embed` or torch.nn.Module is supported.")
    self.normalize_before = normalize_before
    self.decoders = repeat(
        num_blocks,
        lambda: HANDecoderLayer(
            attention_dim,
            # self-attention
            MultiHeadedAttention(attention_heads, attention_dim,
                                 self_attention_dropout_rate),
            # two source-attention modules (one per encoder stream)
            MultiHeadedAttention(attention_heads, attention_dim,
                                 src_attention_dropout_rate),
            MultiHeadedAttention(attention_heads, attention_dim,
                                 src_attention_dropout_rate),
            PositionwiseFeedForward(attention_dim, linear_units,
                                    dropout_rate),
            dropout_rate=dropout_rate,
            moe_att_mode=moe_att_mode,
            normalize_before=normalize_before,
            concat_after=concat_after,
        ))
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    if use_output_layer:
        self.output_layer = torch.nn.Linear(attention_dim, odim)
    else:
        self.output_layer = None
def __init__(
    self,
    vocab_size: int,
    encoder_output_size: int,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    self_attention_dropout_rate: float = 0.0,
    src_attention_dropout_rate: float = 0.0,
    input_layer: str = "embed",
    use_output_layer: bool = True,
    pos_enc_class=PositionalEncoding,
    normalize_before: bool = True,
    concat_after: bool = False,
    conv_wshare: int = 4,
    conv_kernel_length: Sequence[int] = (11, 11, 11, 11, 11, 11),
    conv_usebias: bool = False,
):
    """Construct a decoder whose self-attention is replaced by 2-D dynamic
    convolution (DynamicConvolution2D), one kernel length per block.

    The embedding, normalization, and output-layer setup is delegated to
    the parent class; only ``self.decoders`` is rebuilt here.

    Raises:
        ValueError: If ``conv_kernel_length`` does not provide one value
            per block.
    """
    # FIX: conv_usebias was annotated ``int`` but is a boolean flag
    # (default False, forwarded to use_bias); annotation corrected.
    assert check_argument_types()
    if len(conv_kernel_length) != num_blocks:
        raise ValueError(
            "conv_kernel_length must have equal number of values to num_blocks: "
            f"{len(conv_kernel_length)} != {num_blocks}")
    super().__init__(
        vocab_size=vocab_size,
        encoder_output_size=encoder_output_size,
        dropout_rate=dropout_rate,
        positional_dropout_rate=positional_dropout_rate,
        input_layer=input_layer,
        use_output_layer=use_output_layer,
        pos_enc_class=pos_enc_class,
        normalize_before=normalize_before,
    )
    attention_dim = encoder_output_size
    self.decoders = repeat(
        num_blocks,
        lambda lnum: DecoderLayer(
            attention_dim,
            DynamicConvolution2D(
                wshare=conv_wshare,
                n_feat=attention_dim,
                dropout_rate=self_attention_dropout_rate,
                kernel_size=conv_kernel_length[lnum],
                use_kernel_mask=True,
                use_bias=conv_usebias,
            ),
            MultiHeadedAttention(attention_heads, attention_dim,
                                 src_attention_dropout_rate),
            PositionwiseFeedForward(attention_dim, linear_units,
                                    dropout_rate),
            dropout_rate,
            normalize_before,
            concat_after,
        ),
    )
def __init__(
    self,
    vocab_size: int,
    encoder_output_size: int,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    self_attention_dropout_rate: float = 0.0,
    src_attention_dropout_rate: float = 0.0,
    input_layer: str = "embed",
    use_output_layer: bool = True,
    pos_enc_class=PositionalEncoding,
    normalize_before: bool = True,
    concat_after: bool = False,
):
    """Construct a standard attention-based Transformer decoder.

    The embedding/output plumbing is delegated to the parent class;
    this subclass only builds ``self.decoders``.
    """
    assert check_argument_types()
    super().__init__(
        vocab_size=vocab_size,
        encoder_output_size=encoder_output_size,
        dropout_rate=dropout_rate,
        positional_dropout_rate=positional_dropout_rate,
        input_layer=input_layer,
        use_output_layer=use_output_layer,
        pos_enc_class=pos_enc_class,
        normalize_before=normalize_before,
    )
    adim = encoder_output_size

    def _layer(lnum):
        # lnum is unused; all blocks share the same configuration.
        return DecoderLayer(
            adim,
            MultiHeadedAttention(
                attention_heads, adim, self_attention_dropout_rate),
            MultiHeadedAttention(
                attention_heads, adim, src_attention_dropout_rate),
            PositionwiseFeedForward(adim, linear_units, dropout_rate),
            dropout_rate,
            normalize_before,
            concat_after,
        )

    self.decoders = repeat(num_blocks, _layer)
def __init__(self, odim, args):
    """Transformer decoder: token embedding with positional encoding,
    ``args.dlayers`` decoder blocks, then LayerNorm and a linear
    projection to the vocabulary."""
    super(Decoder, self).__init__()
    self.embed = torch.nn.Sequential(
        torch.nn.Embedding(odim, args.adim),
        PositionalEncoding(args.adim, args.dropout_rate),
    )

    def _make_attn():
        # Fresh attention module per call (self-attn and src-attn must
        # not share parameters).
        return MultiHeadedAttention(
            args.aheads, args.adim, args.transformer_attn_dropout_rate)

    def _make_layer():
        return DecoderLayer(
            args.adim,
            _make_attn(),
            _make_attn(),
            PositionwiseFeedForward(args.adim, args.dunits,
                                    args.dropout_rate),
            args.dropout_rate,
        )

    self.decoders = repeat(args.dlayers, _make_layer)
    self.output_norm = LayerNorm(args.adim)
    self.output_layer = torch.nn.Linear(args.adim, odim)
def __init__(self, idim, attention_dim=256, attention_heads=4,
             linear_units=2048, num_blocks=6, dropout_rate=0.1,
             positional_dropout_rate=0.1, attention_dropout_rate=0.0,
             input_layer="conv2d", pos_enc_class=PositionalEncoding,
             normalize_before=True, concat_after=False):
    """Construct a Transformer encoder.

    ``input_layer`` selects the front-end: "linear", "conv2d"
    (Conv2dSubsampling), "embed", or a user-supplied torch.nn.Module.

    Raises:
        ValueError: For an unrecognized input-layer name.
    """
    super(Encoder, self).__init__()
    if input_layer == "conv2d":
        self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
    elif input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(idim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(idim, attention_dim),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    else:
        raise ValueError("unknown input_layer: " + input_layer)
    self.normalize_before = normalize_before

    def _encoder_layer():
        # One block: multi-head self-attention + position-wise FFN.
        return EncoderLayer(
            attention_dim,
            MultiHeadedAttention(attention_heads, attention_dim,
                                 attention_dropout_rate),
            PositionwiseFeedForward(attention_dim, linear_units,
                                    dropout_rate),
            dropout_rate,
            normalize_before,
            concat_after,
        )

    self.encoders = repeat(num_blocks, _encoder_layer)
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
def build_transformer_block(
    net_part: str,
    block: Dict[str, Any],
    pw_layer_type: str,
    pw_activation_type: str,
) -> Union[EncoderLayer, TransformerDecoderLayer]:
    """Build function for transformer block.

    Args:
        net_part: Network part, either 'encoder' or 'decoder'.
        block: Transformer block parameters.
        pw_layer_type: Positionwise layer type.
        pw_activation_type: Positionwise activation type.

    Returns:
        : Function to create transformer (encoder or decoder) block.

    Raises:
        NotImplementedError: If ``pw_layer_type`` is not "linear".
        ValueError: If ``net_part`` is neither "encoder" nor "decoder".
    """
    d_hidden = block["d_hidden"]
    dropout_rate = block.get("dropout-rate", 0.0)
    pos_dropout_rate = block.get("pos-dropout-rate", 0.0)
    att_dropout_rate = block.get("att-dropout-rate", 0.0)
    if pw_layer_type != "linear":
        raise NotImplementedError(
            "Transformer block only supports linear pointwise layer.")
    if net_part == "encoder":
        transformer_layer_class = EncoderLayer
    elif net_part == "decoder":
        transformer_layer_class = TransformerDecoderLayer
    else:
        # BUG FIX: an unknown net_part previously left
        # transformer_layer_class unbound, causing a confusing NameError
        # only when the returned lambda was eventually called.
        raise ValueError(
            "net_part must be 'encoder' or 'decoder': " + str(net_part))
    return lambda: transformer_layer_class(
        d_hidden,
        MultiHeadedAttention(block["heads"], d_hidden, att_dropout_rate),
        PositionwiseFeedForward(
            d_hidden,
            block["d_ff"],
            pos_dropout_rate,
            get_activation(pw_activation_type),
        ),
        dropout_rate,
    )
def __init__(self, idim, odim, args, ignore_id=-1, blank_id=0):
    """Construct an E2E object for transducer model.

    Builds the encoder (transformer-block-based or RNN) and decoder
    (transformer-transducer or RNN-T, optionally with attention) from the
    argparse-style ``args``, plus the transducer loss and optional
    CER/WER error calculator.
    """
    torch.nn.Module.__init__(self)
    # ---- Encoder ----
    if "transformer" in args.etype:
        if args.enc_block_arch is None:
            raise ValueError(
                "Transformer-based blocks in transducer mode should be"
                "defined individually in the YAML file."
                "See egs/vivos/asr1/conf/transducer/* for more info.")
        self.subsample = get_subsample(args, mode="asr", arch="transformer")
        # 2. use transformer to joint feature maps
        # transformer without positional encoding
        # NOTE(review): dims here (d_model=16, 4 heads, ff 2048, dropout
        # 0.1) are hard-coded — confirm they match the conv front-end.
        self.clayers = repeat(
            2,
            lambda lnum: EncoderLayer(
                16,
                MultiHeadedAttention(4, 16, 0.1),
                PositionwiseFeedForward(16, 2048, 0.1),
                dropout_rate=0.1,
                normalize_before=True,
                concat_after=False,
            ),
        )
        # 2-D conv front-end feeding the encoder.
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(1, 32, kernel_size=(3, 5), stride=(1, 2)),
            torch.nn.ReLU(),
            torch.nn.Conv2d(32, 32, kernel_size=(3, 7), stride=(2, 2)),
            torch.nn.ReLU())
        self.encoder = Encoder(
            idim,
            args.enc_block_arch,
            input_layer=args.transformer_enc_input_layer,
            repeat_block=args.enc_block_repeat,
            self_attn_type=args.transformer_enc_self_attn_type,
            positional_encoding_type=args.
            transformer_enc_positional_encoding_type,
            positionwise_activation_type=args.
            transformer_enc_pw_activation_type,
            conv_mod_activation_type=args.
            transformer_enc_conv_mod_activation_type,
        )
        encoder_out = self.encoder.enc_out
        # eprojs is overwritten so downstream modules see the true width.
        args.eprojs = self.encoder.enc_out
        self.most_dom_list = args.enc_block_arch[:]
    else:
        self.subsample = get_subsample(args, mode="asr", arch="rnn-t")
        self.enc = encoder_for(args, idim, self.subsample)
        encoder_out = args.eprojs
    # ---- Decoder ----
    if "transformer" in args.dtype:
        if args.dec_block_arch is None:
            raise ValueError(
                "Transformer-based blocks in transducer mode should be"
                "defined individually in the YAML file."
                "See egs/vivos/asr1/conf/transducer/* for more info.")
        self.decoder = DecoderTT(
            odim,
            encoder_out,
            args.joint_dim,
            args.dec_block_arch,
            input_layer=args.transformer_dec_input_layer,
            repeat_block=args.dec_block_repeat,
            joint_activation_type=args.joint_activation_type,
            positionwise_activation_type=args.
            transformer_dec_pw_activation_type,
            dropout_rate_embed=args.dropout_rate_embed_decoder,
        )
        if "transformer" in args.etype:
            self.most_dom_list += args.dec_block_arch[:]
        else:
            self.most_dom_list = args.dec_block_arch[:]
    else:
        if args.rnnt_mode == "rnnt-att":
            # RNN-T with attention between encoder and decoder.
            self.att = att_for(args)
            self.dec = DecoderRNNTAtt(
                args.eprojs,
                odim,
                args.dtype,
                args.dlayers,
                args.dunits,
                blank_id,
                self.att,
                args.dec_embed_dim,
                args.joint_dim,
                args.joint_activation_type,
                args.dropout_rate_decoder,
                args.dropout_rate_embed_decoder,
            )
        else:
            self.dec = DecoderRNNT(
                args.eprojs,
                odim,
                args.dtype,
                args.dlayers,
                args.dunits,
                blank_id,
                args.dec_embed_dim,
                args.joint_dim,
                args.joint_activation_type,
                args.dropout_rate_decoder,
                args.dropout_rate_embed_decoder,
            )
    if hasattr(self, "most_dom_list"):
        # Most common (and largest on ties) hidden size across all
        # transformer block definitions.
        self.most_dom_dim = sorted(
            Counter(d["d_hidden"] for d in self.most_dom_list
                    if "d_hidden" in d).most_common(),
            key=lambda x: x[0],
            reverse=True,
        )[0][0]
    self.etype = args.etype
    self.dtype = args.dtype
    self.rnnt_mode = args.rnnt_mode
    # sos/eos share the last vocabulary index.
    self.sos = odim - 1
    self.eos = odim - 1
    self.blank_id = blank_id
    self.ignore_id = ignore_id
    self.space = args.sym_space
    self.blank = args.sym_blank
    self.odim = odim
    self.reporter = Reporter()
    self.criterion = TransLoss(args.trans_type, self.blank_id)
    self.default_parameters(args)
    if args.report_cer or args.report_wer:
        from espnet.nets.e2e_asr_common import ErrorCalculatorTransducer
        if self.dtype == "transformer":
            decoder = self.decoder
        else:
            decoder = self.dec
        self.error_calculator = ErrorCalculatorTransducer(
            decoder,
            args.char_list,
            args.sym_space,
            args.sym_blank,
            args.report_cer,
            args.report_wer,
        )
    else:
        self.error_calculator = None
    self.loss = None
    self.rnnlm = None
def __init__(
    self,
    odim,
    selfattention_layer_type="selfattn",
    attention_dim=256,
    attention_heads=4,
    conv_wshare=4,
    conv_kernel_length=11,
    conv_usebias=False,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    positional_dropout_rate=0.1,
    self_attention_dropout_rate=0.0,
    src_attention_dropout_rate=0.0,
    input_layer="embed",
    use_output_layer=True,
    pos_enc_class=PositionalEncoding,
    normalize_before=True,
    concat_after=False,
):
    """Construct an Decoder object.

    ``selfattention_layer_type`` chooses the module that replaces decoder
    self-attention: "selfattn" (multi-head attention), "lightconv",
    "lightconv2d", "dynamicconv", or "dynamicconv2d".

    Raises:
        NotImplementedError: For unsupported ``input_layer`` values.
        ValueError: For an unknown ``selfattention_layer_type``
            (previously this fell through silently, leaving
            ``self.decoders`` unset).
    """
    torch.nn.Module.__init__(self)
    self._register_load_state_dict_pre_hook(_pre_hook)
    if input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(odim, attention_dim),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(odim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(attention_dim, positional_dropout_rate))
    else:
        raise NotImplementedError(
            "only `embed` or torch.nn.Module is supported.")
    self.normalize_before = normalize_before
    if selfattention_layer_type == "selfattn":
        logging.info("decoder self-attention layer type = self-attention")
        # NOTE(review): this branch passes TWO PositionwiseFeedForward
        # modules to DecoderLayer while the conv branches pass one —
        # confirm the DecoderLayer signature actually accepts both.
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     self_attention_dropout_rate),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "lightconv":
        logging.info(
            "decoder self-attention layer type = lightweight convolution")
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                LightweightConvolution(
                    conv_wshare,
                    attention_dim,
                    self_attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_kernel_mask=True,
                    use_bias=conv_usebias,
                ),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "lightconv2d":
        logging.info("decoder self-attention layer "
                     "type = lightweight convolution 2-dimentional")
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                LightweightConvolution2D(
                    conv_wshare,
                    attention_dim,
                    self_attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_kernel_mask=True,
                    use_bias=conv_usebias,
                ),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "dynamicconv":
        logging.info(
            "decoder self-attention layer type = dynamic convolution")
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                DynamicConvolution(
                    conv_wshare,
                    attention_dim,
                    self_attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_kernel_mask=True,
                    use_bias=conv_usebias,
                ),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "dynamicconv2d":
        logging.info(
            "decoder self-attention layer type = dynamic convolution 2-dimentional"
        )
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                DynamicConvolution2D(
                    conv_wshare,
                    attention_dim,
                    self_attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_kernel_mask=True,
                    use_bias=conv_usebias,
                ),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    else:
        # BUG FIX: unknown types previously fell through silently and
        # self.decoders was never created, deferring the failure to the
        # first forward pass.
        raise ValueError("unknown selfattention_layer_type: "
                         + str(selfattention_layer_type))
    self.selfattention_layer_type = selfattention_layer_type
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    if use_output_layer:
        self.output_layer = torch.nn.Linear(attention_dim, odim)
    else:
        self.output_layer = None
def __init__(
    self,
    languages,
    odim_dict,
    selfattention_layer_type="selfattn",
    attention_dim=256,
    attention_heads=4,
    conv_wshare=4,
    conv_kernel_length=11,
    conv_usebias=False,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    positional_dropout_rate=0.1,
    self_attention_dropout_rate=0.0,
    src_attention_dropout_rate=0.0,
    input_layer="embed",
    use_output_layer=True,
    pos_enc_class=PositionalEncoding,
    normalize_before=True,
    concat_after=False,
    sim_adapter=False,
    shared_adapter=False,
    use_adapters=True,
    fusion_languages=None,
):
    """Construct a multilingual adaptive decoder.

    Per-language embeddings and output layers are keyed by the languages
    in ``odim_dict``; the decoder layers carry per-language adapters
    (AdaptiveDecoderLayer) and optionally a SimAdapter fusing
    ``fusion_languages``.

    Raises:
        NotImplementedError: For non-"embed" input layers or non-"selfattn"
            layer types.
    """
    # Parent is initialized with a dummy odim of 1; the embeddings and
    # output layers it creates are replaced below.
    super().__init__(1, selfattention_layer_type, attention_dim,
                     attention_heads, conv_wshare, conv_kernel_length,
                     conv_usebias, linear_units, num_blocks, dropout_rate,
                     positional_dropout_rate, self_attention_dropout_rate,
                     src_attention_dropout_rate, input_layer,
                     use_output_layer, pos_enc_class, normalize_before,
                     concat_after)
    if input_layer == "embed":
        # One embedding table per language, each with its own vocab size.
        self.embed = torch.nn.ModuleDict()
        for lang in odim_dict.keys():
            self.embed[lang] = torch.nn.Sequential(
                torch.nn.Embedding(odim_dict[lang], attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
    else:
        raise NotImplementedError("only support embed embedding layer")
    # NOTE(review): downstream code appears to assume one shared dropout
    # rate for self- and source-attention; this assert enforces it.
    assert self_attention_dropout_rate == src_attention_dropout_rate
    if selfattention_layer_type == "selfattn":
        logging.info("decoder self-attention layer type = self-attention")
        self.decoders = repeat(
            num_blocks,
            lambda lnum: AdaptiveDecoderLayer(
                languages,
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     self_attention_dropout_rate),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
                # Optional SimAdapter fusing the given languages, keyed by
                # their sorted, underscore-joined names.
                torch.nn.ModuleDict({
                    "_".join(sorted(fusion_languages)):
                    SimAdapter(attention_dim, self_attention_dropout_rate,
                               fusion_languages)
                }) if sim_adapter else None,
                shared_adapter,
                use_adapters,
            ),
        )
    else:
        raise NotImplementedError(
            "Only support self-attention decoder layer")
    if use_output_layer:
        # One output projection per language.
        self.output_layer = torch.nn.ModuleDict()
        for lang in odim_dict.keys():
            self.output_layer[lang] = torch.nn.Linear(
                attention_dim, odim_dict[lang])
    else:
        self.output_layer = None
def build_conformer_block(
    configuration: Dict[str, Any],
    main_params: Dict[str, Any],
) -> Conformer:
    """Build Conformer block.

    Args:
        configuration: Conformer block configuration (a mapping of block
            parameters; the original ``List[...]`` annotation was wrong —
            the body subscripts it with string keys).
        main_params: Encoder main parameters.

    Returns:
        : Conformer block function.
    """
    hidden_size = configuration["hidden_size"]
    linear_size = configuration["linear_size"]
    # Position-wise feed-forward arguments (used twice, macaron style).
    pos_wise_args = (
        hidden_size,
        linear_size,
        configuration.get("pos_wise_dropout_rate", 0.0),
        main_params["pos_wise_act"],
    )
    conv_mod_norm_class, conv_mod_norm_args = get_normalization(
        main_params["conv_mod_norm_type"],
        eps=configuration.get("conv_mod_norm_eps"),
        momentum=configuration.get("conv_mod_norm_momentum"),
        partial=configuration.get("conv_mod_norm_partial"),
    )
    conv_mod_args = (
        hidden_size,
        configuration["conv_mod_kernel_size"],
        main_params["conv_mod_act"],
        conv_mod_norm_class,
        conv_mod_norm_args,
        main_params["dynamic_chunk_training"],
    )
    mult_att_args = (
        configuration.get("heads", 4),
        hidden_size,
        configuration.get("att_dropout_rate", 0.0),
        main_params["simplified_att_score"],
    )
    norm_class, norm_args = get_normalization(
        main_params["norm_type"],
        eps=configuration.get("norm_eps"),
        partial=configuration.get("norm_partial"),
    )
    return lambda: Conformer(
        hidden_size,
        RelPositionMultiHeadedAttention(*mult_att_args),
        PositionwiseFeedForward(*pos_wise_args),
        PositionwiseFeedForward(*pos_wise_args),
        ConformerConvolution(*conv_mod_args),
        norm_class=norm_class,
        norm_args=norm_args,
        dropout_rate=configuration.get("dropout_rate", 0.0),
    )
def __init__(self, odim, attention_dim=256, attention_heads=4,
             linear_units=2048, num_blocks=6, dropout_rate=0.1,
             positional_dropout_rate=0.1, self_attention_dropout_rate=0.0,
             src_attention_dropout_rate=0.0, input_layer="embed",
             use_output_layer=True, pos_enc_class=PositionalEncoding,
             normalize_before=True, concat_after=False,
             cross_operator=None, cross_shared=False,
             cross_weight_learnable=False, cross_weight=0.0):
    """Construct an Decoder object.

    In addition to the standard Transformer decoder, ``cross_operator``
    enables extra cross-attention modules: it is a string containing
    'self_' and/or 'src_' to select which attentions get a cross branch,
    and 'concat' or 'sum' to select how the branch is combined.

    Raises:
        NotImplementedError: For unsupported ``input_layer`` values, or a
            ``cross_operator`` lacking both 'concat' and 'sum'.
    """
    torch.nn.Module.__init__(self)
    if input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(odim, attention_dim),
            pos_enc_class(attention_dim, positional_dropout_rate))
    elif input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(odim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate))
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(attention_dim, positional_dropout_rate))
    else:
        raise NotImplementedError(
            "only `embed` or torch.nn.Module is supported.")
    self.normalize_before = normalize_before
    # Flags only; the actual cross-attention modules are created inside
    # the repeat() lambda so each layer gets its own instances.
    cross_self_attn = None
    cross_src_attn = None
    if cross_operator:
        if 'src_' in cross_operator:
            # cross_src_attn = MultiHeadedAttention(attention_heads, attention_dim, self_attention_dropout_rate)
            cross_src_attn = True
        if 'self_' in cross_operator:
            if cross_shared and cross_src_attn is not None:
                # cross_self_attn = cross_src_attn
                cross_self_attn = True # TODO: backward compatibility for shared self and source
            else:
                # cross_self_attn = MultiHeadedAttention(attention_heads, attention_dim, self_attention_dropout_rate)
                cross_self_attn = True
        # Reduce the operator string to its combination mode.
        if 'concat' in cross_operator:
            cross_operator = 'concat'
        elif 'sum' in cross_operator:
            cross_operator = 'sum'
        else:
            raise NotImplementedError
    self.decoders = repeat(
        num_blocks,
        lambda: DecoderLayer(
            attention_dim,
            MultiHeadedAttention(attention_heads, attention_dim,
                                 self_attention_dropout_rate),
            MultiHeadedAttention(attention_heads, attention_dim,
                                 src_attention_dropout_rate),
            PositionwiseFeedForward(attention_dim, linear_units,
                                    dropout_rate),
            dropout_rate,
            normalize_before,
            concat_after,
            cross_self_attn=MultiHeadedAttention(
                attention_heads, attention_dim, self_attention_dropout_rate
            ) if cross_self_attn else None,
            # NOTE(review): the src cross-attention uses
            # self_attention_dropout_rate (not src_attention_dropout_rate);
            # the commented-out prototype above does the same, but confirm
            # this is intentional rather than a copy-paste.
            cross_src_attn=MultiHeadedAttention(
                attention_heads, attention_dim, self_attention_dropout_rate
            ) if cross_src_attn else None,
            cross_operator=cross_operator,
            cross_shared=cross_shared,
            cross_weight_learnable=cross_weight_learnable,
            cross_weight=cross_weight))
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    if use_output_layer:
        self.output_layer = torch.nn.Linear(attention_dim, odim)
    else:
        self.output_layer = None
def __init__(self, idim, odim, args, ignore_id=-1):
    """Construct an E2E object.

    :param int idim: dimension of inputs
    :param int odim: dimension of outputs
    :param Namespace args: argument Namespace containing options
    :param int ignore_id: label id ignored by the loss (padding)
    """
    torch.nn.Module.__init__(self)
    if args.transformer_attn_dropout_rate is None:
        args.transformer_attn_dropout_rate = args.dropout_rate

    # Two parallel encoders (CN / EN streams) built from identical settings.
    shared_encoder_kwargs = dict(
        idim=idim,
        attention_dim=args.adim,
        attention_heads=args.aheads,
        linear_units=args.eunits,
        num_blocks=args.elayers,
        input_layer=args.transformer_input_layer,
        dropout_rate=args.dropout_rate,
        positional_dropout_rate=args.dropout_rate,
        attention_dropout_rate=args.transformer_attn_dropout_rate)
    self.cn_encoder = Encoder(**shared_encoder_kwargs)
    self.en_encoder = Encoder(**shared_encoder_kwargs)

    # Gated-add aggregation of the two encoder outputs: the gate (lambda)
    # is either a per-dimension vector or a single scalar.
    self.vectorize_lambda = args.vectorize_lambda
    if self.vectorize_lambda:
        gate_dim = args.adim
    else:
        gate_dim = 1
    self.aggregation_module = torch.nn.Sequential(
        torch.nn.Linear(2 * args.adim, gate_dim),
        torch.nn.Sigmoid())

    # One extra encoder layer (plus norm) applied after aggregation.
    self.additional_encoder_layer = EncoderLayer(
        args.adim,
        MultiHeadedAttention(args.aheads, args.adim,
                             args.transformer_attn_dropout_rate),
        PositionwiseFeedForward(args.adim, args.eunits, args.dropout_rate),
        args.dropout_rate,
        normalize_before=True,
        concat_after=False)
    self.additional_after_norm = LayerNorm(args.adim)

    self.decoder = Decoder(
        odim=odim,
        attention_dim=args.adim,
        attention_heads=args.aheads,
        linear_units=args.dunits,
        num_blocks=args.dlayers,
        dropout_rate=args.dropout_rate,
        positional_dropout_rate=args.dropout_rate,
        self_attention_dropout_rate=args.transformer_attn_dropout_rate,
        src_attention_dropout_rate=args.transformer_attn_dropout_rate)

    # <sos> and <eos> share the last vocabulary index.
    self.sos = odim - 1
    self.eos = odim - 1
    self.odim = odim
    self.ignore_id = ignore_id
    self.subsample = [1]
    self.reporter = Reporter()

    self.criterion = LabelSmoothingLoss(
        self.odim,
        self.ignore_id,
        args.lsm_weight,
        args.transformer_length_normalized_loss)
    self.adim = args.adim
    self.mtlalpha = args.mtlalpha

    # Auxiliary CTC head only when the multitask weight is positive.
    if args.mtlalpha > 0.0:
        self.ctc = CTC(odim, args.adim, args.dropout_rate,
                       ctc_type=args.ctc_type, reduce=True)
    else:
        self.ctc = None

    self.error_calculator = None
    if args.report_cer or args.report_wer:
        from espnet.nets.e2e_asr_common import ErrorCalculator
        self.error_calculator = ErrorCalculator(
            args.char_list, args.sym_space, args.sym_blank,
            args.report_cer, args.report_wer)
    self.rnnlm = None

    # yzl23 config
    self.remove_blank_in_ctc_mode = True

    # Reset parameters last, then report the model size.
    self.reset_parameters(args)
    logging.warning(
        "Model total size: {}M, requires_grad size: {}M".format(
            self.count_parameters(),
            self.count_parameters(requires_grad=True)))
def __init__(self, idim, center_len=8, left_len=0, hop_len=0, right_len=0, abs_pos=1, rel_pos=0, use_mem=1, att_type="mta", attention_dim=256, attention_heads=4, linear_units=2048, num_blocks=6, dropout_rate=0.1, positional_dropout_rate=0.1, attention_dropout_rate=0.0, input_layer="conv2d", pos_enc_class=PositionalEncoding, normalize_before=True, concat_after=False):
    """Construct a chunk-based Transformer encoder with optional memory.

    :param int idim: input feature dimension
    :param int center_len: length of the central (current) chunk
    :param int left_len: left context length; becomes memory when use_mem
    :param int hop_len: hop size between consecutive chunks
    :param int right_len: right (future) context length
    :param int abs_pos: nonzero enables absolute positional encoding
    :param int rel_pos: nonzero enables relative positional encoding
    :param int use_mem: nonzero enables the recurrent memory mechanism
    :param str att_type: attention variant passed to each EncoderLayer
    :param int attention_dim: model hidden dimension
    :param int attention_heads: number of attention heads
    :param int linear_units: feed-forward hidden units
    :param int num_blocks: number of encoder layers
    :param float dropout_rate: dropout rate inside layers
    :param float positional_dropout_rate: dropout after positional encoding
    :param float attention_dropout_rate: dropout inside attention
    :param str input_layer: input-layer type (e.g. "conv2d")
    :param pos_enc_class: positional-encoding class to instantiate
    :param bool normalize_before: apply LayerNorm before each sub-layer
    :param bool concat_after: concatenate attention input and output
    """
    super(Encoder, self).__init__()
    self.idim = idim
    self.center_len = center_len
    self.left_len = left_len
    self.hop_len = hop_len
    self.right_len = right_len
    self.use_mem = use_mem != 0
    self.abs_pos = abs_pos != 0
    self.rel_pos = rel_pos != 0
    # Left context is carried as memory only when the memory mechanism
    # is enabled; otherwise no memory is kept.
    self.mem_len = left_len if self.use_mem else 0
    self.attention_dim = attention_dim
    self.attention_heads = attention_heads
    self.linear_units = linear_units
    self.dropout_rate = dropout_rate
    self.input_layer = input_layer
    self.normalize_before = normalize_before
    self.concat_after = concat_after
    self.positional_dropout_rate = positional_dropout_rate
    self.pos_enc_class = pos_enc_class
    self._generateInputLayer()

    head_dim = attention_dim // attention_heads
    # NOTE(review): ext_len/mem_len are divided by 4, presumably to match
    # the conv2d input layer's 4x time subsampling — confirm against
    # _generateInputLayer.
    self.encoders = repeat(
        num_blocks,
        lambda: EncoderLayer(
            n_head=attention_heads,
            d_model=attention_dim,
            d_head=head_dim,
            att_type=att_type,
            ext_len=hop_len // 4,
            mem_len=self.mem_len // 4,
            tgt_len=center_len,
            future_len=right_len,
            rel_pos=rel_pos,
            dropout=dropout_rate,
            dropatt=attention_dropout_rate,
            pre_lnorm=normalize_before,
            pos_ff=PositionwiseFeedForward(attention_dim, linear_units,
                                           dropout_rate)))
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
def __init__(self, odim, attention_dim=256, attention_heads=4, linear_units=2048, num_blocks=6, dropout_rate=0.1, positional_dropout_rate=0.1, self_attention_dropout_rate=0.0, src_attention_dropout_rate=0.0, input_layer="embed", use_output_layer=True, pos_enc_class=PositionalEncoding, normalize_before=True, concat_after=False, cross_operator=None, cross_weight_learnable=False, cross_weight=0.0, cross_self=False, cross_src=False, cross_to_asr=True, cross_to_st=True):
    """Construct a dual Decoder object (parallel ST and ASR streams).

    :param int odim: output (vocabulary) dimension, shared by both streams
    :param int attention_dim: model hidden dimension
    :param int attention_heads: number of attention heads
    :param int linear_units: feed-forward hidden units
    :param int num_blocks: number of dual-decoder layers
    :param float dropout_rate: dropout rate applied inside layers
    :param float positional_dropout_rate: dropout after positional encoding
    :param float self_attention_dropout_rate: dropout inside self-attention
    :param float src_attention_dropout_rate: dropout inside source attention
    :param str input_layer: "embed", "linear", or a torch.nn.Module instance
    :param bool use_output_layer: attach final linear projections to odim
    :param pos_enc_class: positional-encoding class to instantiate
    :param bool normalize_before: apply LayerNorm before each sub-layer
    :param bool concat_after: concatenate attention input and output
    :param cross_operator: how cross-stream attention is combined
    :param bool cross_weight_learnable: make the cross weight a parameter
    :param float cross_weight: initial/static cross-attention weight
    :param bool cross_self: enable cross-stream self-attention
    :param bool cross_src: enable cross-stream source attention
    :param bool cross_to_asr: feed cross-stream attention into the ASR stream
    :param bool cross_to_st: feed cross-stream attention into the ST stream
    :raises NotImplementedError: for unsupported input_layer values
    """
    torch.nn.Module.__init__(self)

    def _build_embed():
        # Build one input embedding stack. Called once per stream so each
        # call creates fresh Embedding/Linear modules (matching the
        # original, which built them separately per stream). When a
        # torch.nn.Module is given, that SAME module instance is shared
        # by both streams, also matching the original behavior.
        if input_layer == "embed":
            return torch.nn.Sequential(
                torch.nn.Embedding(odim, attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate))
        if input_layer == "linear":
            return torch.nn.Sequential(
                torch.nn.Linear(odim, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate))
        if isinstance(input_layer, torch.nn.Module):
            return torch.nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate))
        raise NotImplementedError(
            "only `embed` or torch.nn.Module is supported.")

    self.embed = _build_embed()
    self.embed_asr = _build_embed()
    self.normalize_before = normalize_before

    self.dual_decoders = repeat(
        num_blocks,
        lambda: DualDecoderLayer(
            attention_dim,
            attention_dim,
            MultiHeadedAttention(attention_heads, attention_dim,
                                 self_attention_dropout_rate),
            MultiHeadedAttention(attention_heads, attention_dim,
                                 src_attention_dropout_rate),
            PositionwiseFeedForward(attention_dim, linear_units,
                                    dropout_rate),
            MultiHeadedAttention(attention_heads, attention_dim,
                                 self_attention_dropout_rate),
            MultiHeadedAttention(attention_heads, attention_dim,
                                 src_attention_dropout_rate),
            PositionwiseFeedForward(attention_dim, linear_units,
                                    dropout_rate),
            cross_self_attn=MultiHeadedAttention(
                attention_heads, attention_dim,
                self_attention_dropout_rate)
            if (cross_self and cross_to_st) else None,
            cross_self_attn_asr=MultiHeadedAttention(
                attention_heads, attention_dim,
                self_attention_dropout_rate)
            if (cross_self and cross_to_asr) else None,
            # BUG FIX: the cross *source* attentions previously used
            # self_attention_dropout_rate; they now use the same dropout
            # rate as the regular source attentions above.
            cross_src_attn=MultiHeadedAttention(
                attention_heads, attention_dim,
                src_attention_dropout_rate)
            if (cross_src and cross_to_st) else None,
            cross_src_attn_asr=MultiHeadedAttention(
                attention_heads, attention_dim,
                src_attention_dropout_rate)
            if (cross_src and cross_to_asr) else None,
            dropout_rate=dropout_rate,
            normalize_before=normalize_before,
            concat_after=concat_after,
            cross_operator=cross_operator,
            cross_weight_learnable=cross_weight_learnable,
            cross_weight=cross_weight,
            cross_to_asr=cross_to_asr,
            cross_to_st=cross_to_st))
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
        self.after_norm_asr = LayerNorm(attention_dim)
    if use_output_layer:
        self.output_layer = torch.nn.Linear(attention_dim, odim)
        self.output_layer_asr = torch.nn.Linear(attention_dim, odim)
    else:
        self.output_layer = None
        self.output_layer_asr = None