def __init__( self, vocab_size: int, encoder_output_size: int, attention_heads: int = 4, linear_units: int = 2048, num_blocks: int = 6, dropout_rate: float = 0.1, positional_dropout_rate: float = 0.1, self_attention_dropout_rate: float = 0.0, src_attention_dropout_rate: float = 0.0, input_layer: str = "embed", use_output_layer: bool = True, pos_enc_class=PositionalEncoding, normalize_before: bool = True, concat_after: bool = False, ): assert check_argument_types() super().__init__() attention_dim = encoder_output_size if input_layer == "embed": self.embed = torch.nn.Sequential( torch.nn.Embedding(vocab_size, attention_dim), pos_enc_class(attention_dim, positional_dropout_rate), ) elif input_layer == "linear": self.embed = torch.nn.Sequential( torch.nn.Linear(vocab_size, attention_dim), torch.nn.LayerNorm(attention_dim), torch.nn.Dropout(dropout_rate), torch.nn.ReLU(), pos_enc_class(attention_dim, positional_dropout_rate), ) else: raise ValueError( f"only 'embed' or 'linear' is supported: {input_layer}") self.normalize_before = normalize_before self.decoders = repeat( num_blocks, lambda: DecoderLayer( attention_dim, MultiHeadedAttention(attention_heads, attention_dim, self_attention_dropout_rate), MultiHeadedAttention(attention_heads, attention_dim, src_attention_dropout_rate), PositionwiseFeedForward(attention_dim, linear_units, dropout_rate), dropout_rate, normalize_before, concat_after, ), ) if self.normalize_before: self.after_norm = LayerNorm(attention_dim) if use_output_layer: self.output_layer = torch.nn.Linear(attention_dim, vocab_size) else: self.output_layer = None
def __init__(self, odim, attention_dim=256, attention_heads=4, linear_units=2048,
             num_blocks=6, dropout_rate=0.1, positional_dropout_rate=0.1,
             self_attention_dropout_rate=0.0, src_attention_dropout_rate=0.0,
             input_layer="embed", use_output_layer=True,
             pos_enc_class=PositionalEncoding, normalize_before=True,
             concat_after=False, moe_att_mode='linear'):
    """Construct a Decoder object."""
    torch.nn.Module.__init__(self)
    if input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(odim, attention_dim),
            pos_enc_class(attention_dim, positional_dropout_rate))
    elif input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(odim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate))
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(attention_dim, positional_dropout_rate))
    else:
        raise NotImplementedError(
            "only `embed`, `linear` or torch.nn.Module is supported.")
    self.normalize_before = normalize_before
    self.decoders = repeat(
        num_blocks,
        lambda: HANDecoderLayer(
            attention_dim,
            MultiHeadedAttention(attention_heads, attention_dim,
                                 self_attention_dropout_rate),
            MultiHeadedAttention(attention_heads, attention_dim,
                                 src_attention_dropout_rate),
            MultiHeadedAttention(attention_heads, attention_dim,
                                 src_attention_dropout_rate),
            PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
            dropout_rate=dropout_rate,
            moe_att_mode=moe_att_mode,
            normalize_before=normalize_before,
            concat_after=concat_after,
        ))
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    if use_output_layer:
        self.output_layer = torch.nn.Linear(attention_dim, odim)
    else:
        self.output_layer = None
def __init__(self, n_head, d_model, d_head, pos_ff, att_type, dropout, dropatt,
             pre_lnorm, tgt_len=None, ext_len=0, mem_len=0, future_len=0,
             rel_pos=True):
    super(EncoderLayer, self).__init__()
    self.register_buffer('mems', None)
    self.n_head = n_head
    self.d_head = d_head
    self.d_model = d_model
    self.mem_len = mem_len
    self.rel_pos = rel_pos
    self.future_len = future_len
    self.tgt_len = tgt_len
    if att_type == "mta":
        self.att = MultiHeadedAttention(n_head, d_model, dropatt)
    elif att_type == "win":
        self.att = WinMultiHeadedAttention(n_head, d_model, dropatt)
    elif att_type == "smooth":
        self.att = SmoothMultiHeadedAttention(n_head, d_model, dropatt)
    elif att_type == "rel":
        self.att = RelMultiHeadedAttention(n_head, d_model, dropatt)
    else:
        raise ValueError("unknown attention type: " + att_type)
    self.layer = CashEncoderLayer(d_model, self.att, pos_ff, dropout,
                                  pre_lnorm, concat_after=False)
    self.drop = nn.Dropout(dropout)
    self.ext_len = ext_len
    if rel_pos:
        self.re_pos_embed = PositionalEncoding(self.d_model, dropout)
    else:
        self.re_pos_embed = None
def __init__( self, odim, jdim, attention_dim=512, attention_heads=4, linear_units=2048, num_blocks=6, dropout_rate=0.1, positional_dropout_rate=0.0, attention_dropout_rate=0.0, input_layer="embed", pos_enc_class=PositionalEncoding, blank=0, ): """Construct a Decoder object for transformer-transducer models.""" torch.nn.Module.__init__(self) if input_layer == "embed": self.embed = torch.nn.Sequential( torch.nn.Embedding(odim, attention_dim), pos_enc_class(attention_dim, positional_dropout_rate), ) elif input_layer == "linear": self.embed = torch.nn.Sequential( torch.nn.Linear(odim, attention_dim), torch.nn.LayerNorm(attention_dim), torch.nn.Dropout(dropout_rate), torch.nn.ReLU(), pos_enc_class(attention_dim, positional_dropout_rate), ) elif isinstance(input_layer, torch.nn.Module): self.embed = torch.nn.Sequential( input_layer, pos_enc_class(attention_dim, positional_dropout_rate) ) else: raise NotImplementedError("only `embed` or torch.nn.Module is supported.") self.decoders = repeat( num_blocks, lambda lnum: DecoderLayer( attention_dim, MultiHeadedAttention( attention_heads, attention_dim, attention_dropout_rate ), PositionwiseFeedForward(attention_dim, linear_units, dropout_rate), dropout_rate, ), ) self.after_norm = LayerNorm(attention_dim) self.lin_enc = torch.nn.Linear(attention_dim, jdim) self.lin_dec = torch.nn.Linear(attention_dim, jdim, bias=False) self.lin_out = torch.nn.Linear(jdim, odim) self.attention_dim = attention_dim self.odim = odim self.blank = blank
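# A minimal sketch of how the joint network declared above (lin_enc, lin_dec,
# lin_out) is typically combined in transducer models. This is an assumption
# made for illustration only; the actual forward pass of this decoder is not
# shown in this excerpt.
def joint_sketch(self, h_enc, h_dec):
    """Hypothetical joint computation for a transformer-transducer decoder."""
    # h_enc: (B, T, 1, attention_dim), h_dec: (B, 1, U, attention_dim)
    joint = torch.tanh(self.lin_enc(h_enc) + self.lin_dec(h_dec))  # (B, T, U, jdim)
    return self.lin_out(joint)  # (B, T, U, odim) logits over output symbols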
def __init__(self, n_head, d_model, d_head, pos_ff, dropout, dropatt, pre_lnorm, tgt_len=None, ext_len=0, mem_len=0, future_len=0, rel_pos=True): super(EncoderLayer, self).__init__() self.register_buffer('mems', None) self.n_head = n_head self.d_head = d_head self.d_model = d_model self.mem_len = mem_len self.rel_pos = rel_pos self.future_len = future_len self.tgt_len = tgt_len self.layer = CashEncoderLayer(d_model, MultiHeadedAttention( n_head, d_model, dropatt), pos_ff, dropout, pre_lnorm, concat_after=False) self.drop = nn.Dropout(dropout) self.ext_len = ext_len
def __init__(self, idim, time_len=8, mem_len=0, ext_len=0, future_len=0,
             attention_type="memory", attention_dim=256, attention_heads=4,
             linear_units=2048, num_blocks=6, dropout_rate=0.1,
             positional_dropout_rate=0.1, attention_dropout_rate=0.0,
             input_layer="conv2d", pos_enc_class=PositionalEncoding,
             normalize_before=True, concat_after=False):
    super(Encoder, self).__init__()
    self.idim = idim
    self.time_len = time_len
    self.future_len = future_len
    self.attention_dim = attention_dim
    self.attention_heads = attention_heads
    self.linear_units = linear_units
    self.dropout_rate = dropout_rate
    self.input_layer = input_layer
    self.normalize_before = normalize_before
    self.concat_after = concat_after
    self.attention_type = attention_type
    self.positional_dropout_rate = positional_dropout_rate
    self.pos_enc_class = pos_enc_class
    self._generateInputLayer()
    if attention_type == "memory":
        self.encoders = repeat(
            num_blocks,
            lambda: EncoderLayerXL(
                n_head=attention_heads,
                d_model=attention_dim,
                d_head=attention_dim // attention_heads,
                ext_len=ext_len,
                mem_len=mem_len,
                future_len=future_len,
                dropout=dropout_rate,
                dropatt=attention_dropout_rate,
                pre_lnorm=normalize_before,
                pos_ff=PositionwiseFeedForward(attention_dim, linear_units,
                                               dropout_rate)))
    elif attention_type == "traditional":
        self.encoders = repeat(
            num_blocks,
            lambda: EncoderLayerTD(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate, normalize_before, concat_after))
    else:
        raise ValueError("only memory or traditional can be used")
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
def __init__(self, idim, args): super(Encoder, self).__init__() if args.transformer_input_layer == "linear": self.input_layer = torch.nn.Sequential( torch.nn.Linear(idim, args.adim), torch.nn.LayerNorm(args.adim), torch.nn.Dropout(args.dropout_rate), torch.nn.ReLU(), PositionalEncoding(args.adim, args.dropout_rate)) elif args.transformer_input_layer == "conv2d": self.input_layer = Conv2dSubsampling(idim, args.adim, args.dropout_rate) elif args.transformer_input_layer == "embed": self.input_layer = torch.nn.Sequential( torch.nn.Embedding(idim, args.adim), PositionalEncoding(args.adim, args.dropout_rate)) else: raise ValueError("unknown input_layer: " + args.transformer_input_layer) self.encoders = repeat( args.elayers, lambda: EncoderLayer( args.adim, MultiHeadedAttention(args.aheads, args.adim, args. transformer_attn_dropout_rate), PositionwiseFeedForward(args.adim, args.eunits, args. dropout_rate), args.dropout_rate)) self.norm = LayerNorm(args.adim)
def __init__( self, vocab_size: int, encoder_output_size: int, attention_heads: int = 4, linear_units: int = 2048, num_blocks: int = 6, dropout_rate: float = 0.1, positional_dropout_rate: float = 0.1, self_attention_dropout_rate: float = 0.0, src_attention_dropout_rate: float = 0.0, input_layer: str = "embed", use_output_layer: bool = True, pos_enc_class=PositionalEncoding, normalize_before: bool = True, concat_after: bool = False, conv_wshare: int = 4, conv_kernel_length: Sequence[int] = (11, 11, 11, 11, 11, 11), conv_usebias: int = False, ): assert check_argument_types() if len(conv_kernel_length) != num_blocks: raise ValueError( "conv_kernel_length must have equal number of values to num_blocks: " f"{len(conv_kernel_length)} != {num_blocks}") super().__init__( vocab_size=vocab_size, encoder_output_size=encoder_output_size, dropout_rate=dropout_rate, positional_dropout_rate=positional_dropout_rate, input_layer=input_layer, use_output_layer=use_output_layer, pos_enc_class=pos_enc_class, normalize_before=normalize_before, ) attention_dim = encoder_output_size self.decoders = repeat( num_blocks, lambda lnum: DecoderLayer( attention_dim, DynamicConvolution2D( wshare=conv_wshare, n_feat=attention_dim, dropout_rate=self_attention_dropout_rate, kernel_size=conv_kernel_length[lnum], use_kernel_mask=True, use_bias=conv_usebias, ), MultiHeadedAttention(attention_heads, attention_dim, src_attention_dropout_rate), PositionwiseFeedForward(attention_dim, linear_units, dropout_rate), dropout_rate, normalize_before, concat_after, ), )
def __init__( self, vocab_size: int, encoder_output_size: int, attention_heads: int = 4, linear_units: int = 2048, num_blocks: int = 6, dropout_rate: float = 0.1, positional_dropout_rate: float = 0.1, self_attention_dropout_rate: float = 0.0, src_attention_dropout_rate: float = 0.0, input_layer: str = "embed", use_output_layer: bool = True, pos_enc_class=PositionalEncoding, normalize_before: bool = True, concat_after: bool = False, ): assert check_argument_types() super().__init__( vocab_size=vocab_size, encoder_output_size=encoder_output_size, dropout_rate=dropout_rate, positional_dropout_rate=positional_dropout_rate, input_layer=input_layer, use_output_layer=use_output_layer, pos_enc_class=pos_enc_class, normalize_before=normalize_before, ) attention_dim = encoder_output_size self.decoders = repeat( num_blocks, lambda lnum: DecoderLayer( attention_dim, MultiHeadedAttention(attention_heads, attention_dim, self_attention_dropout_rate), MultiHeadedAttention(attention_heads, attention_dim, src_attention_dropout_rate), PositionwiseFeedForward(attention_dim, linear_units, dropout_rate), dropout_rate, normalize_before, concat_after, ), )
def __init__(self, odim, args): super(Decoder, self).__init__() self.embed = torch.nn.Sequential( torch.nn.Embedding(odim, args.adim), PositionalEncoding(args.adim, args.dropout_rate) ) self.decoders = repeat( args.dlayers, lambda: DecoderLayer( args.adim, MultiHeadedAttention(args.aheads, args.adim, args.transformer_attn_dropout_rate), MultiHeadedAttention(args.aheads, args.adim, args.transformer_attn_dropout_rate), PositionwiseFeedForward(args.adim, args.dunits, args.dropout_rate), args.dropout_rate ) ) self.output_norm = LayerNorm(args.adim) self.output_layer = torch.nn.Linear(args.adim, odim)
def build_transformer_block(
    net_part: str, block_arch: Dict, pw_layer_type: str, pw_activation_type: str
) -> Union[EncoderLayer, TransformerDecoderLayer]:
    """Build function for transformer block.

    Args:
        net_part: Network part, either 'encoder' or 'decoder'.
        block_arch: Transformer block parameters.
        pw_layer_type: Positionwise layer type.
        pw_activation_type: Positionwise activation type.

    Returns:
        : Function to create transformer (encoder or decoder) block.

    """
    d_hidden = block_arch["d_hidden"]
    d_ff = block_arch["d_ff"]
    heads = block_arch["heads"]

    dropout_rate = block_arch["dropout-rate"] if "dropout-rate" in block_arch else 0.0
    pos_dropout_rate = (
        block_arch["pos-dropout-rate"] if "pos-dropout-rate" in block_arch else 0.0
    )
    att_dropout_rate = (
        block_arch["att-dropout-rate"] if "att-dropout-rate" in block_arch else 0.0
    )

    if pw_layer_type == "linear":
        pw_layer = PositionwiseFeedForward
        pw_activation = get_activation(pw_activation_type)
        pw_layer_args = (d_hidden, d_ff, pos_dropout_rate, pw_activation)
    else:
        raise NotImplementedError(
            "Transformer block only supports the linear positionwise layer.")

    if net_part == "encoder":
        transformer_layer_class = EncoderLayer
    elif net_part == "decoder":
        transformer_layer_class = TransformerDecoderLayer
    else:
        raise ValueError("net_part must be either 'encoder' or 'decoder'.")

    return lambda: transformer_layer_class(
        d_hidden,
        MultiHeadedAttention(heads, d_hidden, att_dropout_rate),
        pw_layer(*pw_layer_args),
        dropout_rate,
    )
def build_transformer_block( net_part: str, block: Dict[str, Any], pw_layer_type: str, pw_activation_type: str, ) -> Union[EncoderLayer, TransformerDecoderLayer]: """Build function for transformer block. Args: net_part: Network part, either 'encoder' or 'decoder'. block: Transformer block parameters. pw_layer_type: Positionwise layer type. pw_activation_type: Positionwise activation type. Returns: : Function to create transformer (encoder or decoder) block. """ d_hidden = block["d_hidden"] dropout_rate = block.get("dropout-rate", 0.0) pos_dropout_rate = block.get("pos-dropout-rate", 0.0) att_dropout_rate = block.get("att-dropout-rate", 0.0) if pw_layer_type != "linear": raise NotImplementedError( "Transformer block only supports linear pointwise layer.") if net_part == "encoder": transformer_layer_class = EncoderLayer elif net_part == "decoder": transformer_layer_class = TransformerDecoderLayer return lambda: transformer_layer_class( d_hidden, MultiHeadedAttention(block["heads"], d_hidden, att_dropout_rate), PositionwiseFeedForward( d_hidden, block["d_ff"], pos_dropout_rate, get_activation(pw_activation_type), ), dropout_rate, )
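# Illustrative usage of build_transformer_block: the block dict keys mirror the
# ones read above; "relu" as the positionwise activation name is an assumption
# about what get_activation accepts.
example_block = {"d_hidden": 256, "d_ff": 2048, "heads": 4,
                 "dropout-rate": 0.1, "att-dropout-rate": 0.0}
make_encoder_layer = build_transformer_block("encoder", example_block, "linear", "relu")
encoder_layer = make_encoder_layer()  # instantiate one fresh EncoderLayer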
def __init__(self, idim, attention_dim=256, attention_heads=4, linear_units=2048, num_blocks=6, dropout_rate=0.1, positional_dropout_rate=0.1, attention_dropout_rate=0.0, input_layer="conv2d", pos_enc_class=PositionalEncoding, normalize_before=True, concat_after=False): super(Encoder, self).__init__() if input_layer == "linear": self.embed = torch.nn.Sequential( torch.nn.Linear(idim, attention_dim), torch.nn.LayerNorm(attention_dim), torch.nn.Dropout(dropout_rate), torch.nn.ReLU(), pos_enc_class(attention_dim, positional_dropout_rate)) elif input_layer == "conv2d": self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate) elif input_layer == "embed": self.embed = torch.nn.Sequential( torch.nn.Embedding(idim, attention_dim), pos_enc_class(attention_dim, positional_dropout_rate)) elif isinstance(input_layer, torch.nn.Module): self.embed = torch.nn.Sequential( input_layer, pos_enc_class(attention_dim, positional_dropout_rate), ) else: raise ValueError("unknown input_layer: " + input_layer) self.normalize_before = normalize_before self.encoders = repeat( num_blocks, lambda: EncoderLayer( attention_dim, MultiHeadedAttention(attention_heads, attention_dim, attention_dropout_rate), PositionwiseFeedForward(attention_dim, linear_units, dropout_rate), dropout_rate, normalize_before, concat_after)) if self.normalize_before: self.after_norm = LayerNorm(attention_dim)
def build_transformer_block(net_part, block_arch, pw_layer_type, pw_activation_type):
    """Build function for transformer block.

    Args:
        net_part (str): either 'encoder' or 'decoder'
        block_arch (dict): transformer block parameters
        pw_layer_type (str): positionwise layer type
        pw_activation_type (str): positionwise activation type

    Returns:
        (function): function to create transformer block

    """
    d_hidden = block_arch["d_hidden"]
    d_ff = block_arch["d_ff"]
    heads = block_arch["heads"]

    dropout_rate = block_arch["dropout-rate"] if "dropout-rate" in block_arch else 0.0
    pos_dropout_rate = (block_arch["pos-dropout-rate"]
                        if "pos-dropout-rate" in block_arch else 0.0)
    att_dropout_rate = (block_arch["att-dropout-rate"]
                        if "att-dropout-rate" in block_arch else 0.0)

    if pw_layer_type == "linear":
        pw_layer = PositionwiseFeedForward
        pw_activation = get_activation(pw_activation_type)
        pw_layer_args = (d_hidden, d_ff, pos_dropout_rate, pw_activation)
    else:
        raise NotImplementedError(
            "Transformer block only supports the linear positionwise layer.")

    if net_part == "encoder":
        transformer_layer_class = EncoderLayer
    elif net_part == "decoder":
        transformer_layer_class = DecoderLayer
    else:
        raise ValueError("net_part must be either 'encoder' or 'decoder'.")

    return lambda: transformer_layer_class(
        d_hidden,
        MultiHeadedAttention(heads, d_hidden, att_dropout_rate),
        pw_layer(*pw_layer_args),
        dropout_rate,
    )
def __init__(self, hparams, window_sizes=[100, 50], channels=[128, 64, 32], dropout_rate=0.3): super(Wgan_GP, self).__init__() self.hparams = hparams self.window_sizes = window_sizes self.channels = channels self.convs = torch.nn.ModuleList() self.smooth_dense_layer = torch.nn.ModuleList() for k in range(len(channels)): self.convs_k = torch.nn.Sequential( Conv2Norm(in_channels=1, out_channels=channels[k], kernel_size=(3, 3), bias=False, w_init_gain='leaky_relu'), torch.nn.BatchNorm2d(channels[k]), torch.nn.ReLU(), Conv2Norm(in_channels=channels[k], out_channels=channels[k], kernel_size=(3, 3), bias=False, w_init_gain='leaky_relu'), torch.nn.BatchNorm2d(channels[k]), torch.nn.ReLU(), Conv2Norm(in_channels=channels[k], out_channels=channels[k], kernel_size=(3, 3), bias=False, w_init_gain='leaky_relu'), torch.nn.BatchNorm2d(channels[k]), torch.nn.ReLU(), torch.nn.Dropout(dropout_rate)) self.dense_k = torch.nn.Linear(channels[k] * hparams.num_mels, 32) self.convs.append(self.convs_k) self.smooth_dense_layer.append(self.dense_k) self.multihead_attention = MultiHeadedAttention( hparams.aheads, 32, hparams.transformer_enc_dropout_rate) self.smooth_dense_layer_final = torch.nn.Linear(32, 1)
def __init__(
    self,
    odim,
    selfattention_layer_type="selfattn",
    attention_dim=256,
    attention_heads=4,
    conv_wshare=4,
    conv_kernel_length=11,
    conv_usebias=False,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    positional_dropout_rate=0.1,
    self_attention_dropout_rate=0.0,
    src_attention_dropout_rate=0.0,
    input_layer="embed",
    use_output_layer=True,
    pos_enc_class=PositionalEncoding,
    normalize_before=True,
    concat_after=False,
):
    """Construct a Decoder object."""
    torch.nn.Module.__init__(self)
    self._register_load_state_dict_pre_hook(_pre_hook)
    if input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(odim, attention_dim),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(odim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer, pos_enc_class(attention_dim, positional_dropout_rate))
    else:
        raise NotImplementedError(
            "only `embed`, `linear` or torch.nn.Module is supported.")
    self.normalize_before = normalize_before
    if selfattention_layer_type == "selfattn":
        logging.info("decoder self-attention layer type = self-attention")
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     self_attention_dropout_rate),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "lightconv":
        logging.info(
            "decoder self-attention layer type = lightweight convolution")
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                LightweightConvolution(
                    conv_wshare,
                    attention_dim,
                    self_attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_kernel_mask=True,
                    use_bias=conv_usebias,
                ),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "lightconv2d":
        logging.info("decoder self-attention layer "
                     "type = lightweight convolution 2-dimensional")
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                LightweightConvolution2D(
                    conv_wshare,
                    attention_dim,
                    self_attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_kernel_mask=True,
                    use_bias=conv_usebias,
                ),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "dynamicconv":
        logging.info("decoder self-attention layer type = dynamic convolution")
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                DynamicConvolution(
                    conv_wshare,
                    attention_dim,
                    self_attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_kernel_mask=True,
                    use_bias=conv_usebias,
                ),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "dynamicconv2d":
        logging.info("decoder self-attention layer type = dynamic "
                     "convolution 2-dimensional")
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                DynamicConvolution2D(
                    conv_wshare,
                    attention_dim,
                    self_attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_kernel_mask=True,
                    use_bias=conv_usebias,
                ),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    self.selfattention_layer_type = selfattention_layer_type
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    if use_output_layer:
        self.output_layer = torch.nn.Linear(attention_dim, odim)
    else:
        self.output_layer = None
def __init__( self, idim, selfattention_layer_type="selfattn", attention_dim=256, attention_heads=4, conv_wshare=4, conv_kernel_length=11, conv_usebias=False, linear_units=2048, num_blocks=6, dropout_rate=0.1, positional_dropout_rate=0.1, attention_dropout_rate=0.0, input_layer="conv2d", pos_enc_class=PositionalEncoding, normalize_before=True, concat_after=False, positionwise_layer_type="linear", positionwise_conv_kernel_size=1, padding_idx=-1, ): """Construct an Encoder object.""" super(Encoder, self).__init__() self._register_load_state_dict_pre_hook(_pre_hook) self.conv_subsampling_factor = 1 if input_layer == "linear": self.embed = torch.nn.Sequential( torch.nn.Linear(idim, attention_dim), torch.nn.LayerNorm(attention_dim), torch.nn.Dropout(dropout_rate), torch.nn.ReLU(), pos_enc_class(attention_dim, positional_dropout_rate), ) elif input_layer == "conv2d": self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate) self.conv_subsampling_factor = 4 elif input_layer == "conv2d-scaled-pos-enc": self.embed = Conv2dSubsampling( idim, attention_dim, dropout_rate, pos_enc_class(attention_dim, positional_dropout_rate), ) self.conv_subsampling_factor = 4 elif input_layer == "conv2d6": self.embed = Conv2dSubsampling6(idim, attention_dim, dropout_rate) self.conv_subsampling_factor = 6 elif input_layer == "conv2d8": self.embed = Conv2dSubsampling8(idim, attention_dim, dropout_rate) self.conv_subsampling_factor = 8 elif input_layer == "vgg2l": self.embed = VGG2L(idim, attention_dim) self.conv_subsampling_factor = 4 elif input_layer == "embed": self.embed = torch.nn.Sequential( torch.nn.Embedding(idim, attention_dim, padding_idx=padding_idx), pos_enc_class(attention_dim, positional_dropout_rate), ) elif isinstance(input_layer, torch.nn.Module): self.embed = torch.nn.Sequential( input_layer, pos_enc_class(attention_dim, positional_dropout_rate), ) elif input_layer is None: self.embed = torch.nn.Sequential( pos_enc_class(attention_dim, positional_dropout_rate)) else: raise ValueError("unknown input_layer: " + input_layer) self.normalize_before = normalize_before positionwise_layer, positionwise_layer_args = self.get_positionwise_layer( positionwise_layer_type, attention_dim, linear_units, dropout_rate, positionwise_conv_kernel_size, ) if selfattention_layer_type == "selfattn": logging.info("encoder self-attention layer type = self-attention") self.encoders = repeat( num_blocks, lambda lnum: EncoderLayer( attention_dim, MultiHeadedAttention(attention_heads, attention_dim, attention_dropout_rate), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after, ), ) elif selfattention_layer_type == "lightconv": logging.info( "encoder self-attention layer type = lightweight convolution") self.encoders = repeat( num_blocks, lambda lnum: EncoderLayer( attention_dim, LightweightConvolution( conv_wshare, attention_dim, attention_dropout_rate, conv_kernel_length, lnum, use_bias=conv_usebias, ), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after, ), ) elif selfattention_layer_type == "lightconv2d": logging.info("encoder self-attention layer " "type = lightweight convolution 2-dimentional") self.encoders = repeat( num_blocks, lambda lnum: EncoderLayer( attention_dim, LightweightConvolution2D( conv_wshare, attention_dim, attention_dropout_rate, conv_kernel_length, lnum, use_bias=conv_usebias, ), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after, ), ) elif selfattention_layer_type == "dynamicconv": 
logging.info( "encoder self-attention layer type = dynamic convolution") self.encoders = repeat( num_blocks, lambda lnum: EncoderLayer( attention_dim, DynamicConvolution( conv_wshare, attention_dim, attention_dropout_rate, conv_kernel_length, lnum, use_bias=conv_usebias, ), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after, ), ) elif selfattention_layer_type == "dynamicconv2d": logging.info( "encoder self-attention layer type = dynamic convolution 2-dimentional" ) self.encoders = repeat( num_blocks, lambda lnum: EncoderLayer( attention_dim, DynamicConvolution2D( conv_wshare, attention_dim, attention_dropout_rate, conv_kernel_length, lnum, use_bias=conv_usebias, ), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after, ), ) if self.normalize_before: self.after_norm = LayerNorm(attention_dim)
def __init__( self, idim, pred_into_type, into_type_num, reduce_character_embedding, attention_dim=256, attention_heads=4, conv_wshare=4, conv_kernel_length=11, conv_usebias=False, linear_units=2048, num_blocks=3, dropout_rate=0.2, positional_dropout_rate=0.1, attention_dropout_rate=0.0, pos_enc_class=PositionalEncoding, normalize_before=True, concat_after=False, positionwise_conv_kernel_size=1, padding_idx=-1, elayers=None, eunits=None, ): """Construct an Encoder object.""" super(SentenceEncoder, self).__init__() self.conv_subsampling_factor = 1 self.embed = torch.nn.Sequential( torch.nn.Linear(idim, attention_dim), torch.nn.LayerNorm(attention_dim), torch.nn.Dropout(dropout_rate), torch.nn.ReLU(), pos_enc_class(attention_dim, positional_dropout_rate), ) self.normalize_before = normalize_before positionwise_layer = PositionwiseFeedForward positionwise_layer_args = (attention_dim, linear_units, dropout_rate) self.encoders = repeat( num_blocks, lambda lnum: EncoderLayer( attention_dim, MultiHeadedAttention(attention_heads, attention_dim, attention_dropout_rate), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after, ), ) if self.normalize_before: self.after_norm = LayerNorm(attention_dim) # For reduction self.reduce_character_embedding = reduce_character_embedding self.query = None # For embedding reduction if reduce_character_embedding or pred_into_type: query = torch.nn.Parameter(torch.FloatTensor((attention_dim)), requires_grad=True) self.query = torch.nn.init.uniform_(query) # self.d_k = math.sqrt(eunits) self.K = torch.nn.Linear(attention_dim, attention_dim) # self.V = torch.nn.Linear(eunits, eunits) self.score_dropout = torch.nn.Dropout(p=dropout_rate) # For prediction self.pred_prj = None if pred_into_type: self.pred_prj = torch.nn.Linear(attention_dim, into_type_num)
def __init__( self, languages, odim_dict, selfattention_layer_type="selfattn", attention_dim=256, attention_heads=4, conv_wshare=4, conv_kernel_length=11, conv_usebias=False, linear_units=2048, num_blocks=6, dropout_rate=0.1, positional_dropout_rate=0.1, self_attention_dropout_rate=0.0, src_attention_dropout_rate=0.0, input_layer="embed", use_output_layer=True, pos_enc_class=PositionalEncoding, normalize_before=True, concat_after=False, sim_adapter=False, shared_adapter=False, use_adapters=True, fusion_languages=None, ): super().__init__(1, selfattention_layer_type, attention_dim, attention_heads, conv_wshare, conv_kernel_length, conv_usebias, linear_units, num_blocks, dropout_rate, positional_dropout_rate, self_attention_dropout_rate, src_attention_dropout_rate, input_layer, use_output_layer, pos_enc_class, normalize_before, concat_after) if input_layer == "embed": self.embed = torch.nn.ModuleDict() for lang in odim_dict.keys(): self.embed[lang] = torch.nn.Sequential( torch.nn.Embedding(odim_dict[lang], attention_dim), pos_enc_class(attention_dim, positional_dropout_rate), ) else: raise NotImplementedError("only support embed embedding layer") assert self_attention_dropout_rate == src_attention_dropout_rate if selfattention_layer_type == "selfattn": logging.info("decoder self-attention layer type = self-attention") self.decoders = repeat( num_blocks, lambda lnum: AdaptiveDecoderLayer( languages, attention_dim, MultiHeadedAttention(attention_heads, attention_dim, self_attention_dropout_rate), MultiHeadedAttention(attention_heads, attention_dim, src_attention_dropout_rate), PositionwiseFeedForward(attention_dim, linear_units, dropout_rate), dropout_rate, normalize_before, concat_after, torch.nn.ModuleDict({ "_".join(sorted(fusion_languages)): SimAdapter(attention_dim, self_attention_dropout_rate, fusion_languages) }) if sim_adapter else None, shared_adapter, use_adapters, ), ) else: raise NotImplementedError( "Only support self-attention decoder layer") if use_output_layer: self.output_layer = torch.nn.ModuleDict() for lang in odim_dict.keys(): self.output_layer[lang] = torch.nn.Linear( attention_dim, odim_dict[lang]) else: self.output_layer = None
def __init__( self, languages, idim, selfattention_layer_type="selfattn", attention_dim=256, attention_heads=4, conv_wshare=4, conv_kernel_length=11, conv_usebias=False, linear_units=2048, num_blocks=6, dropout_rate=0.1, positional_dropout_rate=0.1, attention_dropout_rate=0.0, input_layer="conv2d", pos_enc_class=PositionalEncoding, normalize_before=True, concat_after=False, positionwise_layer_type="linear", positionwise_conv_kernel_size=1, padding_idx=-1, sim_adapter=False, shared_adapter=None, use_adapters=True, fusion_languages=None, ): super().__init__(idim, selfattention_layer_type, attention_dim, attention_heads, conv_wshare, conv_kernel_length, conv_usebias, linear_units, num_blocks, dropout_rate, positional_dropout_rate, attention_dropout_rate, input_layer, pos_enc_class, normalize_before, concat_after, positionwise_layer_type, positionwise_conv_kernel_size, padding_idx) positionwise_layer, positionwise_layer_args = self.get_positionwise_layer( positionwise_layer_type, attention_dim, linear_units, dropout_rate, positionwise_conv_kernel_size, ) if selfattention_layer_type == "selfattn": logging.info("encoder self-attention layer type = self-attention") self.encoders = repeat( num_blocks, lambda lnum: AdaptiveEncoderLayer( languages, attention_dim, MultiHeadedAttention(attention_heads, attention_dim, attention_dropout_rate), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after, torch.nn.ModuleDict({ "_".join(sorted(fusion_languages)): SimAdapter(attention_dim, attention_dropout_rate, fusion_languages) }) if sim_adapter else None, shared_adapter, use_adapters, ), ) else: raise NotImplementedError( "Only support self-attention encoder layer")
def __init__( self, input_size: int, output_size: int = 256, attention_heads: int = 4, linear_units: int = 2048, num_blocks: int = 6, dropout_rate: float = 0.1, positional_dropout_rate: float = 0.1, attention_dropout_rate: float = 0.0, input_layer: Optional[str] = "conv2d", pos_enc_class=StreamPositionalEncoding, normalize_before: bool = True, concat_after: bool = False, positionwise_layer_type: str = "linear", positionwise_conv_kernel_size: int = 1, padding_idx: int = -1, block_size: int = 40, hop_size: int = 16, look_ahead: int = 16, init_average: bool = True, ctx_pos_enc: bool = True, ): assert check_argument_types() super().__init__() self._output_size = output_size self.pos_enc = pos_enc_class(output_size, positional_dropout_rate) if input_layer == "linear": self.embed = torch.nn.Sequential( torch.nn.Linear(input_size, output_size), torch.nn.LayerNorm(output_size), torch.nn.Dropout(dropout_rate), torch.nn.ReLU(), ) self.subsample = 1 elif input_layer == "conv2d": self.embed = Conv2dSubsamplingWOPosEnc(input_size, output_size, dropout_rate, kernels=[3, 3], strides=[2, 2]) self.subsample = 4 elif input_layer == "conv2d6": self.embed = Conv2dSubsamplingWOPosEnc(input_size, output_size, dropout_rate, kernels=[3, 5], strides=[2, 3]) self.subsample = 6 elif input_layer == "conv2d8": self.embed = Conv2dSubsamplingWOPosEnc( input_size, output_size, dropout_rate, kernels=[3, 3, 3], strides=[2, 2, 2], ) self.subsample = 8 elif input_layer == "embed": self.embed = torch.nn.Sequential( torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx), ) self.subsample = 1 elif input_layer is None: self.embed = None self.subsample = 1 else: raise ValueError("unknown input_layer: " + input_layer) self.normalize_before = normalize_before if positionwise_layer_type == "linear": positionwise_layer = PositionwiseFeedForward positionwise_layer_args = ( output_size, linear_units, dropout_rate, ) elif positionwise_layer_type == "conv1d": positionwise_layer = MultiLayeredConv1d positionwise_layer_args = ( output_size, linear_units, positionwise_conv_kernel_size, dropout_rate, ) elif positionwise_layer_type == "conv1d-linear": positionwise_layer = Conv1dLinear positionwise_layer_args = ( output_size, linear_units, positionwise_conv_kernel_size, dropout_rate, ) else: raise NotImplementedError("Support only linear or conv1d.") self.encoders = repeat( num_blocks, lambda lnum: ContextualBlockEncoderLayer( output_size, MultiHeadedAttention(attention_heads, output_size, attention_dropout_rate), positionwise_layer(*positionwise_layer_args), dropout_rate, num_blocks, normalize_before, concat_after, ), ) if self.normalize_before: self.after_norm = LayerNorm(output_size) # for block processing self.block_size = block_size self.hop_size = hop_size self.look_ahead = look_ahead self.init_average = init_average self.ctx_pos_enc = ctx_pos_enc
def __init__( self, input_size: int, output_size: int = 256, attention_heads: int = 4, linear_units: int = 2048, num_blocks: int = 6, dropout_rate: float = 0.1, positional_dropout_rate: float = 0.1, attention_dropout_rate: float = 0.0, input_layer: Optional[str] = "conv2d", pos_enc_class=PositionalEncoding, normalize_before: bool = True, concat_after: bool = False, positionwise_layer_type: str = "linear", positionwise_conv_kernel_size: int = 1, padding_idx: int = -1, ): assert check_argument_types() super().__init__() self._output_size = output_size if input_layer == "linear": self.embed = torch.nn.Sequential( torch.nn.Linear(input_size, output_size), torch.nn.LayerNorm(output_size), torch.nn.Dropout(dropout_rate), torch.nn.ReLU(), pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer == "conv2d": self.embed = Conv2dSubsampling(input_size, output_size, dropout_rate) elif input_layer == "conv2d6": self.embed = Conv2dSubsampling6(input_size, output_size, dropout_rate) elif input_layer == "conv2d8": self.embed = Conv2dSubsampling8(input_size, output_size, dropout_rate) elif input_layer == "embed": self.embed = torch.nn.Sequential( torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx), pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer is None: self.embed = torch.nn.Sequential( pos_enc_class(output_size, positional_dropout_rate) ) else: raise ValueError("unknown input_layer: " + input_layer) self.normalize_before = normalize_before if positionwise_layer_type == "linear": positionwise_layer = PositionwiseFeedForward positionwise_layer_args = ( output_size, linear_units, dropout_rate, ) elif positionwise_layer_type == "conv1d": positionwise_layer = MultiLayeredConv1d positionwise_layer_args = ( output_size, linear_units, positionwise_conv_kernel_size, dropout_rate, ) elif positionwise_layer_type == "conv1d-linear": positionwise_layer = Conv1dLinear positionwise_layer_args = ( output_size, linear_units, positionwise_conv_kernel_size, dropout_rate, ) else: raise NotImplementedError("Support only linear or conv1d.") self.encoders = repeat( num_blocks, lambda lnum: EncoderLayer( output_size, MultiHeadedAttention( attention_heads, output_size, attention_dropout_rate ), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after, ), ) if self.normalize_before: self.after_norm = LayerNorm(output_size)
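# Hypothetical construction sketch for the encoder defined above; the enclosing
# class name (TransformerEncoder) is assumed, since it is not visible in this
# excerpt, and the feature dimension is illustrative.
encoder = TransformerEncoder(
    input_size=80,          # e.g. 80-dim log-Mel filterbank features
    output_size=256,
    attention_heads=4,
    num_blocks=6,
    input_layer="conv2d",   # Conv2dSubsampling front-end (4x subsampling)
)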
def __init__(self, idim, attention_dim=256, attention_heads=4, linear_units=2048, num_blocks=6, dropout_rate=0.1, positional_dropout_rate=0.1, attention_dropout_rate=0.0, input_layer="conv2d", pos_enc_class=PositionalEncoding, normalize_before=True, concat_after=False, positionwise_layer_type="linear", positionwise_conv_kernel_size=1, padding_idx=-1): super(Encoder, self).__init__() if input_layer == "linear": self.embed = torch.nn.Sequential( torch.nn.Linear(idim, attention_dim), torch.nn.LayerNorm(attention_dim), torch.nn.Dropout(dropout_rate), torch.nn.ReLU(), pos_enc_class(attention_dim, positional_dropout_rate) ) elif input_layer == "custom": self.embed = EncoderConv2d(idim, attention_dim) elif input_layer == "conv2d": self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate) elif input_layer == "embed": self.embed = torch.nn.Sequential( torch.nn.Embedding(idim, attention_dim, padding_idx=padding_idx), pos_enc_class(attention_dim, positional_dropout_rate) ) elif isinstance(input_layer, torch.nn.Module): self.embed = torch.nn.Sequential( input_layer, pos_enc_class(attention_dim, positional_dropout_rate), ) elif input_layer is None: self.embed = torch.nn.Sequential( pos_enc_class(attention_dim, positional_dropout_rate) ) else: raise ValueError("unknown input_layer: " + input_layer) self.normalize_before = normalize_before if positionwise_layer_type == "linear": positionwise_layer = PositionwiseFeedForward positionwise_layer_args = (attention_dim, linear_units, dropout_rate) elif positionwise_layer_type == "conv1d": positionwise_layer = MultiLayeredConv1d positionwise_layer_args = (attention_dim, linear_units, positionwise_conv_kernel_size, dropout_rate) else: raise NotImplementedError("Support only linear or conv1d.") self.encoders = repeat( num_blocks, lambda: EncoderLayer( attention_dim, MultiHeadedAttention(attention_heads, attention_dim, attention_dropout_rate), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after ) ) if self.normalize_before: self.after_norm = LayerNorm(attention_dim)
def __init__(self, odim, attention_dim=256, attention_heads=4, linear_units=2048, num_blocks=6, dropout_rate=0.1, positional_dropout_rate=0.1, self_attention_dropout_rate=0.0, src_attention_dropout_rate=0.0, input_layer="embed", use_output_layer=True, pos_enc_class=PositionalEncoding, normalize_before=True, concat_after=False, cross_operator=None, cross_shared=False, cross_weight_learnable=False, cross_weight=0.0): """Construct an Decoder object.""" torch.nn.Module.__init__(self) if input_layer == "embed": self.embed = torch.nn.Sequential( torch.nn.Embedding(odim, attention_dim), pos_enc_class(attention_dim, positional_dropout_rate)) elif input_layer == "linear": self.embed = torch.nn.Sequential( torch.nn.Linear(odim, attention_dim), torch.nn.LayerNorm(attention_dim), torch.nn.Dropout(dropout_rate), torch.nn.ReLU(), pos_enc_class(attention_dim, positional_dropout_rate)) elif isinstance(input_layer, torch.nn.Module): self.embed = torch.nn.Sequential( input_layer, pos_enc_class(attention_dim, positional_dropout_rate)) else: raise NotImplementedError( "only `embed` or torch.nn.Module is supported.") self.normalize_before = normalize_before cross_self_attn = None cross_src_attn = None if cross_operator: if 'src_' in cross_operator: # cross_src_attn = MultiHeadedAttention(attention_heads, attention_dim, self_attention_dropout_rate) cross_src_attn = True if 'self_' in cross_operator: if cross_shared and cross_src_attn is not None: # cross_self_attn = cross_src_attn cross_self_attn = True # TODO: backward compatibility for shared self and source else: # cross_self_attn = MultiHeadedAttention(attention_heads, attention_dim, self_attention_dropout_rate) cross_self_attn = True if 'concat' in cross_operator: cross_operator = 'concat' elif 'sum' in cross_operator: cross_operator = 'sum' else: raise NotImplementedError self.decoders = repeat( num_blocks, lambda: DecoderLayer( attention_dim, MultiHeadedAttention(attention_heads, attention_dim, self_attention_dropout_rate), MultiHeadedAttention(attention_heads, attention_dim, src_attention_dropout_rate), PositionwiseFeedForward(attention_dim, linear_units, dropout_rate), dropout_rate, normalize_before, concat_after, cross_self_attn=MultiHeadedAttention( attention_heads, attention_dim, self_attention_dropout_rate ) if cross_self_attn else None, cross_src_attn=MultiHeadedAttention( attention_heads, attention_dim, self_attention_dropout_rate ) if cross_src_attn else None, cross_operator=cross_operator, cross_shared=cross_shared, cross_weight_learnable=cross_weight_learnable, cross_weight=cross_weight)) if self.normalize_before: self.after_norm = LayerNorm(attention_dim) if use_output_layer: self.output_layer = torch.nn.Linear(attention_dim, odim) else: self.output_layer = None
def __init__(self): super().__init__() self.att1 = MultiHeadedAttention(2, 10, 0.0) self.att2 = AttAdd(10, 20, 15) self.desired = defaultdict(list)
def __init__( self, idim, attention_dim=256, attention_heads=4, linear_units=2048, num_blocks=6, dropout_rate=0.1, positional_dropout_rate=0.1, attention_dropout_rate=0.0, input_layer="conv2d", pos_enc_class=PositionalEncoding, normalize_before=True, concat_after=False, positionwise_layer_type="linear", positionwise_conv_kernel_size=1, padding_idx=-1, ): """Construct an Encoder object.""" super(Encoder, self).__init__() if input_layer == "custom": self.embed = EncoderConv2d(idim, attention_dim) elif input_layer == "conv2d": self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate) else: raise ValueError("unknown input_layer: " + input_layer) self.normalize_before = normalize_before if positionwise_layer_type == "linear": positionwise_layer = PositionwiseFeedForward positionwise_layer_args = (attention_dim, linear_units, dropout_rate) elif positionwise_layer_type == "conv1d": positionwise_layer = MultiLayeredConv1d positionwise_layer_args = ( attention_dim, linear_units, positionwise_conv_kernel_size, dropout_rate, ) elif positionwise_layer_type == "conv1d-linear": positionwise_layer = Conv1dLinear positionwise_layer_args = ( attention_dim, linear_units, positionwise_conv_kernel_size, dropout_rate, ) else: raise NotImplementedError("Support only linear or conv1d.") self.encoders = repeat( num_blocks, lambda: EncoderLayer( attention_dim, MultiHeadedAttention(attention_heads, attention_dim, attention_dropout_rate), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after, ), ) if self.normalize_before: self.after_norm = LayerNorm(attention_dim)
def __init__( self, idim, attention_dim=256, attention_heads=4, linear_units=2048, num_blocks_sd=4, num_blocks_rec=8, dropout_rate=0.1, positional_dropout_rate=0.1, attention_dropout_rate=0.0, input_layer="conv2d", pos_enc_class=PositionalEncoding, normalize_before=True, concat_after=False, positionwise_layer_type="linear", positionwise_conv_kernel_size=1, padding_idx=-1, num_spkrs=2, ): """Construct an Encoder object.""" super(EncoderMix, self).__init__( idim=idim, selfattention_layer_type="selfattn", attention_dim=attention_dim, attention_heads=attention_heads, linear_units=linear_units, num_blocks=num_blocks_rec, dropout_rate=dropout_rate, positional_dropout_rate=positional_dropout_rate, attention_dropout_rate=attention_dropout_rate, input_layer=input_layer, pos_enc_class=pos_enc_class, normalize_before=normalize_before, concat_after=concat_after, positionwise_layer_type=positionwise_layer_type, positionwise_conv_kernel_size=positionwise_conv_kernel_size, padding_idx=padding_idx, ) positionwise_layer, positionwise_layer_args = self.get_positionwise_layer( positionwise_layer_type, attention_dim, linear_units, dropout_rate, positionwise_conv_kernel_size, ) self.num_spkrs = num_spkrs self.encoders_sd = torch.nn.ModuleList([ repeat( num_blocks_sd, lambda lnum: EncoderLayer( attention_dim, MultiHeadedAttention(attention_heads, attention_dim, attention_dropout_rate), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after, ), ) for i in range(num_spkrs) ])
def __init__(self, idim, odim, args, ignore_id=-1, blank_id=0): """Construct an E2E object for transducer model.""" torch.nn.Module.__init__(self) if "transformer" in args.etype: if args.enc_block_arch is None: raise ValueError( "Transformer-based blocks in transducer mode should be" "defined individually in the YAML file." "See egs/vivos/asr1/conf/transducer/* for more info.") self.subsample = get_subsample(args, mode="asr", arch="transformer") # 2. use transformer to joint feature maps # transformer without positional encoding self.clayers = repeat( 2, lambda lnum: EncoderLayer( 16, MultiHeadedAttention(4, 16, 0.1), PositionwiseFeedForward(16, 2048, 0.1), dropout_rate=0.1, normalize_before=True, concat_after=False, ), ) self.conv = torch.nn.Sequential( torch.nn.Conv2d(1, 32, kernel_size=(3, 5), stride=(1, 2)), torch.nn.ReLU(), torch.nn.Conv2d(32, 32, kernel_size=(3, 7), stride=(2, 2)), torch.nn.ReLU()) self.encoder = Encoder( idim, args.enc_block_arch, input_layer=args.transformer_enc_input_layer, repeat_block=args.enc_block_repeat, self_attn_type=args.transformer_enc_self_attn_type, positional_encoding_type=args. transformer_enc_positional_encoding_type, positionwise_activation_type=args. transformer_enc_pw_activation_type, conv_mod_activation_type=args. transformer_enc_conv_mod_activation_type, ) encoder_out = self.encoder.enc_out args.eprojs = self.encoder.enc_out self.most_dom_list = args.enc_block_arch[:] else: self.subsample = get_subsample(args, mode="asr", arch="rnn-t") self.enc = encoder_for(args, idim, self.subsample) encoder_out = args.eprojs if "transformer" in args.dtype: if args.dec_block_arch is None: raise ValueError( "Transformer-based blocks in transducer mode should be" "defined individually in the YAML file." "See egs/vivos/asr1/conf/transducer/* for more info.") self.decoder = DecoderTT( odim, encoder_out, args.joint_dim, args.dec_block_arch, input_layer=args.transformer_dec_input_layer, repeat_block=args.dec_block_repeat, joint_activation_type=args.joint_activation_type, positionwise_activation_type=args. 
transformer_dec_pw_activation_type, dropout_rate_embed=args.dropout_rate_embed_decoder, ) if "transformer" in args.etype: self.most_dom_list += args.dec_block_arch[:] else: self.most_dom_list = args.dec_block_arch[:] else: if args.rnnt_mode == "rnnt-att": self.att = att_for(args) self.dec = DecoderRNNTAtt( args.eprojs, odim, args.dtype, args.dlayers, args.dunits, blank_id, self.att, args.dec_embed_dim, args.joint_dim, args.joint_activation_type, args.dropout_rate_decoder, args.dropout_rate_embed_decoder, ) else: self.dec = DecoderRNNT( args.eprojs, odim, args.dtype, args.dlayers, args.dunits, blank_id, args.dec_embed_dim, args.joint_dim, args.joint_activation_type, args.dropout_rate_decoder, args.dropout_rate_embed_decoder, ) if hasattr(self, "most_dom_list"): self.most_dom_dim = sorted( Counter(d["d_hidden"] for d in self.most_dom_list if "d_hidden" in d).most_common(), key=lambda x: x[0], reverse=True, )[0][0] self.etype = args.etype self.dtype = args.dtype self.rnnt_mode = args.rnnt_mode self.sos = odim - 1 self.eos = odim - 1 self.blank_id = blank_id self.ignore_id = ignore_id self.space = args.sym_space self.blank = args.sym_blank self.odim = odim self.reporter = Reporter() self.criterion = TransLoss(args.trans_type, self.blank_id) self.default_parameters(args) if args.report_cer or args.report_wer: from espnet.nets.e2e_asr_common import ErrorCalculatorTransducer if self.dtype == "transformer": decoder = self.decoder else: decoder = self.dec self.error_calculator = ErrorCalculatorTransducer( decoder, args.char_list, args.sym_space, args.sym_blank, args.report_cer, args.report_wer, ) else: self.error_calculator = None self.loss = None self.rnnlm = None
def __init__(self, idim, odim, args, ignore_id=-1): """Construct an E2E object. :param int idim: dimension of inputs :param int odim: dimension of outputs :param Namespace args: argument Namespace containing options """ torch.nn.Module.__init__(self) if args.transformer_attn_dropout_rate is None: args.transformer_attn_dropout_rate = args.dropout_rate self.encoder = Encoder( idim=idim, attention_dim=args.adim, attention_heads=args.aheads, linear_units=args.eunits, num_blocks=args.elayers, input_layer=args.transformer_input_layer, dropout_rate=args.dropout_rate, positional_dropout_rate=args.dropout_rate, attention_dropout_rate=args.transformer_attn_dropout_rate) self.decoder = Decoder( odim=odim, attention_dim=args.adim, attention_heads=args.aheads, linear_units=args.dunits, num_blocks=args.dlayers, dropout_rate=args.dropout_rate, positional_dropout_rate=args.dropout_rate, self_attention_dropout_rate=args.transformer_attn_dropout_rate, src_attention_dropout_rate=args.transformer_attn_dropout_rate) self.sos = odim - 1 self.eos = odim - 1 self.odim = odim self.ignore_id = ignore_id self.subsample = [1] self.reporter = Reporter() # self.lsm_weight = a self.criterion = LabelSmoothingLoss( self.odim, self.ignore_id, args.lsm_weight, args.transformer_length_normalized_loss) # self.verbose = args.verbose self.adim = args.adim self.mtlalpha = args.mtlalpha if args.mtlalpha > 0.0: self.ctc = CTC(odim, args.adim, args.dropout_rate, ctc_type=args.ctc_type, reduce=True) else: self.ctc = None if args.report_cer or args.report_wer: from espnet.nets.e2e_asr_common import ErrorCalculator self.error_calculator = ErrorCalculator(args.char_list, args.sym_space, args.sym_blank, args.report_cer, args.report_wer) else: self.error_calculator = None self.rnnlm = None # yzl23 config self.remove_blank_in_ctc_mode = True # lid multitask related adim = args.adim self.lid_odim = 2 # cn and en # src attention self.lid_src_att = MultiHeadedAttention( args.aheads, args.adim, args.transformer_attn_dropout_rate) # self.lid_output_layer = torch.nn.Sequential(torch.nn.Linear(adim, adim), # torch.nn.Tanh(), # torch.nn.Linear(adim, self.lid_odim)) self.lid_output_layer = torch.nn.Linear(adim, self.lid_odim) # here we hack to use lsm loss, but with lsm_weight ZERO self.lid_criterion = LanguageIDMultitakLoss(self.ignore_id, \ normalize_length=args.transformer_length_normalized_loss) self.lid_mtl_alpha = args.lid_mtl_alpha logging.warning("language id multitask training alpha %f" % (self.lid_mtl_alpha)) self.log_lid_mtl_acc = args.log_lid_mtl_acc # reset parameters self.reset_parameters(args)
def __init__(self, idim, odim, args, ignore_id=-1): """Construct an E2E object. :param int idim: dimension of inputs :param int odim: dimension of outputs :param Namespace args: argument Namespace containing options """ torch.nn.Module.__init__(self) if args.transformer_attn_dropout_rate is None: args.transformer_attn_dropout_rate = args.dropout_rate self.cn_encoder = Encoder( idim=idim, attention_dim=args.adim, attention_heads=args.aheads, linear_units=args.eunits, num_blocks=args.elayers, input_layer=args.transformer_input_layer, dropout_rate=args.dropout_rate, positional_dropout_rate=args.dropout_rate, attention_dropout_rate=args.transformer_attn_dropout_rate) self.en_encoder = Encoder( idim=idim, attention_dim=args.adim, attention_heads=args.aheads, linear_units=args.eunits, num_blocks=args.elayers, input_layer=args.transformer_input_layer, dropout_rate=args.dropout_rate, positional_dropout_rate=args.dropout_rate, attention_dropout_rate=args.transformer_attn_dropout_rate) # gated add module self.vectorize_lambda = args.vectorize_lambda lambda_dim = args.adim if self.vectorize_lambda else 1 self.aggregation_module = torch.nn.Sequential( torch.nn.Linear(2 * args.adim, lambda_dim), torch.nn.Sigmoid()) self.additional_encoder_layer = EncoderLayer( args.adim, MultiHeadedAttention(args.aheads, args.adim, args.transformer_attn_dropout_rate), PositionwiseFeedForward(args.adim, args.eunits, args.dropout_rate), args.dropout_rate, normalize_before=True, concat_after=False) self.additional_after_norm = LayerNorm(args.adim) self.decoder = Decoder( odim=odim, attention_dim=args.adim, attention_heads=args.aheads, linear_units=args.dunits, num_blocks=args.dlayers, dropout_rate=args.dropout_rate, positional_dropout_rate=args.dropout_rate, self_attention_dropout_rate=args.transformer_attn_dropout_rate, src_attention_dropout_rate=args.transformer_attn_dropout_rate) self.sos = odim - 1 self.eos = odim - 1 self.odim = odim self.ignore_id = ignore_id self.subsample = [1] self.reporter = Reporter() # self.lsm_weight = a self.criterion = LabelSmoothingLoss( self.odim, self.ignore_id, args.lsm_weight, args.transformer_length_normalized_loss) # self.verbose = args.verbose self.adim = args.adim self.mtlalpha = args.mtlalpha if args.mtlalpha > 0.0: self.ctc = CTC(odim, args.adim, args.dropout_rate, ctc_type=args.ctc_type, reduce=True) else: self.ctc = None if args.report_cer or args.report_wer: from espnet.nets.e2e_asr_common import ErrorCalculator self.error_calculator = ErrorCalculator(args.char_list, args.sym_space, args.sym_blank, args.report_cer, args.report_wer) else: self.error_calculator = None self.rnnlm = None # yzl23 config self.remove_blank_in_ctc_mode = True self.reset_parameters(args) # reset params at the last logging.warning( "Model total size: {}M, requires_grad size: {}M".format( self.count_parameters(), self.count_parameters(requires_grad=True)))