def __init__(self, idim, time_len=8, mem_len=0, ext_len=0, future_len=0,
             attention_type="memory", attention_dim=256, attention_heads=4,
             linear_units=2048, num_blocks=6, dropout_rate=0.1,
             positional_dropout_rate=0.1, attention_dropout_rate=0.0,
             input_layer="conv2d", pos_enc_class=PositionalEncoding,
             normalize_before=True, concat_after=False):
    """Construct an Encoder with "memory" (Transformer-XL style) or
    "traditional" self-attention blocks.

    :param int idim: input feature dimension
    :param int time_len: segment length used by the memory attention
    :param int mem_len: recurrence-memory length (memory attention only)
    :param int ext_len: extended-context length (memory attention only)
    :param int future_len: number of future frames attended to
    :param str attention_type: "memory" or "traditional"
    :param int attention_dim: model dimension of each block
    :param int attention_heads: number of attention heads
    :param int linear_units: hidden units of the position-wise FFN
    :param int num_blocks: number of encoder blocks
    :raises ValueError: if ``attention_type`` is neither "memory" nor
        "traditional"
    """
    super(Encoder, self).__init__()
    # Stored so _generateInputLayer() can build the front-end from them.
    self.idim = idim
    self.time_len = time_len
    self.future_len = future_len
    self.attention_dim = attention_dim
    self.attention_heads = attention_heads
    self.linear_units = linear_units
    self.dropout_rate = dropout_rate
    self.input_layer = input_layer
    self.normalize_before = normalize_before
    self.concat_after = concat_after
    self.attention_type = attention_type
    self.positional_dropout_rate = positional_dropout_rate
    self.pos_enc_class = pos_enc_class
    self._generateInputLayer()
    if attention_type == "memory":
        self.encoders = repeat(
            num_blocks,
            lambda: EncoderLayerXL(
                n_head=attention_heads,
                d_model=attention_dim,
                d_head=attention_dim // attention_heads,
                ext_len=ext_len,
                mem_len=mem_len,
                future_len=future_len,
                dropout=dropout_rate,
                dropatt=attention_dropout_rate,
                pre_lnorm=normalize_before,
                pos_ff=PositionwiseFeedForward(
                    attention_dim, linear_units, dropout_rate)))
    elif attention_type == "traditional":
        self.encoders = repeat(
            num_blocks,
            lambda: EncoderLayerTD(
                attention_dim,
                MultiHeadedAttention(
                    attention_heads, attention_dim, attention_dropout_rate),
                PositionwiseFeedForward(
                    attention_dim, linear_units, dropout_rate),
                dropout_rate, normalize_before, concat_after))
    else:
        # BUG FIX: the exception was previously constructed but never
        # raised, so an invalid attention_type silently produced an
        # encoder with no ``encoders`` attribute.
        raise ValueError("only memory or traditional can be used")
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
def __init__(
    self,
    odim,
    jdim,
    attention_dim=512,
    attention_heads=4,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    positional_dropout_rate=0.0,
    attention_dropout_rate=0.0,
    input_layer="embed",
    pos_enc_class=PositionalEncoding,
    blank=0,
):
    """Construct a Decoder object for transformer-transducer models.

    ``odim`` is the output vocabulary size and ``jdim`` the dimension of
    the joint space shared with the encoder.
    """
    torch.nn.Module.__init__(self)
    # Build the token front-end; the positional encoding always comes last.
    if input_layer == "embed":
        front = [torch.nn.Embedding(odim, attention_dim)]
    elif input_layer == "linear":
        front = [
            torch.nn.Linear(odim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
        ]
    elif isinstance(input_layer, torch.nn.Module):
        front = [input_layer]
    else:
        raise NotImplementedError("only `embed` or torch.nn.Module is supported.")
    front.append(pos_enc_class(attention_dim, positional_dropout_rate))
    self.embed = torch.nn.Sequential(*front)

    def _build_layer(lnum):
        # Each block: masked self-attention followed by a position-wise FFN.
        return DecoderLayer(
            attention_dim,
            MultiHeadedAttention(
                attention_heads, attention_dim, attention_dropout_rate
            ),
            PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
            dropout_rate,
        )

    self.decoders = repeat(num_blocks, _build_layer)
    self.after_norm = LayerNorm(attention_dim)
    # Joint-network projections into the shared ``jdim`` space.
    self.lin_enc = torch.nn.Linear(attention_dim, jdim)
    self.lin_dec = torch.nn.Linear(attention_dim, jdim, bias=False)
    self.lin_out = torch.nn.Linear(jdim, odim)
    self.attention_dim = attention_dim
    self.odim = odim
    self.blank = blank
def __init__(self, idim, args):
    """Initialize a Transformer encoder configured from ``args``.

    :param int idim: input feature dimension
    :param args: namespace carrying adim/aheads/elayers/eunits and
        dropout settings
    """
    super(Encoder, self).__init__()
    in_type = args.transformer_input_layer
    if in_type == "linear":
        self.input_layer = torch.nn.Sequential(
            torch.nn.Linear(idim, args.adim),
            torch.nn.LayerNorm(args.adim),
            torch.nn.Dropout(args.dropout_rate),
            torch.nn.ReLU(),
            PositionalEncoding(args.adim, args.dropout_rate),
        )
    elif in_type == "conv2d":
        self.input_layer = Conv2dSubsampling(idim, args.adim, args.dropout_rate)
    elif in_type == "embed":
        self.input_layer = torch.nn.Sequential(
            torch.nn.Embedding(idim, args.adim),
            PositionalEncoding(args.adim, args.dropout_rate),
        )
    else:
        raise ValueError("unknown input_layer: " + args.transformer_input_layer)

    def _block():
        return EncoderLayer(
            args.adim,
            MultiHeadedAttention(
                args.aheads, args.adim, args.transformer_attn_dropout_rate),
            PositionwiseFeedForward(args.adim, args.eunits, args.dropout_rate),
            args.dropout_rate,
        )

    self.encoders = repeat(args.elayers, lambda: _block())
    self.norm = LayerNorm(args.adim)
def __init__(
    self,
    vocab_size: int,
    encoder_output_size: int,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    self_attention_dropout_rate: float = 0.0,
    src_attention_dropout_rate: float = 0.0,
    input_layer: str = "embed",
    use_output_layer: bool = True,
    pos_enc_class=PositionalEncoding,
    normalize_before: bool = True,
    concat_after: bool = False,
):
    """Build a Transformer decoder whose width equals the encoder output size."""
    assert check_argument_types()
    super().__init__()
    attention_dim = encoder_output_size
    # Token front-end followed by positional encoding.
    if input_layer == "embed":
        front = [torch.nn.Embedding(vocab_size, attention_dim)]
    elif input_layer == "linear":
        front = [
            torch.nn.Linear(vocab_size, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
        ]
    else:
        raise ValueError(
            f"only 'embed' or 'linear' is supported: {input_layer}")
    front.append(pos_enc_class(attention_dim, positional_dropout_rate))
    self.embed = torch.nn.Sequential(*front)
    self.normalize_before = normalize_before

    def _layer():
        return DecoderLayer(
            attention_dim,
            MultiHeadedAttention(
                attention_heads, attention_dim, self_attention_dropout_rate),
            MultiHeadedAttention(
                attention_heads, attention_dim, src_attention_dropout_rate),
            PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
            dropout_rate,
            normalize_before,
            concat_after,
        )

    self.decoders = repeat(num_blocks, _layer)
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    self.output_layer = (
        torch.nn.Linear(attention_dim, vocab_size) if use_output_layer else None
    )
def __init__(self, idim=83, d_model=256, n_heads=4, d_ffn=2048, layers=6,
             kernel_size=32, dropout_rate=0.1, input_layer="conv2d",
             padding_idx=-1, causal=False):
    """Construct an Encoder with relative positional encoding and
    convolution-augmented blocks.

    :param int idim: input feature dimension
    :param int d_model: model dimension
    :param int n_heads: number of attention heads
    :param int d_ffn: feed-forward hidden size
    :param int layers: number of encoder blocks
    :param int kernel_size: convolution kernel size inside each block
    :param bool causal: whether blocks use causal convolution
    """
    super(Encoder, self).__init__()
    if input_layer == "conv2d":
        self.embed = Conv2dSubsampling(idim, d_model, dropout_rate)
    elif input_layer == "custom":
        self.embed = EncoderConv2d(idim, d_model)
    else:
        raise ValueError("unknown input_layer: " + input_layer)
    # Relative positional embeddings are sized per attention head.
    self.pos_emb = RelativePositionalEncoding(d_model // n_heads, 1000, False)

    def _block():
        return EncoderLayer(
            d_model, n_heads, d_ffn, kernel_size, dropout_rate, causal)

    self.encoders = repeat(layers, lambda: _block())
def __init__(
    self,
    vocab_size: int,
    encoder_output_size: int,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    self_attention_dropout_rate: float = 0.0,
    src_attention_dropout_rate: float = 0.0,
    input_layer: str = "embed",
    use_output_layer: bool = True,
    pos_enc_class=PositionalEncoding,
    normalize_before: bool = True,
    concat_after: bool = False,
    conv_wshare: int = 4,
    conv_kernel_length: Sequence[int] = (11, 11, 11, 11, 11, 11),
    conv_usebias: int = False,
):
    """Build a decoder whose self-attention is replaced by 2-D dynamic
    convolution; ``conv_kernel_length`` gives one kernel size per block.
    """
    assert check_argument_types()
    # One kernel length is required for every block.
    if len(conv_kernel_length) != num_blocks:
        raise ValueError(
            "conv_kernel_length must have equal number of values to num_blocks: "
            f"{len(conv_kernel_length)} != {num_blocks}")
    super().__init__(
        vocab_size=vocab_size,
        encoder_output_size=encoder_output_size,
        dropout_rate=dropout_rate,
        positional_dropout_rate=positional_dropout_rate,
        input_layer=input_layer,
        use_output_layer=use_output_layer,
        pos_enc_class=pos_enc_class,
        normalize_before=normalize_before,
    )
    attention_dim = encoder_output_size

    def _build(lnum):
        conv = DynamicConvolution2D(
            wshare=conv_wshare,
            n_feat=attention_dim,
            dropout_rate=self_attention_dropout_rate,
            kernel_size=conv_kernel_length[lnum],
            use_kernel_mask=True,
            use_bias=conv_usebias,
        )
        src_attn = MultiHeadedAttention(
            attention_heads, attention_dim, src_attention_dropout_rate)
        ff = PositionwiseFeedForward(attention_dim, linear_units, dropout_rate)
        return DecoderLayer(
            attention_dim, conv, src_attn, ff,
            dropout_rate, normalize_before, concat_after,
        )

    # Overwrite the decoders built by the parent class with the
    # convolutional variant.
    self.decoders = repeat(num_blocks, _build)
def __init__(self, odim, attention_dim=256, attention_heads=4,
             linear_units=2048, num_blocks=6, dropout_rate=0.1,
             positional_dropout_rate=0.1, self_attention_dropout_rate=0.0,
             src_attention_dropout_rate=0.0, input_layer="embed",
             use_output_layer=True, pos_enc_class=PositionalEncoding,
             normalize_before=True, concat_after=False,
             moe_att_mode='linear'):
    """Construct a decoder built from HANDecoderLayer blocks, each with one
    self-attention and two source-attention modules combined according to
    ``moe_att_mode``."""
    torch.nn.Module.__init__(self)
    if input_layer == "embed":
        front = [torch.nn.Embedding(odim, attention_dim)]
    elif input_layer == "linear":
        front = [
            torch.nn.Linear(odim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
        ]
    elif isinstance(input_layer, torch.nn.Module):
        front = [input_layer]
    else:
        raise NotImplementedError(
            "only `embed` or torch.nn.Module is supported.")
    front.append(pos_enc_class(attention_dim, positional_dropout_rate))
    self.embed = torch.nn.Sequential(*front)
    self.normalize_before = normalize_before

    def _layer():
        # One self-attention plus two independent source-attention modules;
        # both source attentions share the same dropout configuration.
        self_attn = MultiHeadedAttention(
            attention_heads, attention_dim, self_attention_dropout_rate)
        src_attn_a = MultiHeadedAttention(
            attention_heads, attention_dim, src_attention_dropout_rate)
        src_attn_b = MultiHeadedAttention(
            attention_heads, attention_dim, src_attention_dropout_rate)
        ff = PositionwiseFeedForward(attention_dim, linear_units, dropout_rate)
        return HANDecoderLayer(
            attention_dim, self_attn, src_attn_a, src_attn_b, ff,
            dropout_rate=dropout_rate,
            moe_att_mode=moe_att_mode,
            normalize_before=normalize_before,
            concat_after=concat_after,
        )

    self.decoders = repeat(num_blocks, _layer)
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    self.output_layer = (
        torch.nn.Linear(attention_dim, odim) if use_output_layer else None)
def __init__(
    self,
    vocab_size: int,
    encoder_output_size: int,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    self_attention_dropout_rate: float = 0.0,
    src_attention_dropout_rate: float = 0.0,
    input_layer: str = "embed",
    use_output_layer: bool = True,
    pos_enc_class=PositionalEncoding,
    normalize_before: bool = True,
    concat_after: bool = False,
):
    """Build a standard self-attention/src-attention Transformer decoder on
    top of the shared base-class initialization."""
    assert check_argument_types()
    # The base class sets up the embedding front-end, after-norm and
    # output layer; only the decoder stack is defined here.
    super().__init__(
        vocab_size=vocab_size,
        encoder_output_size=encoder_output_size,
        dropout_rate=dropout_rate,
        positional_dropout_rate=positional_dropout_rate,
        input_layer=input_layer,
        use_output_layer=use_output_layer,
        pos_enc_class=pos_enc_class,
        normalize_before=normalize_before,
    )
    attention_dim = encoder_output_size

    def _build(lnum):
        return DecoderLayer(
            attention_dim,
            MultiHeadedAttention(
                attention_heads, attention_dim, self_attention_dropout_rate),
            MultiHeadedAttention(
                attention_heads, attention_dim, src_attention_dropout_rate),
            PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
            dropout_rate,
            normalize_before,
            concat_after,
        )

    self.decoders = repeat(num_blocks, _build)
def __init__(self, odim, args):
    """Initialize a Transformer decoder configured from ``args``.

    :param int odim: output vocabulary size
    :param args: namespace carrying adim/aheads/dlayers/dunits and
        dropout settings
    """
    super(Decoder, self).__init__()
    adim = args.adim
    self.embed = torch.nn.Sequential(
        torch.nn.Embedding(odim, adim),
        PositionalEncoding(adim, args.dropout_rate),
    )

    def _layer():
        return DecoderLayer(
            adim,
            MultiHeadedAttention(
                args.aheads, adim, args.transformer_attn_dropout_rate),
            MultiHeadedAttention(
                args.aheads, adim, args.transformer_attn_dropout_rate),
            PositionwiseFeedForward(adim, args.dunits, args.dropout_rate),
            args.dropout_rate,
        )

    self.decoders = repeat(args.dlayers, _layer)
    self.output_norm = LayerNorm(adim)
    self.output_layer = torch.nn.Linear(adim, odim)
def __init__(self, idim, attention_dim=256, attention_heads=4,
             linear_units=2048, num_blocks=6, dropout_rate=0.1,
             positional_dropout_rate=0.1, attention_dropout_rate=0.0,
             input_layer="conv2d", pos_enc_class=PositionalEncoding,
             normalize_before=True, concat_after=False):
    """Construct a Transformer encoder with a selectable input front-end
    ("linear", "conv2d", "embed", a custom module, or raise otherwise)."""
    super(Encoder, self).__init__()
    if input_layer == "conv2d":
        # Conv2dSubsampling embeds its own positional encoding.
        self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
    else:
        if input_layer == "linear":
            front = [
                torch.nn.Linear(idim, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
            ]
        elif input_layer == "embed":
            front = [torch.nn.Embedding(idim, attention_dim)]
        elif isinstance(input_layer, torch.nn.Module):
            front = [input_layer]
        else:
            raise ValueError("unknown input_layer: " + input_layer)
        front.append(pos_enc_class(attention_dim, positional_dropout_rate))
        self.embed = torch.nn.Sequential(*front)
    self.normalize_before = normalize_before

    def _layer():
        return EncoderLayer(
            attention_dim,
            MultiHeadedAttention(
                attention_heads, attention_dim, attention_dropout_rate),
            PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
            dropout_rate,
            normalize_before,
            concat_after,
        )

    self.encoders = repeat(num_blocks, _layer)
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
def __init__(self, idim, time_len, attention_dim=256, attention_heads=4,
             linear_units=2048, num_blocks=6, dropout_rate=0.1,
             positional_dropout_rate=0.1, attention_dropout_rate=0.0,
             input_layer="linear", pos_enc_class=None,
             normalize_before=True, concat_after=False):
    """Construct a Transformer-XL style encoder with segment/memory length
    ``time_len``."""
    super(Encoder, self).__init__()
    # Stored so _generateInputLayer() can build the front-end from them.
    self.idim = idim
    self.attention_dim = attention_dim
    self.attention_heads = attention_heads
    self.linear_units = linear_units
    self.dropout_rate = dropout_rate
    self.positional_dropout_rate = positional_dropout_rate
    self.input_layer = input_layer
    self._generateInputLayer()
    self.normalize_before = normalize_before
    self.concat_after = concat_after

    def _layer():
        return EncoderLayer(
            n_head=attention_heads,
            d_model=attention_dim,
            d_head=attention_dim // attention_heads,
            # NOTE(review): d_inner is attention_dim rather than
            # linear_units — preserved as-is, but worth confirming.
            d_inner=attention_dim,
            tgt_len=time_len,
            ext_len=0,
            mem_len=time_len,
            dropout=dropout_rate,
            dropatt=attention_dropout_rate,
            pre_lnorm=normalize_before,
        )

    self.encoders = repeat(num_blocks, _layer)
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
def __init__(self, idim, attention_dim=256, attention_heads=4,
             linear_units=2048, num_blocks=6, dropout_rate=0.1,
             positional_dropout_rate=0.1, attention_dropout_rate=0.0,
             input_layer="conv2d", pos_enc_class=PositionalEncoding,
             normalize_before=True, concat_after=False,
             positionwise_layer_type="linear",
             positionwise_conv_kernel_size=1, padding_idx=-1):
    """Construct a Transformer encoder with selectable front-end and
    position-wise layer ("linear" or "conv1d")."""
    super(Encoder, self).__init__()
    if input_layer == "custom":
        self.embed = EncoderConv2d(idim, attention_dim)
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
    else:
        # Remaining front-ends end with the positional encoding.
        if input_layer == "linear":
            front = [
                torch.nn.Linear(idim, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
            ]
        elif input_layer == "embed":
            front = [
                torch.nn.Embedding(idim, attention_dim, padding_idx=padding_idx)
            ]
        elif isinstance(input_layer, torch.nn.Module):
            front = [input_layer]
        elif input_layer is None:
            front = []
        else:
            raise ValueError("unknown input_layer: " + input_layer)
        front.append(pos_enc_class(attention_dim, positional_dropout_rate))
        self.embed = torch.nn.Sequential(*front)
    self.normalize_before = normalize_before
    if positionwise_layer_type == "linear":
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (attention_dim, linear_units, dropout_rate)
    elif positionwise_layer_type == "conv1d":
        positionwise_layer = MultiLayeredConv1d
        positionwise_layer_args = (attention_dim, linear_units,
                                   positionwise_conv_kernel_size, dropout_rate)
    else:
        raise NotImplementedError("Support only linear or conv1d.")

    def _layer():
        return EncoderLayer(
            attention_dim,
            MultiHeadedAttention(
                attention_heads, attention_dim, attention_dropout_rate),
            positionwise_layer(*positionwise_layer_args),
            dropout_rate,
            normalize_before,
            concat_after,
        )

    self.encoders = repeat(num_blocks, _layer)
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
def __init__(
    self,
    idim,
    attention_dim=256,
    attention_heads=4,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    positional_dropout_rate=0.1,
    attention_dropout_rate=0.0,
    input_layer="conv2d",
    normalize_before=True,
    concat_after=False,
    positionwise_layer_type="linear",
    positionwise_conv_kernel_size=1,
    macaron_style=False,
    pos_enc_layer_type="abs_pos",
    selfattention_layer_type="selfattn",
    activation_type="swish",
    use_cnn_module=False,
    cnn_module_kernel=31,
    padding_idx=-1,
):
    """Construct an Encoder object.

    Conformer-style encoder: builds an input front-end, then ``num_blocks``
    EncoderLayer blocks with configurable self-attention, position-wise
    layer, optional macaron feed-forward and optional convolution module.
    """
    super(Encoder, self).__init__()
    # Activation used inside the feed-forward and convolution modules.
    activation = get_activation(activation_type)
    if pos_enc_layer_type == "abs_pos":
        pos_enc_class = PositionalEncoding
    elif pos_enc_layer_type == "scaled_abs_pos":
        pos_enc_class = ScaledPositionalEncoding
    elif pos_enc_layer_type == "rel_pos":
        # Relative positional encoding requires relative self-attention.
        assert selfattention_layer_type == "rel_selfattn"
        pos_enc_class = RelPositionalEncoding
    else:
        raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
    # Input front-end: maps raw features to attention_dim and applies the
    # positional encoding (Conv2dSubsampling takes it as a constructor arg).
    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(idim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(
            idim,
            attention_dim,
            dropout_rate,
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "vgg2l":
        self.embed = VGG2L(idim, attention_dim)
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer is None:
        self.embed = torch.nn.Sequential(
            pos_enc_class(attention_dim, positional_dropout_rate))
    else:
        raise ValueError("unknown input_layer: " + input_layer)
    self.normalize_before = normalize_before
    # Select the position-wise layer class and its constructor arguments.
    if positionwise_layer_type == "linear":
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (
            attention_dim,
            linear_units,
            dropout_rate,
            activation,
        )
    elif positionwise_layer_type == "conv1d":
        positionwise_layer = MultiLayeredConv1d
        positionwise_layer_args = (
            attention_dim,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    elif positionwise_layer_type == "conv1d-linear":
        positionwise_layer = Conv1dLinear
        positionwise_layer_args = (
            attention_dim,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    else:
        raise NotImplementedError("Support only linear or conv1d.")
    # Select the self-attention class and its constructor arguments.
    if selfattention_layer_type == "selfattn":
        logging.info("encoder self-attention layer type = self-attention")
        encoder_selfattn_layer = MultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads,
            attention_dim,
            attention_dropout_rate,
        )
    elif selfattention_layer_type == "rel_selfattn":
        assert pos_enc_layer_type == "rel_pos"
        encoder_selfattn_layer = RelPositionMultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads,
            attention_dim,
            attention_dropout_rate,
        )
    else:
        raise ValueError("unknown encoder_attn_layer: " + selfattention_layer_type)
    convolution_layer = ConvolutionModule
    convolution_layer_args = (attention_dim, cnn_module_kernel, activation)
    # Each block gets fresh module instances; the macaron feed-forward and
    # convolution module are included only when enabled.
    self.encoders = repeat(
        num_blocks,
        lambda lnum: EncoderLayer(
            attention_dim,
            encoder_selfattn_layer(*encoder_selfattn_layer_args),
            positionwise_layer(*positionwise_layer_args),
            positionwise_layer(*positionwise_layer_args) if macaron_style else None,
            convolution_layer(*convolution_layer_args) if use_cnn_module else None,
            dropout_rate,
            normalize_before,
            concat_after,
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
def __init__(
    self,
    odim,
    selfattention_layer_type="selfattn",
    attention_dim=256,
    attention_heads=4,
    conv_wshare=4,
    conv_kernel_length=11,
    conv_usebias=False,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    positional_dropout_rate=0.1,
    self_attention_dropout_rate=0.0,
    src_attention_dropout_rate=0.0,
    input_layer="embed",
    use_output_layer=True,
    pos_enc_class=PositionalEncoding,
    normalize_before=True,
    concat_after=False,
):
    """Construct an Decoder object.

    The self-attention part of each block is selectable: plain
    self-attention, lightweight convolution (1-D/2-D) or dynamic
    convolution (1-D/2-D); source attention is always multi-headed.
    """
    torch.nn.Module.__init__(self)
    # Allows loading checkpoints whose state-dict keys need rewriting.
    self._register_load_state_dict_pre_hook(_pre_hook)
    if input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(odim, attention_dim),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(odim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(attention_dim, positional_dropout_rate))
    else:
        raise NotImplementedError(
            "only `embed` or torch.nn.Module is supported.")
    self.normalize_before = normalize_before
    if selfattention_layer_type == "selfattn":
        logging.info("decoder self-attention layer type = self-attention")
        # NOTE(review): this branch passes TWO PositionwiseFeedForward
        # instances to DecoderLayer while every convolution branch below
        # passes one — confirm DecoderLayer really takes a second
        # feed-forward here and this is not a duplicated argument.
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     self_attention_dropout_rate),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "lightconv":
        logging.info(
            "decoder self-attention layer type = lightweight convolution")
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                LightweightConvolution(
                    conv_wshare,
                    attention_dim,
                    self_attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_kernel_mask=True,
                    use_bias=conv_usebias,
                ),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "lightconv2d":
        logging.info("decoder self-attention layer "
                     "type = lightweight convolution 2-dimentional")
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                LightweightConvolution2D(
                    conv_wshare,
                    attention_dim,
                    self_attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_kernel_mask=True,
                    use_bias=conv_usebias,
                ),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "dynamicconv":
        logging.info(
            "decoder self-attention layer type = dynamic convolution")
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                DynamicConvolution(
                    conv_wshare,
                    attention_dim,
                    self_attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_kernel_mask=True,
                    use_bias=conv_usebias,
                ),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "dynamicconv2d":
        logging.info(
            "decoder self-attention layer type = dynamic convolution 2-dimentional"
        )
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                DynamicConvolution2D(
                    conv_wshare,
                    attention_dim,
                    self_attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_kernel_mask=True,
                    use_bias=conv_usebias,
                ),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
            # NOTE(review): there is no else branch — an unrecognized
            # selfattention_layer_type leaves self.decoders undefined.
                concat_after,
            ),
        )
    self.selfattention_layer_type = selfattention_layer_type
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    if use_output_layer:
        self.output_layer = torch.nn.Linear(attention_dim, odim)
    else:
        self.output_layer = None
def __init__(
    self,
    input_size: int,
    output_size: int = 256,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    attention_dropout_rate: float = 0.0,
    input_layer: str = "conv2d",
    normalize_before: bool = True,
    concat_after: bool = False,
    positionwise_layer_type: str = "linear",
    positionwise_conv_kernel_size: int = 3,
    macaron_style: bool = False,
    rel_pos_type: str = "legacy",
    pos_enc_layer_type: str = "rel_pos",
    selfattention_layer_type: str = "rel_selfattn",
    activation_type: str = "swish",
    use_cnn_module: bool = True,
    zero_triu: bool = False,
    cnn_module_kernel: int = 31,
    padding_idx: int = -1,
    # NOTE(review): mutable default [] — shared across calls; it is only
    # read here (len/min/max) and stored, but the stored attribute aliases
    # the shared list.
    interctc_layer_idx: List[int] = [],
    interctc_use_conditioning: bool = False,
    stochastic_depth_rate: Union[float, List[float]] = 0.0,
):
    """Construct a Conformer encoder: subsampling/embedding front-end plus
    ``num_blocks`` EncoderLayer blocks with selectable positional encoding,
    self-attention, position-wise layer, optional macaron feed-forward,
    optional convolution module and per-layer stochastic depth."""
    assert check_argument_types()
    super().__init__()
    self._output_size = output_size
    # Map the legacy naming of relative positional encoding/attention onto
    # the legacy classes; "latest" forbids the legacy names.
    if rel_pos_type == "legacy":
        if pos_enc_layer_type == "rel_pos":
            pos_enc_layer_type = "legacy_rel_pos"
        if selfattention_layer_type == "rel_selfattn":
            selfattention_layer_type = "legacy_rel_selfattn"
    elif rel_pos_type == "latest":
        assert selfattention_layer_type != "legacy_rel_selfattn"
        assert pos_enc_layer_type != "legacy_rel_pos"
    else:
        raise ValueError("unknown rel_pos_type: " + rel_pos_type)
    activation = get_activation(activation_type)
    if pos_enc_layer_type == "abs_pos":
        pos_enc_class = PositionalEncoding
    elif pos_enc_layer_type == "scaled_abs_pos":
        pos_enc_class = ScaledPositionalEncoding
    elif pos_enc_layer_type == "rel_pos":
        assert selfattention_layer_type == "rel_selfattn"
        pos_enc_class = RelPositionalEncoding
    elif pos_enc_layer_type == "legacy_rel_pos":
        assert selfattention_layer_type == "legacy_rel_selfattn"
        pos_enc_class = LegacyRelPositionalEncoding
        logging.warning(
            "Using legacy_rel_pos and it will be deprecated in the future."
        )
    else:
        raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
    # Input front-end; the conv2dN variants subsample by different factors.
    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(input_size, output_size),
            torch.nn.LayerNorm(output_size),
            torch.nn.Dropout(dropout_rate),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(
            input_size,
            output_size,
            dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d2":
        self.embed = Conv2dSubsampling2(
            input_size,
            output_size,
            dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d6":
        self.embed = Conv2dSubsampling6(
            input_size,
            output_size,
            dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d8":
        self.embed = Conv2dSubsampling8(
            input_size,
            output_size,
            dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer is None:
        self.embed = torch.nn.Sequential(
            pos_enc_class(output_size, positional_dropout_rate)
        )
    else:
        raise ValueError("unknown input_layer: " + input_layer)
    self.normalize_before = normalize_before
    # Select the position-wise layer class and its constructor arguments.
    if positionwise_layer_type == "linear":
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (
            output_size,
            linear_units,
            dropout_rate,
            activation,
        )
    elif positionwise_layer_type == "conv1d":
        positionwise_layer = MultiLayeredConv1d
        positionwise_layer_args = (
            output_size,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    elif positionwise_layer_type == "conv1d-linear":
        positionwise_layer = Conv1dLinear
        positionwise_layer_args = (
            output_size,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    else:
        raise NotImplementedError("Support only linear or conv1d.")
    # Select the self-attention class and its constructor arguments.
    if selfattention_layer_type == "selfattn":
        encoder_selfattn_layer = MultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads,
            output_size,
            attention_dropout_rate,
        )
    elif selfattention_layer_type == "legacy_rel_selfattn":
        assert pos_enc_layer_type == "legacy_rel_pos"
        encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads,
            output_size,
            attention_dropout_rate,
        )
        logging.warning(
            "Using legacy_rel_selfattn and it will be deprecated in the future."
        )
    elif selfattention_layer_type == "rel_selfattn":
        assert pos_enc_layer_type == "rel_pos"
        encoder_selfattn_layer = RelPositionMultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads,
            output_size,
            attention_dropout_rate,
            zero_triu,
        )
    else:
        raise ValueError("unknown encoder_attn_layer: " + selfattention_layer_type)
    convolution_layer = ConvolutionModule
    convolution_layer_args = (output_size, cnn_module_kernel, activation)
    # Expand a scalar stochastic-depth rate to one value per block.
    if isinstance(stochastic_depth_rate, float):
        stochastic_depth_rate = [stochastic_depth_rate] * num_blocks
    if len(stochastic_depth_rate) != num_blocks:
        raise ValueError(
            f"Length of stochastic_depth_rate ({len(stochastic_depth_rate)}) "
            f"should be equal to num_blocks ({num_blocks})"
        )
    self.encoders = repeat(
        num_blocks,
        lambda lnum: EncoderLayer(
            output_size,
            encoder_selfattn_layer(*encoder_selfattn_layer_args),
            positionwise_layer(*positionwise_layer_args),
            positionwise_layer(*positionwise_layer_args) if macaron_style else None,
            convolution_layer(*convolution_layer_args) if use_cnn_module else None,
            dropout_rate,
            normalize_before,
            concat_after,
            stochastic_depth_rate[lnum],
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(output_size)
    # Intermediate-CTC taps must refer to strictly interior blocks.
    self.interctc_layer_idx = interctc_layer_idx
    if len(interctc_layer_idx) > 0:
        assert 0 < min(interctc_layer_idx) and max(interctc_layer_idx) < num_blocks
    self.interctc_use_conditioning = interctc_use_conditioning
    # Set up later by the training wrapper when conditioning is enabled.
    self.conditioning_layer = None
def __init__(
    self,
    idim,
    pred_into_type,
    into_type_num,
    reduce_character_embedding,
    attention_dim=256,
    attention_heads=4,
    conv_wshare=4,
    conv_kernel_length=11,
    conv_usebias=False,
    linear_units=2048,
    num_blocks=3,
    dropout_rate=0.2,
    positional_dropout_rate=0.1,
    attention_dropout_rate=0.0,
    pos_enc_class=PositionalEncoding,
    normalize_before=True,
    concat_after=False,
    positionwise_conv_kernel_size=1,
    padding_idx=-1,
    elayers=None,
    eunits=None,
):
    """Construct a SentenceEncoder: a small Transformer stack plus optional
    attention pooling (``reduce_character_embedding``) and an optional
    sentence-type prediction head (``pred_into_type``)."""
    super(SentenceEncoder, self).__init__()
    self.conv_subsampling_factor = 1
    self.embed = torch.nn.Sequential(
        torch.nn.Linear(idim, attention_dim),
        torch.nn.LayerNorm(attention_dim),
        torch.nn.Dropout(dropout_rate),
        torch.nn.ReLU(),
        pos_enc_class(attention_dim, positional_dropout_rate),
    )
    self.normalize_before = normalize_before

    def _layer(lnum):
        return EncoderLayer(
            attention_dim,
            MultiHeadedAttention(
                attention_heads, attention_dim, attention_dropout_rate),
            PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
            dropout_rate,
            normalize_before,
            concat_after,
        )

    self.encoders = repeat(num_blocks, _layer)
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    # Attention-pooling machinery for collapsing the per-character sequence
    # into a single embedding.
    self.reduce_character_embedding = reduce_character_embedding
    self.query = None
    if reduce_character_embedding or pred_into_type:
        # torch.FloatTensor(attention_dim) allocates an uninitialized
        # length-attention_dim vector; uniform_ then fills it in place.
        query = torch.nn.Parameter(
            torch.FloatTensor(attention_dim), requires_grad=True)
        self.query = torch.nn.init.uniform_(query)
        self.K = torch.nn.Linear(attention_dim, attention_dim)
        self.score_dropout = torch.nn.Dropout(p=dropout_rate)
    # Optional classification head over sentence types.
    self.pred_prj = None
    if pred_into_type:
        self.pred_prj = torch.nn.Linear(attention_dim, into_type_num)
def __init__(
    self,
    idim,
    attention_dim=256,
    attention_heads=4,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    positional_dropout_rate=0.1,
    attention_dropout_rate=0.0,
    input_layer="conv2d",
    pos_enc_class=PositionalEncoding,
    normalize_before=True,
    concat_after=False,
    positionwise_layer_type="linear",
    positionwise_conv_kernel_size=1,
    padding_idx=-1,
):
    """Construct an Encoder object.

    Only convolutional front-ends are supported ("custom" or "conv2d");
    the position-wise layer is selectable (linear / conv1d / conv1d-linear).
    """
    super(Encoder, self).__init__()
    if input_layer == "conv2d":
        self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
    elif input_layer == "custom":
        self.embed = EncoderConv2d(idim, attention_dim)
    else:
        raise ValueError("unknown input_layer: " + input_layer)
    self.normalize_before = normalize_before
    if positionwise_layer_type == "linear":
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (attention_dim, linear_units, dropout_rate)
    elif positionwise_layer_type == "conv1d":
        positionwise_layer = MultiLayeredConv1d
        positionwise_layer_args = (attention_dim, linear_units,
                                   positionwise_conv_kernel_size, dropout_rate)
    elif positionwise_layer_type == "conv1d-linear":
        positionwise_layer = Conv1dLinear
        positionwise_layer_args = (attention_dim, linear_units,
                                   positionwise_conv_kernel_size, dropout_rate)
    else:
        raise NotImplementedError("Support only linear or conv1d.")

    def _layer():
        return EncoderLayer(
            attention_dim,
            MultiHeadedAttention(
                attention_heads, attention_dim, attention_dropout_rate),
            positionwise_layer(*positionwise_layer_args),
            dropout_rate,
            normalize_before,
            concat_after,
        )

    self.encoders = repeat(num_blocks, _layer)
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
def __init__(
    self,
    input_size: int,
    output_size: int = 256,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    attention_dropout_rate: float = 0.0,
    input_layer: str = "conv2d",
    normalize_before: bool = True,
    concat_after: bool = False,
    positionwise_layer_type: str = "linear",
    positionwise_conv_kernel_size: int = 3,
    macaron_style: bool = False,
    pos_enc_layer_type: str = "rel_pos",
    selfattention_layer_type: str = "rel_selfattn",
    activation_type: str = "swish",
    use_cnn_module: bool = True,
    cnn_module_kernel: int = 31,
    padding_idx: int = -1,
):
    """Construct a Conformer-style encoder.

    Builds an input frontend chosen by ``input_layer``, then a stack of
    ``num_blocks`` encoder layers, each combining self-attention, a
    position-wise feed-forward module (optionally doubled, macaron-style),
    and an optional convolution module.
    """
    assert check_argument_types()
    super().__init__()
    self._output_size = output_size
    activation = get_activation(activation_type)
    # Positional-encoding class; "rel_pos" is only valid together with
    # relative self-attention.
    if pos_enc_layer_type == "abs_pos":
        pos_enc_class = PositionalEncoding
    elif pos_enc_layer_type == "scaled_abs_pos":
        pos_enc_class = ScaledPositionalEncoding
    elif pos_enc_layer_type == "rel_pos":
        assert selfattention_layer_type == "rel_selfattn"
        pos_enc_class = RelPositionalEncoding
    else:
        raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
    # Input frontend: linear projection, conv2d subsampling variants,
    # token embedding, a user-supplied module, or positional encoding only.
    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(input_size, output_size),
            torch.nn.LayerNorm(output_size),
            torch.nn.Dropout(dropout_rate),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(
            input_size,
            output_size,
            dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d6":
        self.embed = Conv2dSubsampling6(
            input_size,
            output_size,
            dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d8":
        self.embed = Conv2dSubsampling8(
            input_size,
            output_size,
            dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer is None:
        self.embed = torch.nn.Sequential(
            pos_enc_class(output_size, positional_dropout_rate))
    else:
        raise ValueError("unknown input_layer: " + input_layer)
    self.normalize_before = normalize_before
    # Position-wise feed-forward configuration; only the "linear" variant
    # receives the configurable activation.
    if positionwise_layer_type == "linear":
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (
            output_size,
            linear_units,
            dropout_rate,
            activation,
        )
    elif positionwise_layer_type == "conv1d":
        positionwise_layer = MultiLayeredConv1d
        positionwise_layer_args = (
            output_size,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    elif positionwise_layer_type == "conv1d-linear":
        positionwise_layer = Conv1dLinear
        positionwise_layer_args = (
            output_size,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    else:
        raise NotImplementedError("Support only linear or conv1d.")
    # Self-attention flavour; relative attention requires "rel_pos"
    # positional encoding (and vice versa, asserted above).
    if selfattention_layer_type == "selfattn":
        encoder_selfattn_layer = MultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads,
            output_size,
            attention_dropout_rate,
        )
    elif selfattention_layer_type == "rel_selfattn":
        assert pos_enc_layer_type == "rel_pos"
        encoder_selfattn_layer = RelPositionMultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads,
            output_size,
            attention_dropout_rate,
        )
    else:
        raise ValueError("unknown encoder_attn_layer: " + selfattention_layer_type)
    convolution_layer = ConvolutionModule
    convolution_layer_args = (output_size, cnn_module_kernel, activation)
    # One EncoderLayer per block; the second feed-forward (macaron) and
    # the convolution module are optional.
    self.encoders = repeat(
        num_blocks,
        lambda lnum: EncoderLayer(
            output_size,
            encoder_selfattn_layer(*encoder_selfattn_layer_args),
            positionwise_layer(*positionwise_layer_args),
            positionwise_layer(*positionwise_layer_args) if macaron_style else None,
            convolution_layer(*convolution_layer_args) if use_cnn_module else None,
            dropout_rate,
            normalize_before,
            concat_after,
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(output_size)
def __init__(self, idim, odim, args, ignore_id=-1, blank_id=0):
    """Construct an E2E object for transducer model.

    Args:
        idim: input feature dimension.
        odim: output vocabulary size; the last index doubles as sos/eos.
        args: parsed configuration namespace (encoder/decoder type,
            block architectures, reporting options, ...).
        ignore_id: label id used for padding / ignored positions.
        blank_id: transducer blank symbol id.
    """
    torch.nn.Module.__init__(self)
    if "transformer" in args.etype:
        # Transformer encoder blocks must be specified explicitly in YAML.
        # NOTE(review): the concatenated message below lacks separating
        # spaces between its parts — left as-is (runtime string).
        if args.enc_block_arch is None:
            raise ValueError(
                "Transformer-based blocks in transducer mode should be"
                "defined individually in the YAML file."
                "See egs/vivos/asr1/conf/transducer/* for more info.")
        self.subsample = get_subsample(args, mode="asr", arch="transformer")
        # Small fixed transformer stack (2 layers, dim 16, no positional
        # encoding) used to fuse feature maps before the conv frontend.
        self.clayers = repeat(
            2,
            lambda lnum: EncoderLayer(
                16,
                MultiHeadedAttention(4, 16, 0.1),
                PositionwiseFeedForward(16, 2048, 0.1),
                dropout_rate=0.1,
                normalize_before=True,
                concat_after=False,
            ),
        )
        # 2-D conv frontend applied to the fused feature maps.
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(1, 32, kernel_size=(3, 5), stride=(1, 2)),
            torch.nn.ReLU(),
            torch.nn.Conv2d(32, 32, kernel_size=(3, 7), stride=(2, 2)),
            torch.nn.ReLU())
        self.encoder = Encoder(
            idim,
            args.enc_block_arch,
            input_layer=args.transformer_enc_input_layer,
            repeat_block=args.enc_block_repeat,
            self_attn_type=args.transformer_enc_self_attn_type,
            positional_encoding_type=args.transformer_enc_positional_encoding_type,
            positionwise_activation_type=args.transformer_enc_pw_activation_type,
            conv_mod_activation_type=args.transformer_enc_conv_mod_activation_type,
        )
        # The encoder decides its own output dim; propagate it into args
        # so downstream components (attention, decoder) see it.
        encoder_out = self.encoder.enc_out
        args.eprojs = self.encoder.enc_out
        # Track block architectures to later derive the dominant hidden dim.
        self.most_dom_list = args.enc_block_arch[:]
    else:
        # RNN encoder path.
        self.subsample = get_subsample(args, mode="asr", arch="rnn-t")
        self.enc = encoder_for(args, idim, self.subsample)
        encoder_out = args.eprojs
    if "transformer" in args.dtype:
        if args.dec_block_arch is None:
            raise ValueError(
                "Transformer-based blocks in transducer mode should be"
                "defined individually in the YAML file."
                "See egs/vivos/asr1/conf/transducer/* for more info.")
        self.decoder = DecoderTT(
            odim,
            encoder_out,
            args.joint_dim,
            args.dec_block_arch,
            input_layer=args.transformer_dec_input_layer,
            repeat_block=args.dec_block_repeat,
            joint_activation_type=args.joint_activation_type,
            positionwise_activation_type=args.transformer_dec_pw_activation_type,
            dropout_rate_embed=args.dropout_rate_embed_decoder,
        )
        # Extend (or start) the block list with decoder architectures.
        if "transformer" in args.etype:
            self.most_dom_list += args.dec_block_arch[:]
        else:
            self.most_dom_list = args.dec_block_arch[:]
    else:
        # RNN decoder, with or without an attention module.
        if args.rnnt_mode == "rnnt-att":
            self.att = att_for(args)
            self.dec = DecoderRNNTAtt(
                args.eprojs,
                odim,
                args.dtype,
                args.dlayers,
                args.dunits,
                blank_id,
                self.att,
                args.dec_embed_dim,
                args.joint_dim,
                args.joint_activation_type,
                args.dropout_rate_decoder,
                args.dropout_rate_embed_decoder,
            )
        else:
            self.dec = DecoderRNNT(
                args.eprojs,
                odim,
                args.dtype,
                args.dlayers,
                args.dunits,
                blank_id,
                args.dec_embed_dim,
                args.joint_dim,
                args.joint_activation_type,
                args.dropout_rate_decoder,
                args.dropout_rate_embed_decoder,
            )
    # Dominant hidden dimension = most frequent "d_hidden" across all
    # transformer blocks (ties broken toward the larger dimension).
    if hasattr(self, "most_dom_list"):
        self.most_dom_dim = sorted(
            Counter(d["d_hidden"] for d in self.most_dom_list
                    if "d_hidden" in d).most_common(),
            key=lambda x: x[0],
            reverse=True,
        )[0][0]
    self.etype = args.etype
    self.dtype = args.dtype
    self.rnnt_mode = args.rnnt_mode
    # sos and eos share the final vocabulary index.
    self.sos = odim - 1
    self.eos = odim - 1
    self.blank_id = blank_id
    self.ignore_id = ignore_id
    self.space = args.sym_space
    self.blank = args.sym_blank
    self.odim = odim
    self.reporter = Reporter()
    self.criterion = TransLoss(args.trans_type, self.blank_id)
    self.default_parameters(args)
    # Optional CER/WER reporting during validation.
    if args.report_cer or args.report_wer:
        from espnet.nets.e2e_asr_common import ErrorCalculatorTransducer
        if self.dtype == "transformer":
            decoder = self.decoder
        else:
            decoder = self.dec
        self.error_calculator = ErrorCalculatorTransducer(
            decoder,
            args.char_list,
            args.sym_space,
            args.sym_blank,
            args.report_cer,
            args.report_wer,
        )
    else:
        self.error_calculator = None
    self.loss = None
    self.rnnlm = None
def __init__(
    self,
    input_size: int,
    output_size: int = 256,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    attention_dropout_rate: float = 0.0,
    input_layer: str = "conv2d",
    normalize_before: bool = True,
    concat_after: bool = False,
    positionwise_layer_type: str = "linear",
    positionwise_conv_kernel_size: int = 3,
    macaron_style: bool = False,
    rel_pos_type: str = "legacy",
    pos_enc_layer_type: str = "abs_pos",
    selfattention_layer_type: str = "lf_selfattn",
    activation_type: str = "swish",
    use_cnn_module: bool = True,
    zero_triu: bool = False,
    cnn_module_kernel: int = 31,
    padding_idx: int = -1,
    # NOTE(review): mutable default arguments below are shared across
    # calls; safe only if never mutated in place — confirm.
    interctc_layer_idx: List[int] = [],
    interctc_use_conditioning: bool = False,
    attention_windows: list = [100, 100, 100, 100, 100, 100],
    attention_dilation: list = [1, 1, 1, 1, 1, 1],
    attention_mode: str = "sliding_chunks",
):
    """Construct an encoder whose self-attention is Longformer attention.

    Per-layer attention windows and dilations must each have exactly
    ``num_blocks`` entries; dilation > 1 requires ``attention_mode="tvm"``.
    """
    assert check_argument_types()
    super().__init__(input_size)
    self._output_size = output_size
    activation = get_activation(activation_type)
    # Only absolute positional encoding is supported with Longformer.
    if pos_enc_layer_type == "abs_pos":
        pos_enc_class = PositionalEncoding
    else:
        raise ValueError("incorrect or unknown pos_enc_layer: " +
                         pos_enc_layer_type + "Use abs_pos")
    # Validate per-layer window/dilation lists against the block count.
    if len(attention_dilation) != num_blocks:
        raise ValueError(
            "incorrect attention_dilation parameter of length" +
            str(len(attention_dilation)) + " does not match num_blocks" +
            str(num_blocks))
    if len(attention_windows) != num_blocks:
        raise ValueError(
            "incorrect attention_windows parameter of length" +
            str(len(attention_windows)) + " does not match num_blocks" +
            str(num_blocks))
    if attention_mode != "tvm" and max(attention_dilation) != 1:
        raise ValueError("incorrect attention mode for dilation: " +
                         attention_mode +
                         "Use attention_mode=tvm with Cuda Kernel")
    # Input frontend selection (same menu as the other encoders here).
    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(input_size, output_size),
            torch.nn.LayerNorm(output_size),
            torch.nn.Dropout(dropout_rate),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(
            input_size,
            output_size,
            dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d2":
        self.embed = Conv2dSubsampling2(
            input_size,
            output_size,
            dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d6":
        self.embed = Conv2dSubsampling6(
            input_size,
            output_size,
            dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d8":
        self.embed = Conv2dSubsampling8(
            input_size,
            output_size,
            dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer is None:
        self.embed = torch.nn.Sequential(
            pos_enc_class(output_size, positional_dropout_rate))
    else:
        raise ValueError("unknown input_layer: " + input_layer)
    self.normalize_before = normalize_before
    # Position-wise feed-forward configuration; only the "linear" variant
    # takes the configurable activation.
    if positionwise_layer_type == "linear":
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (
            output_size,
            linear_units,
            dropout_rate,
            activation,
        )
    elif positionwise_layer_type == "conv1d":
        positionwise_layer = MultiLayeredConv1d
        positionwise_layer_args = (
            output_size,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    elif positionwise_layer_type == "conv1d-linear":
        positionwise_layer = Conv1dLinear
        positionwise_layer_args = (
            output_size,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    else:
        raise NotImplementedError("Support only linear or conv1d.")
    self.selfattention_layer_type = selfattention_layer_type
    if selfattention_layer_type == "lf_selfattn":
        assert pos_enc_layer_type == "abs_pos"
        # Imported lazily so the (optional) longformer dependency is only
        # required when this attention type is actually selected.
        from longformer.longformer import LongformerConfig
        from espnet.nets.pytorch_backend.transformer.longformer_attention import (
            LongformerAttention,
        )
        encoder_selfattn_layer = LongformerAttention
        # NOTE(review): attention_probs_dropout_prob is set from
        # dropout_rate, not attention_dropout_rate — confirm intended.
        config = LongformerConfig(
            attention_window=attention_windows,
            attention_dilation=attention_dilation,
            autoregressive=False,
            num_attention_heads=attention_heads,
            hidden_size=output_size,
            attention_probs_dropout_prob=dropout_rate,
            attention_mode=attention_mode,
        )
        encoder_selfattn_layer_args = (config, )
    else:
        raise ValueError("incompatible or unknown encoder_attn_layer: " +
                         selfattention_layer_type + " Use lf_selfattn")
    convolution_layer = ConvolutionModule
    convolution_layer_args = (output_size, cnn_module_kernel, activation)
    # Each layer receives its own index so LongformerAttention can pick
    # its per-layer window/dilation from the config.
    self.encoders = repeat(
        num_blocks,
        lambda layer_id: EncoderLayer(
            output_size,
            encoder_selfattn_layer(*(encoder_selfattn_layer_args +
                                     (layer_id, ))),
            positionwise_layer(*positionwise_layer_args),
            positionwise_layer(*positionwise_layer_args)
            if macaron_style else None,
            convolution_layer(*convolution_layer_args)
            if use_cnn_module else None,
            dropout_rate,
            normalize_before,
            concat_after,
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(output_size)
    # Intermediate-CTC hook points must lie strictly inside the stack.
    self.interctc_layer_idx = interctc_layer_idx
    if len(interctc_layer_idx) > 0:
        assert 0 < min(interctc_layer_idx) and max(
            interctc_layer_idx) < num_blocks
    self.interctc_use_conditioning = interctc_use_conditioning
    self.conditioning_layer = None
def __init__(self,
             odim,
             attention_dim=256,
             attention_heads=4,
             linear_units=2048,
             num_blocks=6,
             dropout_rate=0.1,
             positional_dropout_rate=0.1,
             self_attention_dropout_rate=0.0,
             src_attention_dropout_rate=0.0,
             input_layer="embed",
             use_output_layer=True,
             pos_enc_class=PositionalEncoding,
             normalize_before=True,
             concat_after=False,
             cross_operator=None,
             cross_shared=False,
             cross_weight_learnable=False,
             cross_weight=0.0):
    """Construct an Decoder object.

    A transformer decoder whose layers may additionally attend across a
    second stream ("cross" attention on the self- and/or source-attention
    path), combined by concat or sum as selected via ``cross_operator``.
    """
    torch.nn.Module.__init__(self)
    # Input embedding: token embedding, linear projection, or a
    # user-supplied module; each followed by positional encoding.
    if input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(odim, attention_dim),
            pos_enc_class(attention_dim, positional_dropout_rate))
    elif input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(odim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate))
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(attention_dim, positional_dropout_rate))
    else:
        raise NotImplementedError(
            "only `embed` or torch.nn.Module is supported.")
    self.normalize_before = normalize_before
    # Decode the cross_operator string into two boolean flags: whether to
    # add cross attention on the self-attention path and/or the
    # source-attention path. The flags only record *that* a module is
    # needed; the modules themselves are created per layer below.
    cross_self_attn = None
    cross_src_attn = None
    if cross_operator:
        if 'src_' in cross_operator:
            cross_src_attn = True
        if 'self_' in cross_operator:
            if cross_shared and cross_src_attn is not None:
                # Shared self/source cross-attention is flagged but each
                # layer still builds its own module below.
                cross_self_attn = True  # TODO: backward compatibility for shared self and source
            else:
                cross_self_attn = True
        # Reduce the operator string to the combination rule.
        if 'concat' in cross_operator:
            cross_operator = 'concat'
        elif 'sum' in cross_operator:
            cross_operator = 'sum'
        else:
            raise NotImplementedError
    # NOTE(review): this zero-argument lambda differs from the
    # `lambda lnum:` builders used elsewhere in this file — confirm the
    # imported `repeat` calls fn() with no arguments.
    # NOTE(review): cross_src_attn below is built with
    # self_attention_dropout_rate, not src_attention_dropout_rate —
    # possible copy-paste; confirm intended.
    self.decoders = repeat(
        num_blocks,
        lambda: DecoderLayer(
            attention_dim,
            MultiHeadedAttention(attention_heads, attention_dim,
                                 self_attention_dropout_rate),
            MultiHeadedAttention(attention_heads, attention_dim,
                                 src_attention_dropout_rate),
            PositionwiseFeedForward(attention_dim, linear_units,
                                    dropout_rate),
            dropout_rate,
            normalize_before,
            concat_after,
            cross_self_attn=MultiHeadedAttention(
                attention_heads, attention_dim, self_attention_dropout_rate
            ) if cross_self_attn else None,
            cross_src_attn=MultiHeadedAttention(
                attention_heads, attention_dim, self_attention_dropout_rate
            ) if cross_src_attn else None,
            cross_operator=cross_operator,
            cross_shared=cross_shared,
            cross_weight_learnable=cross_weight_learnable,
            cross_weight=cross_weight))
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    # Optional projection back to the vocabulary.
    if use_output_layer:
        self.output_layer = torch.nn.Linear(attention_dim, odim)
    else:
        self.output_layer = None
def __init__(
    self,
    idim,
    attention_dim=256,
    attention_heads=4,
    linear_units=2048,
    num_blocks_sd=4,
    num_blocks_rec=8,
    dropout_rate=0.1,
    positional_dropout_rate=0.1,
    attention_dropout_rate=0.0,
    input_layer="conv2d",
    pos_enc_class=PositionalEncoding,
    normalize_before=True,
    concat_after=False,
    positionwise_layer_type="linear",
    positionwise_conv_kernel_size=1,
    padding_idx=-1,
    num_spkrs=2,
):
    """Construct an Encoder object.

    Multi-speaker encoder: the parent class builds the shared
    "recognition" stack (``num_blocks_rec`` layers, stored as
    ``self.encoders``), and this constructor adds one separate
    "speaker-differentiating" stack of ``num_blocks_sd`` layers per
    speaker in ``self.encoders_sd``.
    """
    super(EncoderMix, self).__init__(
        idim=idim,
        selfattention_layer_type="selfattn",
        attention_dim=attention_dim,
        attention_heads=attention_heads,
        linear_units=linear_units,
        num_blocks=num_blocks_rec,
        dropout_rate=dropout_rate,
        positional_dropout_rate=positional_dropout_rate,
        attention_dropout_rate=attention_dropout_rate,
        input_layer=input_layer,
        pos_enc_class=pos_enc_class,
        normalize_before=normalize_before,
        concat_after=concat_after,
        positionwise_layer_type=positionwise_layer_type,
        positionwise_conv_kernel_size=positionwise_conv_kernel_size,
        padding_idx=padding_idx,
    )
    # Same feed-forward configuration as the parent stack.
    positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
        positionwise_layer_type,
        attention_dim,
        linear_units,
        dropout_rate,
        positionwise_conv_kernel_size,
    )
    self.num_spkrs = num_spkrs
    # One independent speaker-differentiating stack per speaker.
    self.encoders_sd = torch.nn.ModuleList([
        repeat(
            num_blocks_sd,
            lambda lnum: EncoderLayer(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     attention_dropout_rate),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        ) for i in range(num_spkrs)
    ])
def __init__(
    self,
    input_size: int,
    output_size: int = 256,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    attention_dropout_rate: float = 0.0,
    input_layer: Optional[str] = "conv2d",
    pos_enc_class=PositionalEncoding,
    normalize_before: bool = True,
    concat_after: bool = False,
    positionwise_layer_type: str = "linear",
    positionwise_conv_kernel_size: int = 1,
    padding_idx: int = -1,
):
    """Construct a transformer encoder.

    Builds the input frontend selected by ``input_layer``, a stack of
    ``num_blocks`` self-attention encoder layers, and a final LayerNorm
    when ``normalize_before`` is enabled.
    """
    assert check_argument_types()
    super().__init__()
    self._output_size = output_size
    self.normalize_before = normalize_before

    # --- input frontend ---------------------------------------------
    if input_layer == "linear":
        frontend = torch.nn.Sequential(
            torch.nn.Linear(input_size, output_size),
            torch.nn.LayerNorm(output_size),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d":
        frontend = Conv2dSubsampling(input_size, output_size, dropout_rate)
    elif input_layer == "conv2d6":
        frontend = Conv2dSubsampling6(input_size, output_size, dropout_rate)
    elif input_layer == "conv2d8":
        frontend = Conv2dSubsampling8(input_size, output_size, dropout_rate)
    elif input_layer == "embed":
        frontend = torch.nn.Sequential(
            torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer is None:
        frontend = torch.nn.Sequential(
            pos_enc_class(output_size, positional_dropout_rate)
        )
    else:
        raise ValueError("unknown input_layer: " + input_layer)
    self.embed = frontend

    # --- position-wise feed-forward configuration -------------------
    if positionwise_layer_type == "linear":
        pw_class = PositionwiseFeedForward
        pw_args = (
            output_size,
            linear_units,
            dropout_rate,
        )
    elif positionwise_layer_type == "conv1d":
        pw_class = MultiLayeredConv1d
        pw_args = (
            output_size,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    elif positionwise_layer_type == "conv1d-linear":
        pw_class = Conv1dLinear
        pw_args = (
            output_size,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    else:
        raise NotImplementedError("Support only linear or conv1d.")

    # --- encoder stack ----------------------------------------------
    self.encoders = repeat(
        num_blocks,
        lambda lnum: EncoderLayer(
            output_size,
            MultiHeadedAttention(
                attention_heads, output_size, attention_dropout_rate
            ),
            pw_class(*pw_args),
            dropout_rate,
            normalize_before,
            concat_after,
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(output_size)
def __init__(
    self,
    idim,
    attention_dim=256,
    attention_heads=4,
    conv_wshare=4,
    conv_kernel_length="11",
    conv_usebias=False,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    positional_dropout_rate=0.1,
    attention_dropout_rate=0.0,
    input_layer="conv2d",
    pos_enc_class=PositionalEncoding,
    normalize_before=True,
    concat_after=False,
    positionwise_layer_type="linear",
    positionwise_conv_kernel_size=1,
    selfattention_layer_type="selfattn",
    padding_idx=-1,
    stochastic_depth_rate=0.0,
    intermediate_layers=None,
):
    """Construct an Encoder object.

    Supports self-attention or (2-D) lightweight/dynamic-convolution
    layers, linearly increasing stochastic depth across the stack, and
    optional intermediate-layer taps. ``conv_kernel_length`` is an
    underscore-separated string of per-layer kernel sizes (e.g. "11_13").
    """
    super(Encoder, self).__init__()
    # Hook that rewrites old checkpoint keys before loading.
    self._register_load_state_dict_pre_hook(_pre_hook)
    # Subsampling factor of the chosen frontend (1 = no subsampling).
    self.conv_subsampling_factor = 1
    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(idim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
        self.conv_subsampling_factor = 4
    elif input_layer == "conv2d-scaled-pos-enc":
        self.embed = Conv2dSubsampling(
            idim,
            attention_dim,
            dropout_rate,
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
        self.conv_subsampling_factor = 4
    elif input_layer == "conv2d6":
        self.embed = Conv2dSubsampling6(idim, attention_dim, dropout_rate)
        self.conv_subsampling_factor = 6
    elif input_layer == "conv2d8":
        self.embed = Conv2dSubsampling8(idim, attention_dim, dropout_rate)
        self.conv_subsampling_factor = 8
    elif input_layer == "vgg2l":
        self.embed = VGG2L(idim, attention_dim)
        self.conv_subsampling_factor = 4
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer is None:
        self.embed = torch.nn.Sequential(
            pos_enc_class(attention_dim, positional_dropout_rate))
    else:
        raise ValueError("unknown input_layer: " + input_layer)
    self.normalize_before = normalize_before
    positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
        positionwise_layer_type,
        attention_dim,
        linear_units,
        dropout_rate,
        positionwise_conv_kernel_size,
    )
    # Build one argument tuple per layer: identical for self-attention,
    # but with a per-layer kernel size for the convolution variants.
    if selfattention_layer_type in [
            "selfattn",
            "rel_selfattn",
            "legacy_rel_selfattn",
    ]:
        logging.info("encoder self-attention layer type = self-attention")
        encoder_selfattn_layer = MultiHeadedAttention
        encoder_selfattn_layer_args = [(
            attention_heads,
            attention_dim,
            attention_dropout_rate,
        )] * num_blocks
    elif selfattention_layer_type == "lightconv":
        logging.info(
            "encoder self-attention layer type = lightweight convolution")
        encoder_selfattn_layer = LightweightConvolution
        encoder_selfattn_layer_args = [(
            conv_wshare,
            attention_dim,
            attention_dropout_rate,
            int(conv_kernel_length.split("_")[lnum]),
            False,
            conv_usebias,
        ) for lnum in range(num_blocks)]
    elif selfattention_layer_type == "lightconv2d":
        logging.info("encoder self-attention layer "
                     "type = lightweight convolution 2-dimensional")
        encoder_selfattn_layer = LightweightConvolution2D
        encoder_selfattn_layer_args = [(
            conv_wshare,
            attention_dim,
            attention_dropout_rate,
            int(conv_kernel_length.split("_")[lnum]),
            False,
            conv_usebias,
        ) for lnum in range(num_blocks)]
    elif selfattention_layer_type == "dynamicconv":
        logging.info(
            "encoder self-attention layer type = dynamic convolution")
        encoder_selfattn_layer = DynamicConvolution
        encoder_selfattn_layer_args = [(
            conv_wshare,
            attention_dim,
            attention_dropout_rate,
            int(conv_kernel_length.split("_")[lnum]),
            False,
            conv_usebias,
        ) for lnum in range(num_blocks)]
    elif selfattention_layer_type == "dynamicconv2d":
        logging.info(
            "encoder self-attention layer type = dynamic convolution 2-dimensional"
        )
        encoder_selfattn_layer = DynamicConvolution2D
        encoder_selfattn_layer_args = [(
            conv_wshare,
            attention_dim,
            attention_dropout_rate,
            int(conv_kernel_length.split("_")[lnum]),
            False,
            conv_usebias,
        ) for lnum in range(num_blocks)]
    else:
        raise NotImplementedError(selfattention_layer_type)
    # Stochastic depth grows linearly with layer index:
    # rate * (lnum + 1) / num_blocks.
    self.encoders = repeat(
        num_blocks,
        lambda lnum: EncoderLayer(
            attention_dim,
            encoder_selfattn_layer(*encoder_selfattn_layer_args[lnum]),
            positionwise_layer(*positionwise_layer_args),
            dropout_rate,
            normalize_before,
            concat_after,
            stochastic_depth_rate * float(1 + lnum) / num_blocks,
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    # Indices of layers whose outputs are additionally exposed.
    self.intermediate_layers = intermediate_layers
def __init__(
    self,
    idim,
    selfattention_layer_type="selfattn",
    attention_dim=256,
    attention_heads=4,
    conv_wshare=4,
    conv_kernel_length=11,
    conv_usebias=False,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    positional_dropout_rate=0.1,
    attention_dropout_rate=0.0,
    input_layer="conv2d",
    pos_enc_class=PositionalEncoding,
    normalize_before=True,
    concat_after=False,
    positionwise_layer_type="linear",
    positionwise_conv_kernel_size=1,
    padding_idx=-1,
):
    """Construct an Encoder object.

    Builds the input frontend, then a stack of ``num_blocks`` layers whose
    token-mixing module is selected by ``selfattention_layer_type``:
    word-scale self-attention, or (2-D) lightweight/dynamic convolution.
    """
    super(Encoder, self).__init__()
    # Hook that rewrites old checkpoint keys before loading.
    self._register_load_state_dict_pre_hook(_pre_hook)
    # Input frontend selection.
    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(idim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
    elif input_layer == "conv2d-scaled-pos-enc":
        self.embed = Conv2dSubsampling(
            idim,
            attention_dim,
            dropout_rate,
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "conv2d6":
        self.embed = Conv2dSubsampling6(idim, attention_dim, dropout_rate)
    elif input_layer == "conv2d8":
        self.embed = Conv2dSubsampling8(idim, attention_dim, dropout_rate)
    elif input_layer == "vgg2l":
        self.embed = VGG2L(idim, attention_dim)
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer is None:
        self.embed = torch.nn.Sequential(
            pos_enc_class(attention_dim, positional_dropout_rate))
    else:
        raise ValueError("unknown input_layer: " + input_layer)
    self.normalize_before = normalize_before
    positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
        positionwise_layer_type,
        attention_dim,
        linear_units,
        dropout_rate,
        positionwise_conv_kernel_size,
    )
    if selfattention_layer_type == "selfattn":
        logging.info("encoder self-attention layer type = self-attention")
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                MultiHeadedAttention_wordscale(
                    attention_heads, attention_dim, attention_dropout_rate
                ),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "lightconv":
        logging.info(
            "encoder self-attention layer type = lightweight convolution")
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                LightweightConvolution(
                    conv_wshare,
                    attention_dim,
                    attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_bias=conv_usebias,
                ),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "lightconv2d":
        logging.info("encoder self-attention layer "
                     "type = lightweight convolution 2-dimentional")
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                LightweightConvolution2D(
                    conv_wshare,
                    attention_dim,
                    attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_bias=conv_usebias,
                ),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "dynamicconv":
        logging.info(
            "encoder self-attention layer type = dynamic convolution")
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                DynamicConvolution(
                    conv_wshare,
                    attention_dim,
                    attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_bias=conv_usebias,
                ),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    elif selfattention_layer_type == "dynamicconv2d":
        logging.info(
            "encoder self-attention layer type = dynamic convolution 2-dimentional"
        )
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                DynamicConvolution2D(
                    conv_wshare,
                    attention_dim,
                    attention_dropout_rate,
                    conv_kernel_length,
                    lnum,
                    use_bias=conv_usebias,
                ),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
    else:
        # Fix: previously an unknown selfattention_layer_type fell through
        # silently, leaving self.encoders undefined and failing later with
        # an AttributeError; fail fast like the sibling encoder does.
        raise NotImplementedError(selfattention_layer_type)
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
def __init__(
    self,
    languages,
    idim,
    selfattention_layer_type="selfattn",
    attention_dim=256,
    attention_heads=4,
    conv_wshare=4,
    conv_kernel_length=11,
    conv_usebias=False,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    positional_dropout_rate=0.1,
    attention_dropout_rate=0.0,
    input_layer="conv2d",
    pos_enc_class=PositionalEncoding,
    normalize_before=True,
    concat_after=False,
    positionwise_layer_type="linear",
    positionwise_conv_kernel_size=1,
    padding_idx=-1,
    sim_adapter=False,
    shared_adapter=None,
    use_adapters=True,
    fusion_languages=None,
):
    """Construct a language-adaptive encoder.

    The base encoder builds the input frontend and a default layer stack;
    ``self.encoders`` is then replaced with AdaptiveEncoderLayer instances
    that carry per-language adapters and an optional SimAdapter fusing the
    given ``fusion_languages``.
    """
    # Delegate frontend/common setup to the base Encoder.
    super().__init__(
        idim,
        selfattention_layer_type,
        attention_dim,
        attention_heads,
        conv_wshare,
        conv_kernel_length,
        conv_usebias,
        linear_units,
        num_blocks,
        dropout_rate,
        positional_dropout_rate,
        attention_dropout_rate,
        input_layer,
        pos_enc_class,
        normalize_before,
        concat_after,
        positionwise_layer_type,
        positionwise_conv_kernel_size,
        padding_idx,
    )
    pw_class, pw_args = self.get_positionwise_layer(
        positionwise_layer_type,
        attention_dim,
        linear_units,
        dropout_rate,
        positionwise_conv_kernel_size,
    )
    if selfattention_layer_type != "selfattn":
        raise NotImplementedError(
            "Only support self-attention encoder layer")
    logging.info("encoder self-attention layer type = self-attention")
    # Note: the SimAdapter ModuleDict is constructed inside the lambda so
    # that every layer owns its own adapter parameters (no sharing).
    self.encoders = repeat(
        num_blocks,
        lambda lnum: AdaptiveEncoderLayer(
            languages,
            attention_dim,
            MultiHeadedAttention(
                attention_heads, attention_dim, attention_dropout_rate
            ),
            pw_class(*pw_args),
            dropout_rate,
            normalize_before,
            concat_after,
            torch.nn.ModuleDict({
                "_".join(sorted(fusion_languages)): SimAdapter(
                    attention_dim, attention_dropout_rate, fusion_languages
                )
            }) if sim_adapter else None,
            shared_adapter,
            use_adapters,
        ),
    )
def __init__(
    self,
    input_size: int,
    output_size: int = 256,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    attention_dropout_rate: float = 0.0,
    input_layer: Optional[str] = "conv2d",
    pos_enc_class=StreamPositionalEncoding,
    normalize_before: bool = True,
    concat_after: bool = False,
    positionwise_layer_type: str = "linear",
    positionwise_conv_kernel_size: int = 1,
    padding_idx: int = -1,
    block_size: int = 40,
    hop_size: int = 16,
    look_ahead: int = 16,
    init_average: bool = True,
    ctx_pos_enc: bool = True,
):
    """Construct a contextual-block (streaming) Transformer encoder.

    Builds the subsampling front-end, the stack of
    ``ContextualBlockEncoderLayer`` blocks, and stores the block-processing
    geometry (block size, hop, look-ahead).
    """
    assert check_argument_types()
    super().__init__()
    self._output_size = output_size
    # Positional encoding is kept separate from the input layer so block
    # processing can apply it on its own schedule.
    self.pos_enc = pos_enc_class(output_size, positional_dropout_rate)

    # --- input / subsampling front-end -----------------------------------
    # input_layer -> (kernels, strides, subsampling factor)
    conv_variants = {
        "conv2d": ([3, 3], [2, 2], 4),
        "conv2d6": ([3, 5], [2, 3], 6),
        "conv2d8": ([3, 3, 3], [2, 2, 2], 8),
    }
    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(input_size, output_size),
            torch.nn.LayerNorm(output_size),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
        )
        self.subsample = 1
    elif input_layer in conv_variants:
        kernels, strides, factor = conv_variants[input_layer]
        self.embed = Conv2dSubsamplingWOPosEnc(
            input_size,
            output_size,
            dropout_rate,
            kernels=kernels,
            strides=strides,
        )
        self.subsample = factor
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
        )
        self.subsample = 1
    elif input_layer is None:
        self.embed = None
        self.subsample = 1
    else:
        raise ValueError("unknown input_layer: " + input_layer)

    self.normalize_before = normalize_before

    # --- position-wise feed-forward module -------------------------------
    if positionwise_layer_type == "linear":
        ff_class = PositionwiseFeedForward
        ff_args = (output_size, linear_units, dropout_rate)
    elif positionwise_layer_type == "conv1d":
        ff_class = MultiLayeredConv1d
        ff_args = (
            output_size,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    elif positionwise_layer_type == "conv1d-linear":
        ff_class = Conv1dLinear
        ff_args = (
            output_size,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    else:
        raise NotImplementedError("Support only linear or conv1d.")

    # --- encoder stack ----------------------------------------------------
    self.encoders = repeat(
        num_blocks,
        lambda lnum: ContextualBlockEncoderLayer(
            output_size,
            MultiHeadedAttention(
                attention_heads, output_size, attention_dropout_rate),
            ff_class(*ff_args),
            dropout_rate,
            num_blocks,
            normalize_before,
            concat_after,
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(output_size)

    # parameters for block processing
    self.block_size = block_size
    self.hop_size = hop_size
    self.look_ahead = look_ahead
    self.init_average = init_average
    self.ctx_pos_enc = ctx_pos_enc
def __init__(
    self,
    languages,
    odim_dict,
    selfattention_layer_type="selfattn",
    attention_dim=256,
    attention_heads=4,
    conv_wshare=4,
    conv_kernel_length=11,
    conv_usebias=False,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    positional_dropout_rate=0.1,
    self_attention_dropout_rate=0.0,
    src_attention_dropout_rate=0.0,
    input_layer="embed",
    use_output_layer=True,
    pos_enc_class=PositionalEncoding,
    normalize_before=True,
    concat_after=False,
    sim_adapter=False,
    shared_adapter=False,
    use_adapters=True,
    fusion_languages=None,
):
    """Construct a multilingual language-adaptive Transformer decoder.

    Replaces the parent's single embedding/output layer with per-language
    ``ModuleDict``s keyed by language, and builds a stack of
    ``AdaptiveDecoderLayer`` blocks carrying per-language adapters.

    :param languages: languages for which per-layer adapters are built
    :param odim_dict: mapping language -> output vocabulary size
    :param sim_adapter: if true, attach a similarity-fusion ``SimAdapter``
        over ``fusion_languages`` to every layer
    :param shared_adapter: adapter shared across languages (passed through)
    :param use_adapters: whether the adapter branches are active
    :param fusion_languages: languages fused by the ``SimAdapter``;
        must be provided when ``sim_adapter`` is true
    """
    # odim=1 is a placeholder: the parent's embedding and output layer are
    # replaced below by the per-language ModuleDicts.
    super().__init__(1, selfattention_layer_type, attention_dim,
                     attention_heads, conv_wshare, conv_kernel_length,
                     conv_usebias, linear_units, num_blocks, dropout_rate,
                     positional_dropout_rate, self_attention_dropout_rate,
                     src_attention_dropout_rate, input_layer,
                     use_output_layer, pos_enc_class, normalize_before,
                     concat_after)
    if input_layer == "embed":
        # One embedding table per target language, sized by that
        # language's vocabulary.
        self.embed = torch.nn.ModuleDict()
        for lang in odim_dict.keys():
            self.embed[lang] = torch.nn.Sequential(
                torch.nn.Embedding(odim_dict[lang], attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
    else:
        raise NotImplementedError("only support embed embedding layer")
    # NOTE(review): validation via assert — stripped under ``python -O``.
    # The SimAdapter below only receives self_attention_dropout_rate, so
    # the two rates are required to agree.
    assert self_attention_dropout_rate == src_attention_dropout_rate
    if selfattention_layer_type == "selfattn":
        logging.info("decoder self-attention layer type = self-attention")
        self.decoders = repeat(
            num_blocks,
            lambda lnum: AdaptiveDecoderLayer(
                languages,
                attention_dim,
                # Masked self-attention over previous target tokens.
                MultiHeadedAttention(attention_heads, attention_dim,
                                     self_attention_dropout_rate),
                # Source (encoder-decoder) attention.
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
                # One SimAdapter per layer, keyed by the sorted fusion
                # languages joined with "_" (e.g. "de_en").
                torch.nn.ModuleDict({
                    "_".join(sorted(fusion_languages)):
                    SimAdapter(attention_dim, self_attention_dropout_rate,
                               fusion_languages)
                }) if sim_adapter else None,
                shared_adapter,
                use_adapters,
            ),
        )
    else:
        raise NotImplementedError(
            "Only support self-attention decoder layer")
    if use_output_layer:
        # One projection per language onto that language's vocabulary.
        self.output_layer = torch.nn.ModuleDict()
        for lang in odim_dict.keys():
            self.output_layer[lang] = torch.nn.Linear(
                attention_dim, odim_dict[lang])
    else:
        self.output_layer = None
def __init__(self, idim, center_len=8, left_len=0, hop_len=0, right_len=0,
             abs_pos=1, rel_pos=0, use_mem=1, att_type="mta",
             attention_dim=256, attention_heads=4, linear_units=2048,
             num_blocks=6, dropout_rate=0.1, positional_dropout_rate=0.1,
             attention_dropout_rate=0.0, input_layer="conv2d",
             pos_enc_class=PositionalEncoding, normalize_before=True,
             concat_after=False):
    """Build a block-processing encoder with Transformer-XL style memory.

    Stores the block geometry (center/left/right context and hop length),
    builds the input front-end via ``_generateInputLayer``, and stacks
    ``num_blocks`` memory-augmented ``EncoderLayer`` modules.
    """
    super(Encoder, self).__init__()

    # Block-processing geometry (in input frames).
    self.idim = idim
    self.center_len = center_len
    self.left_len = left_len
    self.hop_len = hop_len
    self.right_len = right_len

    # Flags arrive as 0/1 integers; normalize them to booleans.
    self.use_mem = use_mem != 0
    self.abs_pos = abs_pos != 0
    self.rel_pos = rel_pos != 0

    # The left context is cached as XL-style memory only when enabled.
    self.mem_len = left_len if self.use_mem else 0

    # Hyper-parameters kept for later reference.
    self.attention_dim = attention_dim
    self.attention_heads = attention_heads
    self.linear_units = linear_units
    self.dropout_rate = dropout_rate
    self.input_layer = input_layer
    self.normalize_before = normalize_before
    self.concat_after = concat_after
    self.positional_dropout_rate = positional_dropout_rate
    self.pos_enc_class = pos_enc_class

    self._generateInputLayer()

    # NOTE(review): ext_len/mem_len are divided by 4, presumably to match
    # the 4x subsampling of the conv2d front-end — confirm against
    # _generateInputLayer.
    self.encoders = repeat(
        num_blocks,
        lambda: EncoderLayer(
            n_head=attention_heads,
            d_model=attention_dim,
            d_head=attention_dim // attention_heads,
            att_type=att_type,
            ext_len=hop_len // 4,
            mem_len=self.mem_len // 4,
            tgt_len=center_len,
            future_len=right_len,
            rel_pos=rel_pos,
            dropout=dropout_rate,
            dropatt=attention_dropout_rate,
            pre_lnorm=normalize_before,
            pos_ff=PositionwiseFeedForward(
                attention_dim, linear_units, dropout_rate),
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
def __init__(
    self,
    input_size: int,
    output_size: int = 256,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    attention_dropout_rate: float = 0.0,
    input_layer: Optional[str] = None,
    normalize_before: bool = True,
    concat_after: bool = False,
    positionwise_layer_type: str = "linear",
    positionwise_conv_kernel_size: int = 1,
    pos_enc_layer_type: str = "rel_pos",
    selfattention_layer_type: str = "rel_selfattn",
    activation_type='relu',
    padding_idx: int = -1,
):
    """Construct a Transformer encoder with selectable positional encoding.

    Supports absolute, scaled-absolute, or relative positional encoding;
    ``rel_pos`` must be paired with the ``rel_selfattn`` attention layer
    (enforced by asserts in both selection branches below).
    """
    assert check_argument_types()
    super().__init__()
    self._output_size = output_size

    # Positional-encoding class selection.
    # todo: my change, from conformer/encoder_layer.py
    if pos_enc_layer_type == "abs_pos":
        pos_enc_class = PositionalEncoding
    elif pos_enc_layer_type == "scaled_abs_pos":
        pos_enc_class = ScaledPositionalEncoding
    elif pos_enc_layer_type == "rel_pos":
        # Relative encoding only works with relative self-attention.
        assert selfattention_layer_type == "rel_selfattn"
        pos_enc_class = RelPositionalEncoding
    else:
        raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)

    # input layer
    # NOTE(review): the conv2d* branches are not given pos_enc_class, so
    # they use whatever positional encoding they construct internally —
    # confirm this is intended when pos_enc_layer_type == "rel_pos".
    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(input_size, output_size),
            torch.nn.LayerNorm(output_size),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(input_size, output_size, dropout_rate)
    elif input_layer == "conv2d6":
        self.embed = Conv2dSubsampling6(input_size, output_size, dropout_rate)
    elif input_layer == "conv2d8":
        self.embed = Conv2dSubsampling8(input_size, output_size, dropout_rate)
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(input_size, output_size,
                               padding_idx=padding_idx),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer is None:
        # No embedding: apply positional encoding directly to the input.
        self.embed = torch.nn.Sequential(
            pos_enc_class(output_size, positional_dropout_rate)
        )
    else:
        raise ValueError("unknown input_layer: " + input_layer)
    self.normalize_before = normalize_before

    # position-wise layer
    activation = get_activation(activation_type)
    if positionwise_layer_type == "linear":
        positionwise_layer = PositionwiseFeedForward
        # NOTE(review): only the linear feed-forward receives the custom
        # activation; the conv1d variants fall back to their own default.
        positionwise_layer_args = (
            output_size,
            linear_units,
            dropout_rate,
            activation,
        )
    elif positionwise_layer_type == "conv1d":
        positionwise_layer = MultiLayeredConv1d
        positionwise_layer_args = (
            output_size,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    elif positionwise_layer_type == "conv1d-linear":
        positionwise_layer = Conv1dLinear
        positionwise_layer_args = (
            output_size,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    else:
        raise NotImplementedError("Support only linear or conv1d.")

    # encoders type and args
    if selfattention_layer_type == "selfattn":
        encoder_selfattn_layer = MultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads,
            output_size,
            attention_dropout_rate,
        )
    elif selfattention_layer_type == "rel_selfattn":
        # Mirror of the assert in the pos-enc branch above.
        assert pos_enc_layer_type == "rel_pos"
        encoder_selfattn_layer = RelPositionMultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads,
            output_size,
            attention_dropout_rate,
        )
    else:
        raise ValueError("unknown encoder_attn_layer: " +
                         selfattention_layer_type)

    # encoders
    self.encoders = repeat(
        num_blocks,
        lambda lnum: EncoderLayer(
            output_size,
            encoder_selfattn_layer(*encoder_selfattn_layer_args),
            positionwise_layer(*positionwise_layer_args),
            dropout_rate,
            normalize_before,
            concat_after,
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(output_size)