def build_conformer_block(
    block: Dict[str, Any],
    self_attn_class: str,
    pw_layer_type: str,
    pw_activation_type: str,
    conv_mod_activation_type: str,
) -> ConformerEncoderLayer:
    """Build function for conformer block.

    Args:
        block: Conformer block parameters.
        self_attn_class: Self-attention module class.
        pw_layer_type: Positionwise layer type.
        pw_activation_type: Positionwise activation type.
        conv_mod_activation_type: Convolutional module activation type.

    Returns:
        : Function to create conformer (encoder) block.

    """
    d_hidden = block["d_hidden"]
    d_ff = block["d_ff"]

    dropout_rate = block.get("dropout-rate", 0.0)
    pos_dropout_rate = block.get("pos-dropout-rate", 0.0)
    att_dropout_rate = block.get("att-dropout-rate", 0.0)

    if pw_layer_type == "linear":
        pw_layer = PositionwiseFeedForward(
            d_hidden, d_ff, pos_dropout_rate, get_activation(pw_activation_type)
        )
    else:
        raise NotImplementedError("Conformer block only supports linear yet.")

    macaron_net = (
        PositionwiseFeedForward(
            d_hidden, d_ff, pos_dropout_rate, get_activation(pw_activation_type)
        )
        if block["macaron_style"]
        else None
    )

    conv_mod = (
        ConvolutionModule(
            d_hidden, block["conv_mod_kernel"], get_activation(conv_mod_activation_type)
        )
        if block["use_conv_mod"]
        else None
    )

    return lambda: ConformerEncoderLayer(
        d_hidden,
        self_attn_class(block["heads"], d_hidden, att_dropout_rate),
        pw_layer,
        macaron_net,
        conv_mod,
        dropout_rate,
    )
def build_conformer_block(
    block_arch,
    self_attn_class,
    pos_enc_class,
    pw_layer_type,
    pw_activation_type,
    conv_mod_activation_type,
):
    """Build function for conformer block.

    Args:
        block_arch (dict): conformer block parameters
        self_attn_class (str): self-attention module class
        pos_enc_class (str): positional encoding class
        pw_layer_type (str): positionwise layer type
        pw_activation_type (str): positionwise activation type
        conv_mod_activation_type (str): convolutional module activation type

    Returns:
        (function): function to create conformer block

    """
    d_hidden = block_arch["d_hidden"]
    d_ff = block_arch["d_ff"]
    heads = block_arch["heads"]

    macaron_style = block_arch["macaron_style"]
    use_conv_mod = block_arch["use_conv_mod"]

    dropout_rate = block_arch["dropout-rate"] if "dropout-rate" in block_arch else 0.0
    pos_dropout_rate = (
        block_arch["pos-dropout-rate"] if "pos-dropout-rate" in block_arch else 0.0
    )
    att_dropout_rate = (
        block_arch["att-dropout-rate"] if "att-dropout-rate" in block_arch else 0.0
    )

    if pw_layer_type == "linear":
        pw_layer = PositionwiseFeedForward
        pw_activation = get_activation(pw_activation_type)
        pw_layer_args = (d_hidden, d_ff, pos_dropout_rate, pw_activation)
    else:
        raise NotImplementedError("Conformer block only supports linear yet.")

    if use_conv_mod:
        conv_layer = ConvolutionModule
        conv_activation = get_activation(conv_mod_activation_type)
        conv_layers_args = (d_hidden, block_arch["conv_mod_kernel"], conv_activation)

    return lambda: ConformerEncoderLayer(
        d_hidden,
        self_attn_class(heads, d_hidden, att_dropout_rate),
        pw_layer(*pw_layer_args),
        pw_layer(*pw_layer_args) if macaron_style else None,
        conv_layer(*conv_layers_args) if use_conv_mod else None,
        dropout_rate,
    )
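# Hedged usage sketch (not part of the snippets above): both builders return a
# zero-argument factory, so a stack of blocks is obtained by calling it once per
# layer. The config keys follow the second builder above; passing
# MultiHeadedAttention as the self-attention class, PositionalEncoding as the
# (here unused) positional-encoding class, and the "swish" activations are
# illustrative assumptions, with all names assumed to be in scope as in the
# surrounding snippets.
example_block_arch = {
    "d_hidden": 256,
    "d_ff": 1024,
    "heads": 4,
    "macaron_style": True,
    "use_conv_mod": True,
    "conv_mod_kernel": 31,
}
build_fn = build_conformer_block(
    example_block_arch,
    MultiHeadedAttention,
    PositionalEncoding,
    "linear",
    "swish",
    "swish",
)
conformer_layers = torch.nn.ModuleList([build_fn() for _ in range(6)])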
def __init__(
    self,
    size: int,
    kernel_size: int,
    dropout_rate: float,
    use_linear_after_conv: bool,
    gate_activation: str,
):
    super().__init__()

    n_channels = size // 2  # split input channels
    self.norm = LayerNorm(n_channels)
    self.conv = torch.nn.Conv1d(
        n_channels,
        n_channels,
        kernel_size,
        1,
        (kernel_size - 1) // 2,
        groups=n_channels,
    )
    if use_linear_after_conv:
        self.linear = torch.nn.Linear(n_channels, n_channels)
    else:
        self.linear = None

    if gate_activation == "identity":
        self.act = torch.nn.Identity()
    else:
        self.act = get_activation(gate_activation)

    self.dropout = torch.nn.Dropout(dropout_rate)
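# Hedged sketch (an assumption, not the module's verbatim forward) of how the
# gating unit initialized above is typically applied: the input is split
# channel-wise into a residual half and a gate half; the gate half is
# layer-normalized, depthwise-convolved along time, optionally projected,
# activated, and then used to gate the residual half.
def _gating_forward_sketch(self, x):
    # x: (batch, time, size); the module was built with n_channels = size // 2
    x_r, x_g = x.chunk(2, dim=-1)
    x_g = self.norm(x_g)
    x_g = self.conv(x_g.transpose(1, 2)).transpose(1, 2)  # Conv1d expects (B, C, T)
    if self.linear is not None:
        x_g = self.linear(x_g)
    x_g = self.act(x_g)
    return x_r * self.dropout(x_g)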
def __init__(
    self,
    eprojs,
    odim,
    dtype,
    dlayers,
    dunits,
    blank,
    att,
    embed_dim,
    joint_dim,
    joint_activation_type="tanh",
    dropout=0.0,
    dropout_embed=0.0,
):
    """Transducer with attention initializer."""
    super(DecoderRNNTAtt, self).__init__()

    self.embed = torch.nn.Embedding(odim, embed_dim, padding_idx=blank)
    self.dropout_emb = torch.nn.Dropout(p=dropout_embed)

    if dtype == "lstm":
        dec_net = torch.nn.LSTMCell
    else:
        dec_net = torch.nn.GRUCell

    self.decoder = torch.nn.ModuleList([dec_net((embed_dim + eprojs), dunits)])
    self.dropout_dec = torch.nn.ModuleList([torch.nn.Dropout(p=dropout)])

    for _ in range(1, dlayers):
        self.decoder += [dec_net(dunits, dunits)]
        self.dropout_dec += [torch.nn.Dropout(p=dropout)]

    self.lin_enc = torch.nn.Linear(eprojs, joint_dim)
    self.lin_dec = torch.nn.Linear(dunits, joint_dim, bias=False)
    self.lin_out = torch.nn.Linear(joint_dim, odim)

    self.joint_activation = get_activation(joint_activation_type)

    self.att = att

    self.dtype = dtype
    self.dlayers = dlayers
    self.dunits = dunits
    self.embed_dim = embed_dim
    self.joint_dim = joint_dim
    self.odim = odim

    self.ignore_id = -1
    self.blank = blank
def __init__(
    self,
    vocab_size: int,
    encoder_output_size: int,
    hidden_size: int,
    joint_space_size: int,
    joint_activation_type: str,
):
    """Joint network initializer."""
    super().__init__()

    self.lin_enc = torch.nn.Linear(encoder_output_size, joint_space_size)
    self.lin_dec = torch.nn.Linear(hidden_size, joint_space_size, bias=False)
    self.lin_out = torch.nn.Linear(joint_space_size, vocab_size)

    self.joint_activation = get_activation(joint_activation_type)
def __init__(
    self,
    joint_output_size: int,
    encoder_output_size: int,
    decoder_output_size: int,
    joint_space_size: int = 256,
    joint_activation_type: str = "tanh",
):
    """Joint network initializer."""
    super().__init__()

    self.lin_enc = torch.nn.Linear(encoder_output_size, joint_space_size)
    self.lin_dec = torch.nn.Linear(decoder_output_size, joint_space_size)
    self.lin_out = torch.nn.Linear(joint_space_size, joint_output_size)

    self.joint_activation = get_activation(joint_activation_type)
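# Hedged sketch (an assumption, not the library's verbatim code) of the joint
# computation that the lin_enc/lin_dec/lin_out layers in both initializers above
# are set up for: project encoder and decoder states into the joint space, add
# them, apply the configured activation, and project to the output size.
def _joint_sketch(self, enc_out, dec_out):
    joint = self.joint_activation(self.lin_enc(enc_out) + self.lin_dec(dec_out))
    return self.lin_out(joint)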
def build_transformer_block(
    net_part: str,
    block_arch: Dict,
    pw_layer_type: str,
    pw_activation_type: str,
) -> Union[EncoderLayer, TransformerDecoderLayer]:
    """Build function for transformer block.

    Args:
        net_part: Network part, either 'encoder' or 'decoder'.
        block_arch: Transformer block parameters.
        pw_layer_type: Positionwise layer type.
        pw_activation_type: Positionwise activation type.

    Returns:
        : Function to create transformer (encoder or decoder) block.

    """
    d_hidden = block_arch["d_hidden"]
    d_ff = block_arch["d_ff"]
    heads = block_arch["heads"]

    dropout_rate = block_arch["dropout-rate"] if "dropout-rate" in block_arch else 0.0
    pos_dropout_rate = (
        block_arch["pos-dropout-rate"] if "pos-dropout-rate" in block_arch else 0.0
    )
    att_dropout_rate = (
        block_arch["att-dropout-rate"] if "att-dropout-rate" in block_arch else 0.0
    )

    if pw_layer_type == "linear":
        pw_layer = PositionwiseFeedForward
        pw_activation = get_activation(pw_activation_type)
        pw_layer_args = (d_hidden, d_ff, pos_dropout_rate, pw_activation)
    else:
        raise NotImplementedError("Transformer block only supports linear yet.")

    if net_part == "encoder":
        transformer_layer_class = EncoderLayer
    elif net_part == "decoder":
        transformer_layer_class = TransformerDecoderLayer

    return lambda: transformer_layer_class(
        d_hidden,
        MultiHeadedAttention(heads, d_hidden, att_dropout_rate),
        pw_layer(*pw_layer_args),
        dropout_rate,
    )
def build_transformer_block(net_part, block_arch, pw_layer_type, pw_activation_type):
    """Build function for transformer block.

    Args:
        net_part (str): either 'encoder' or 'decoder'
        block_arch (dict): transformer block parameters
        pw_layer_type (str): positionwise layer type
        pw_activation_type (str): positionwise activation type

    Returns:
        (function): function to create transformer block

    """
    d_hidden = block_arch["d_hidden"]
    d_ff = block_arch["d_ff"]
    heads = block_arch["heads"]

    dropout_rate = block_arch["dropout-rate"] if "dropout-rate" in block_arch else 0.0
    pos_dropout_rate = (
        block_arch["pos-dropout-rate"] if "pos-dropout-rate" in block_arch else 0.0
    )
    att_dropout_rate = (
        block_arch["att-dropout-rate"] if "att-dropout-rate" in block_arch else 0.0
    )

    if pw_layer_type == "linear":
        pw_layer = PositionwiseFeedForward
        pw_activation = get_activation(pw_activation_type)
        pw_layer_args = (d_hidden, d_ff, pos_dropout_rate, pw_activation)
    else:
        raise NotImplementedError("Transformer block only supports linear yet.")

    if net_part == "encoder":
        transformer_layer_class = EncoderLayer
    elif net_part == "decoder":
        transformer_layer_class = DecoderLayer

    return lambda: transformer_layer_class(
        d_hidden,
        MultiHeadedAttention(heads, d_hidden, att_dropout_rate),
        pw_layer(*pw_layer_args),
        dropout_rate,
    )
def build_transformer_block(
    net_part: str,
    block: Dict[str, Any],
    pw_layer_type: str,
    pw_activation_type: str,
) -> Union[EncoderLayer, TransformerDecoderLayer]:
    """Build function for transformer block.

    Args:
        net_part: Network part, either 'encoder' or 'decoder'.
        block: Transformer block parameters.
        pw_layer_type: Positionwise layer type.
        pw_activation_type: Positionwise activation type.

    Returns:
        : Function to create transformer (encoder or decoder) block.

    """
    d_hidden = block["d_hidden"]

    dropout_rate = block.get("dropout-rate", 0.0)
    pos_dropout_rate = block.get("pos-dropout-rate", 0.0)
    att_dropout_rate = block.get("att-dropout-rate", 0.0)

    if pw_layer_type != "linear":
        raise NotImplementedError(
            "Transformer block only supports linear pointwise layer."
        )

    if net_part == "encoder":
        transformer_layer_class = EncoderLayer
    elif net_part == "decoder":
        transformer_layer_class = TransformerDecoderLayer

    return lambda: transformer_layer_class(
        d_hidden,
        MultiHeadedAttention(block["heads"], d_hidden, att_dropout_rate),
        PositionwiseFeedForward(
            d_hidden,
            block["d_ff"],
            pos_dropout_rate,
            get_activation(pw_activation_type),
        ),
        dropout_rate,
    )
def __init__(
    self,
    odim,
    edim,
    jdim,
    dec_arch,
    input_layer="embed",
    repeat_block=0,
    joint_activation_type="tanh",
    positional_encoding_type="abs_pos",
    positionwise_layer_type="linear",
    positionwise_activation_type="relu",
    dropout_rate_embed=0.0,
    blank=0,
):
    """Construct a Decoder object for transformer-transducer models."""
    torch.nn.Module.__init__(self)

    self.embed, self.decoders, ddim = build_blocks(
        "decoder",
        odim,
        input_layer,
        dec_arch,
        repeat_block=repeat_block,
        positional_encoding_type=positional_encoding_type,
        positionwise_layer_type=positionwise_layer_type,
        positionwise_activation_type=positionwise_activation_type,
        dropout_rate_embed=dropout_rate_embed,
        padding_idx=blank,
    )

    self.after_norm = LayerNorm(ddim)

    self.lin_enc = torch.nn.Linear(edim, jdim)
    self.lin_dec = torch.nn.Linear(ddim, jdim, bias=False)
    self.lin_out = torch.nn.Linear(jdim, odim)

    self.joint_activation = get_activation(joint_activation_type)

    self.dunits = ddim
    self.odim = odim

    self.blank = blank
def __init__(
    self,
    rnn_type,
    input_size,
    att_heads,
    hidden_size,
    dropout=0.0,
    activation="relu",
    bidirectional=True,
    norm="gLN",
):
    super().__init__()

    rnn_type = rnn_type.upper()
    assert rnn_type in [
        "RNN",
        "LSTM",
        "GRU",
    ], f"Only support 'RNN', 'LSTM' and 'GRU', current type: {rnn_type}"
    self.rnn_type = rnn_type
    self.att_heads = att_heads
    self.self_attn = nn.MultiheadAttention(input_size, att_heads, dropout=dropout)
    self.dropout = nn.Dropout(p=dropout)
    self.norm_attn = choose_norm(norm, input_size)

    self.rnn = getattr(nn, rnn_type)(
        input_size,
        hidden_size,
        1,
        batch_first=True,
        bidirectional=bidirectional,
    )

    activation = get_activation(activation)
    hdim = 2 * hidden_size if bidirectional else hidden_size
    self.feed_forward = nn.Sequential(
        activation, nn.Dropout(p=dropout), nn.Linear(hdim, input_size)
    )
    self.norm_ff = choose_norm(norm, input_size)
def __init__(
    self,
    idim,
    attention_dim=256,
    attention_heads=4,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    positional_dropout_rate=0.1,
    attention_dropout_rate=0.0,
    input_layer="conv2d",
    normalize_before=True,
    concat_after=False,
    positionwise_layer_type="linear",
    positionwise_conv_kernel_size=1,
    macaron_style=False,
    pos_enc_layer_type="abs_pos",
    selfattention_layer_type="selfattn",
    activation_type="swish",
    use_cnn_module=False,
    cnn_module_kernel=31,
    padding_idx=-1,
):
    """Construct an Encoder object."""
    super(Encoder, self).__init__()

    activation = get_activation(activation_type)
    if pos_enc_layer_type == "abs_pos":
        pos_enc_class = PositionalEncoding
    elif pos_enc_layer_type == "scaled_abs_pos":
        pos_enc_class = ScaledPositionalEncoding
    elif pos_enc_layer_type == "rel_pos":
        assert selfattention_layer_type == "rel_selfattn"
        pos_enc_class = RelPositionalEncoding
    else:
        raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)

    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(idim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(
            idim, attention_dim, dropout_rate,
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "vgg2l":
        self.embed = VGG2L(idim, attention_dim)
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer is None:
        self.embed = torch.nn.Sequential(
            pos_enc_class(attention_dim, positional_dropout_rate)
        )
    else:
        raise ValueError("unknown input_layer: " + input_layer)

    self.normalize_before = normalize_before

    if positionwise_layer_type == "linear":
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (attention_dim, linear_units, dropout_rate, activation)
    elif positionwise_layer_type == "conv1d":
        positionwise_layer = MultiLayeredConv1d
        positionwise_layer_args = (
            attention_dim, linear_units, positionwise_conv_kernel_size, dropout_rate,
        )
    elif positionwise_layer_type == "conv1d-linear":
        positionwise_layer = Conv1dLinear
        positionwise_layer_args = (
            attention_dim, linear_units, positionwise_conv_kernel_size, dropout_rate,
        )
    else:
        raise NotImplementedError("Support only linear or conv1d.")

    if selfattention_layer_type == "selfattn":
        logging.info("encoder self-attention layer type = self-attention")
        encoder_selfattn_layer = MultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads, attention_dim, attention_dropout_rate,
        )
    elif selfattention_layer_type == "rel_selfattn":
        assert pos_enc_layer_type == "rel_pos"
        encoder_selfattn_layer = RelPositionMultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads, attention_dim, attention_dropout_rate,
        )
    else:
        raise ValueError("unknown encoder_attn_layer: " + selfattention_layer_type)

    convolution_layer = ConvolutionModule
    convolution_layer_args = (attention_dim, cnn_module_kernel, activation)

    self.encoders = repeat(
        num_blocks,
        lambda lnum: EncoderLayer(
            attention_dim,
            encoder_selfattn_layer(*encoder_selfattn_layer_args),
            positionwise_layer(*positionwise_layer_args),
            positionwise_layer(*positionwise_layer_args) if macaron_style else None,
            convolution_layer(*convolution_layer_args) if use_cnn_module else None,
            dropout_rate,
            normalize_before,
            concat_after,
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
def __init__(
    self,
    input_size: int,
    output_size: int = 256,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    attention_dropout_rate: float = 0.0,
    input_layer: str = "conv2d",
    normalize_before: bool = True,
    concat_after: bool = False,
    positionwise_layer_type: str = "linear",
    positionwise_conv_kernel_size: int = 3,
    macaron_style: bool = False,
    rel_pos_type: str = "legacy",
    pos_enc_layer_type: str = "abs_pos",
    selfattention_layer_type: str = "lf_selfattn",
    activation_type: str = "swish",
    use_cnn_module: bool = True,
    zero_triu: bool = False,
    cnn_module_kernel: int = 31,
    padding_idx: int = -1,
    interctc_layer_idx: List[int] = [],
    interctc_use_conditioning: bool = False,
    attention_windows: list = [100, 100, 100, 100, 100, 100],
    attention_dilation: list = [1, 1, 1, 1, 1, 1],
    attention_mode: str = "sliding_chunks",
):
    assert check_argument_types()
    super().__init__(input_size)
    self._output_size = output_size

    activation = get_activation(activation_type)

    if pos_enc_layer_type == "abs_pos":
        pos_enc_class = PositionalEncoding
    else:
        raise ValueError(
            "incorrect or unknown pos_enc_layer: " + pos_enc_layer_type + "Use abs_pos"
        )

    if len(attention_dilation) != num_blocks:
        raise ValueError(
            "incorrect attention_dilation parameter of length"
            + str(len(attention_dilation))
            + " does not match num_blocks"
            + str(num_blocks)
        )

    if len(attention_windows) != num_blocks:
        raise ValueError(
            "incorrect attention_windows parameter of length"
            + str(len(attention_windows))
            + " does not match num_blocks"
            + str(num_blocks)
        )

    if attention_mode != "tvm" and max(attention_dilation) != 1:
        raise ValueError(
            "incorrect attention mode for dilation: "
            + attention_mode
            + "Use attention_mode=tvm with Cuda Kernel"
        )

    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(input_size, output_size),
            torch.nn.LayerNorm(output_size),
            torch.nn.Dropout(dropout_rate),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(
            input_size, output_size, dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d2":
        self.embed = Conv2dSubsampling2(
            input_size, output_size, dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d6":
        self.embed = Conv2dSubsampling6(
            input_size, output_size, dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d8":
        self.embed = Conv2dSubsampling8(
            input_size, output_size, dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer is None:
        self.embed = torch.nn.Sequential(
            pos_enc_class(output_size, positional_dropout_rate)
        )
    else:
        raise ValueError("unknown input_layer: " + input_layer)

    self.normalize_before = normalize_before

    if positionwise_layer_type == "linear":
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (output_size, linear_units, dropout_rate, activation)
    elif positionwise_layer_type == "conv1d":
        positionwise_layer = MultiLayeredConv1d
        positionwise_layer_args = (
            output_size, linear_units, positionwise_conv_kernel_size, dropout_rate,
        )
    elif positionwise_layer_type == "conv1d-linear":
        positionwise_layer = Conv1dLinear
        positionwise_layer_args = (
            output_size, linear_units, positionwise_conv_kernel_size, dropout_rate,
        )
    else:
        raise NotImplementedError("Support only linear or conv1d.")

    self.selfattention_layer_type = selfattention_layer_type
    if selfattention_layer_type == "lf_selfattn":
        assert pos_enc_layer_type == "abs_pos"
        from longformer.longformer import LongformerConfig
        from espnet.nets.pytorch_backend.transformer.longformer_attention import (
            LongformerAttention,
        )

        encoder_selfattn_layer = LongformerAttention
        config = LongformerConfig(
            attention_window=attention_windows,
            attention_dilation=attention_dilation,
            autoregressive=False,
            num_attention_heads=attention_heads,
            hidden_size=output_size,
            attention_probs_dropout_prob=dropout_rate,
            attention_mode=attention_mode,
        )
        encoder_selfattn_layer_args = (config,)
    else:
        raise ValueError(
            "incompatible or unknown encoder_attn_layer: "
            + selfattention_layer_type
            + " Use lf_selfattn"
        )

    convolution_layer = ConvolutionModule
    convolution_layer_args = (output_size, cnn_module_kernel, activation)

    self.encoders = repeat(
        num_blocks,
        lambda layer_id: EncoderLayer(
            output_size,
            encoder_selfattn_layer(*(encoder_selfattn_layer_args + (layer_id,))),
            positionwise_layer(*positionwise_layer_args),
            positionwise_layer(*positionwise_layer_args) if macaron_style else None,
            convolution_layer(*convolution_layer_args) if use_cnn_module else None,
            dropout_rate,
            normalize_before,
            concat_after,
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(output_size)

    self.interctc_layer_idx = interctc_layer_idx
    if len(interctc_layer_idx) > 0:
        assert 0 < min(interctc_layer_idx) and max(interctc_layer_idx) < num_blocks
    self.interctc_use_conditioning = interctc_use_conditioning
    self.conditioning_layer = None
def __init__(
    self,
    input_size: int,
    output_size: int = 256,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    attention_dropout_rate: float = 0.0,
    input_layer: str = "conv2d",
    normalize_before: bool = True,
    concat_after: bool = False,
    positionwise_layer_type: str = "linear",
    positionwise_conv_kernel_size: int = 3,
    macaron_style: bool = False,
    rel_pos_type: str = "legacy",
    pos_enc_layer_type: str = "rel_pos",
    selfattention_layer_type: str = "rel_selfattn",
    activation_type: str = "swish",
    use_cnn_module: bool = True,
    zero_triu: bool = False,
    cnn_module_kernel: int = 31,
    padding_idx: int = -1,
    interctc_layer_idx: List[int] = [],
    interctc_use_conditioning: bool = False,
    stochastic_depth_rate: Union[float, List[float]] = 0.0,
):
    assert check_argument_types()
    super().__init__()
    self._output_size = output_size

    if rel_pos_type == "legacy":
        if pos_enc_layer_type == "rel_pos":
            pos_enc_layer_type = "legacy_rel_pos"
        if selfattention_layer_type == "rel_selfattn":
            selfattention_layer_type = "legacy_rel_selfattn"
    elif rel_pos_type == "latest":
        assert selfattention_layer_type != "legacy_rel_selfattn"
        assert pos_enc_layer_type != "legacy_rel_pos"
    else:
        raise ValueError("unknown rel_pos_type: " + rel_pos_type)

    activation = get_activation(activation_type)
    if pos_enc_layer_type == "abs_pos":
        pos_enc_class = PositionalEncoding
    elif pos_enc_layer_type == "scaled_abs_pos":
        pos_enc_class = ScaledPositionalEncoding
    elif pos_enc_layer_type == "rel_pos":
        assert selfattention_layer_type == "rel_selfattn"
        pos_enc_class = RelPositionalEncoding
    elif pos_enc_layer_type == "legacy_rel_pos":
        assert selfattention_layer_type == "legacy_rel_selfattn"
        pos_enc_class = LegacyRelPositionalEncoding
        logging.warning(
            "Using legacy_rel_pos and it will be deprecated in the future."
        )
    else:
        raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)

    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(input_size, output_size),
            torch.nn.LayerNorm(output_size),
            torch.nn.Dropout(dropout_rate),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(
            input_size, output_size, dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d2":
        self.embed = Conv2dSubsampling2(
            input_size, output_size, dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d6":
        self.embed = Conv2dSubsampling6(
            input_size, output_size, dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d8":
        self.embed = Conv2dSubsampling8(
            input_size, output_size, dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer is None:
        self.embed = torch.nn.Sequential(
            pos_enc_class(output_size, positional_dropout_rate)
        )
    else:
        raise ValueError("unknown input_layer: " + input_layer)

    self.normalize_before = normalize_before

    if positionwise_layer_type == "linear":
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (output_size, linear_units, dropout_rate, activation)
    elif positionwise_layer_type == "conv1d":
        positionwise_layer = MultiLayeredConv1d
        positionwise_layer_args = (
            output_size, linear_units, positionwise_conv_kernel_size, dropout_rate,
        )
    elif positionwise_layer_type == "conv1d-linear":
        positionwise_layer = Conv1dLinear
        positionwise_layer_args = (
            output_size, linear_units, positionwise_conv_kernel_size, dropout_rate,
        )
    else:
        raise NotImplementedError("Support only linear or conv1d.")

    if selfattention_layer_type == "selfattn":
        encoder_selfattn_layer = MultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads, output_size, attention_dropout_rate,
        )
    elif selfattention_layer_type == "legacy_rel_selfattn":
        assert pos_enc_layer_type == "legacy_rel_pos"
        encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads, output_size, attention_dropout_rate,
        )
        logging.warning(
            "Using legacy_rel_selfattn and it will be deprecated in the future."
        )
    elif selfattention_layer_type == "rel_selfattn":
        assert pos_enc_layer_type == "rel_pos"
        encoder_selfattn_layer = RelPositionMultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads, output_size, attention_dropout_rate, zero_triu,
        )
    else:
        raise ValueError("unknown encoder_attn_layer: " + selfattention_layer_type)

    convolution_layer = ConvolutionModule
    convolution_layer_args = (output_size, cnn_module_kernel, activation)

    if isinstance(stochastic_depth_rate, float):
        stochastic_depth_rate = [stochastic_depth_rate] * num_blocks
    if len(stochastic_depth_rate) != num_blocks:
        raise ValueError(
            f"Length of stochastic_depth_rate ({len(stochastic_depth_rate)}) "
            f"should be equal to num_blocks ({num_blocks})"
        )

    self.encoders = repeat(
        num_blocks,
        lambda lnum: EncoderLayer(
            output_size,
            encoder_selfattn_layer(*encoder_selfattn_layer_args),
            positionwise_layer(*positionwise_layer_args),
            positionwise_layer(*positionwise_layer_args) if macaron_style else None,
            convolution_layer(*convolution_layer_args) if use_cnn_module else None,
            dropout_rate,
            normalize_before,
            concat_after,
            stochastic_depth_rate[lnum],
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(output_size)

    self.interctc_layer_idx = interctc_layer_idx
    if len(interctc_layer_idx) > 0:
        assert 0 < min(interctc_layer_idx) and max(interctc_layer_idx) < num_blocks
    self.interctc_use_conditioning = interctc_use_conditioning
    self.conditioning_layer = None
def __init__(
    self,
    input_size: int,
    output_size: int = 256,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    attention_dropout_rate: float = 0.0,
    input_layer: Optional[str] = "conv2d",
    normalize_before: bool = True,
    concat_after: bool = False,
    positionwise_layer_type: str = "linear",
    positionwise_conv_kernel_size: int = 3,
    macaron_style: bool = False,
    pos_enc_class=StreamPositionalEncoding,
    selfattention_layer_type: str = "rel_selfattn",
    activation_type: str = "swish",
    use_cnn_module: bool = True,
    cnn_module_kernel: int = 31,
    padding_idx: int = -1,
    block_size: int = 40,
    hop_size: int = 16,
    look_ahead: int = 16,
    init_average: bool = True,
    ctx_pos_enc: bool = True,
):
    assert check_argument_types()
    super().__init__()
    self._output_size = output_size
    self.pos_enc = pos_enc_class(output_size, positional_dropout_rate)
    activation = get_activation(activation_type)

    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(input_size, output_size),
            torch.nn.LayerNorm(output_size),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
        )
        self.subsample = 1
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsamplingWOPosEnc(
            input_size, output_size, dropout_rate, kernels=[3, 3], strides=[2, 2]
        )
        self.subsample = 4
    elif input_layer == "conv2d6":
        self.embed = Conv2dSubsamplingWOPosEnc(
            input_size, output_size, dropout_rate, kernels=[3, 5], strides=[2, 3]
        )
        self.subsample = 6
    elif input_layer == "conv2d8":
        self.embed = Conv2dSubsamplingWOPosEnc(
            input_size, output_size, dropout_rate, kernels=[3, 3, 3], strides=[2, 2, 2]
        )
        self.subsample = 8
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
        )
        self.subsample = 1
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(output_size, positional_dropout_rate),
        )
        self.subsample = 1
    elif input_layer is None:
        self.embed = torch.nn.Sequential(
            pos_enc_class(output_size, positional_dropout_rate)
        )
        self.subsample = 1
    else:
        raise ValueError("unknown input_layer: " + input_layer)

    self.normalize_before = normalize_before

    if positionwise_layer_type == "linear":
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (output_size, linear_units, dropout_rate)
    elif positionwise_layer_type == "conv1d":
        positionwise_layer = MultiLayeredConv1d
        positionwise_layer_args = (
            output_size, linear_units, positionwise_conv_kernel_size, dropout_rate,
        )
    elif positionwise_layer_type == "conv1d-linear":
        positionwise_layer = Conv1dLinear
        positionwise_layer_args = (
            output_size, linear_units, positionwise_conv_kernel_size, dropout_rate,
        )
    else:
        raise NotImplementedError("Support only linear or conv1d.")

    convolution_layer = ConvolutionModule
    convolution_layer_args = (output_size, cnn_module_kernel, activation)

    self.encoders = repeat(
        num_blocks,
        lambda lnum: ContextualBlockEncoderLayer(
            output_size,
            MultiHeadedAttention(attention_heads, output_size, attention_dropout_rate),
            positionwise_layer(*positionwise_layer_args),
            positionwise_layer(*positionwise_layer_args) if macaron_style else None,
            convolution_layer(*convolution_layer_args) if use_cnn_module else None,
            dropout_rate,
            num_blocks,
            normalize_before,
            concat_after,
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(output_size)

    # for block processing
    self.block_size = block_size
    self.hop_size = hop_size
    self.look_ahead = look_ahead
    self.init_average = init_average
    self.ctx_pos_enc = ctx_pos_enc
def __init__(
    self,
    input_size: int,
    output_size: int = 256,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    attention_dropout_rate: float = 0.0,
    input_layer: str = "conv2d",
    normalize_before: bool = True,
    concat_after: bool = False,
    positionwise_layer_type: str = "linear",
    positionwise_conv_kernel_size: int = 3,
    macaron_style: bool = False,
    pos_enc_layer_type: str = "rel_pos",
    selfattention_layer_type: str = "rel_selfattn",
    activation_type: str = "swish",
    use_cnn_module: bool = True,
    cnn_module_kernel: int = 31,
    padding_idx: int = -1,
):
    assert check_argument_types()
    super().__init__()
    self._output_size = output_size

    activation = get_activation(activation_type)

    if pos_enc_layer_type == "abs_pos":
        pos_enc_class = PositionalEncoding
    elif pos_enc_layer_type == "scaled_abs_pos":
        pos_enc_class = ScaledPositionalEncoding
    elif pos_enc_layer_type == "rel_pos":
        assert selfattention_layer_type == "rel_selfattn"
        pos_enc_class = RelPositionalEncoding
    else:
        raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)

    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(input_size, output_size),
            torch.nn.LayerNorm(output_size),
            torch.nn.Dropout(dropout_rate),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(
            input_size, output_size, dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d6":
        self.embed = Conv2dSubsampling6(
            input_size, output_size, dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d8":
        self.embed = Conv2dSubsampling8(
            input_size, output_size, dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer is None:
        self.embed = torch.nn.Sequential(
            pos_enc_class(output_size, positional_dropout_rate)
        )
    else:
        raise ValueError("unknown input_layer: " + input_layer)

    self.normalize_before = normalize_before

    if positionwise_layer_type == "linear":
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (output_size, linear_units, dropout_rate, activation)
    elif positionwise_layer_type == "conv1d":
        positionwise_layer = MultiLayeredConv1d
        positionwise_layer_args = (
            output_size, linear_units, positionwise_conv_kernel_size, dropout_rate,
        )
    elif positionwise_layer_type == "conv1d-linear":
        positionwise_layer = Conv1dLinear
        positionwise_layer_args = (
            output_size, linear_units, positionwise_conv_kernel_size, dropout_rate,
        )
    else:
        raise NotImplementedError("Support only linear or conv1d.")

    if selfattention_layer_type == "selfattn":
        encoder_selfattn_layer = MultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads, output_size, attention_dropout_rate,
        )
    elif selfattention_layer_type == "rel_selfattn":
        assert pos_enc_layer_type == "rel_pos"
        encoder_selfattn_layer = RelPositionMultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads, output_size, attention_dropout_rate,
        )
    else:
        raise ValueError("unknown encoder_attn_layer: " + selfattention_layer_type)

    convolution_layer = ConvolutionModule
    convolution_layer_args = (output_size, cnn_module_kernel, activation)

    self.encoders = repeat(
        num_blocks,
        lambda lnum: EncoderLayer(
            output_size,
            encoder_selfattn_layer(*encoder_selfattn_layer_args),
            positionwise_layer(*positionwise_layer_args),
            positionwise_layer(*positionwise_layer_args) if macaron_style else None,
            convolution_layer(*convolution_layer_args) if use_cnn_module else None,
            dropout_rate,
            normalize_before,
            concat_after,
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(output_size)
def __init__(
    self,
    input_size: int,
    output_size: int = 256,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    attention_dropout_rate: float = 0.0,
    input_layer: Optional[str] = None,
    normalize_before: bool = True,
    concat_after: bool = False,
    positionwise_layer_type: str = "linear",
    positionwise_conv_kernel_size: int = 1,
    pos_enc_layer_type: str = "rel_pos",
    selfattention_layer_type: str = "rel_selfattn",
    activation_type: str = "relu",
    padding_idx: int = -1,
):
    assert check_argument_types()
    super().__init__()
    self._output_size = output_size

    # todo: my change, from conformer/encoder_layer.py
    if pos_enc_layer_type == "abs_pos":
        pos_enc_class = PositionalEncoding
    elif pos_enc_layer_type == "scaled_abs_pos":
        pos_enc_class = ScaledPositionalEncoding
    elif pos_enc_layer_type == "rel_pos":
        assert selfattention_layer_type == "rel_selfattn"
        pos_enc_class = RelPositionalEncoding
    else:
        raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)

    # input layer
    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(input_size, output_size),
            torch.nn.LayerNorm(output_size),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(input_size, output_size, dropout_rate)
    elif input_layer == "conv2d6":
        self.embed = Conv2dSubsampling6(input_size, output_size, dropout_rate)
    elif input_layer == "conv2d8":
        self.embed = Conv2dSubsampling8(input_size, output_size, dropout_rate)
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer is None:
        self.embed = torch.nn.Sequential(
            pos_enc_class(output_size, positional_dropout_rate)
        )
    else:
        raise ValueError("unknown input_layer: " + input_layer)

    self.normalize_before = normalize_before

    # position-wise layer
    activation = get_activation(activation_type)
    if positionwise_layer_type == "linear":
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (output_size, linear_units, dropout_rate, activation)
    elif positionwise_layer_type == "conv1d":
        positionwise_layer = MultiLayeredConv1d
        positionwise_layer_args = (
            output_size, linear_units, positionwise_conv_kernel_size, dropout_rate,
        )
    elif positionwise_layer_type == "conv1d-linear":
        positionwise_layer = Conv1dLinear
        positionwise_layer_args = (
            output_size, linear_units, positionwise_conv_kernel_size, dropout_rate,
        )
    else:
        raise NotImplementedError("Support only linear or conv1d.")

    # encoders type and args
    if selfattention_layer_type == "selfattn":
        encoder_selfattn_layer = MultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads, output_size, attention_dropout_rate,
        )
    elif selfattention_layer_type == "rel_selfattn":
        assert pos_enc_layer_type == "rel_pos"
        encoder_selfattn_layer = RelPositionMultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads, output_size, attention_dropout_rate,
        )
    else:
        raise ValueError("unknown encoder_attn_layer: " + selfattention_layer_type)

    # encoders
    self.encoders = repeat(
        num_blocks,
        lambda lnum: EncoderLayer(
            output_size,
            encoder_selfattn_layer(*encoder_selfattn_layer_args),
            positionwise_layer(*positionwise_layer_args),
            dropout_rate,
            normalize_before,
            concat_after,
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(output_size)
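# Every snippet above resolves its nonlinearity through get_activation(name).
# A minimal sketch of such a helper is given below; the exact set of names the
# library supports is an assumption, and Swish is shown here as x * sigmoid(x).
import torch


class SwishSketch(torch.nn.Module):
    """Illustrative Swish activation: x * sigmoid(x)."""

    def forward(self, x):
        return x * torch.sigmoid(x)


def get_activation_sketch(act: str) -> torch.nn.Module:
    """Map an activation name to a torch module (hypothetical helper)."""
    activations = {
        "hardtanh": torch.nn.Hardtanh(),
        "tanh": torch.nn.Tanh(),
        "relu": torch.nn.ReLU(),
        "selu": torch.nn.SELU(),
        "swish": SwishSketch(),
    }
    return activations[act]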