def __init__(self,
             voca_size: int,
             enc_output_size: int,
             pred_output_size: int,
             join_dim: int,
             prejoin_linear: bool = True,
             postjoin_linear: bool = False,
             joint_mode: str = 'add',
             activation: str = "tanh"):
    assert check_argument_types()
    # TODO(Mddct): concat in future
    assert joint_mode in ['add']
    super().__init__()

    self.activation = get_activation(activation)
    self.prejoin_linear = prejoin_linear
    self.postjoin_linear = postjoin_linear
    self.joint_mode = joint_mode

    # Without any projection, all three dimensions must already agree.
    if not self.prejoin_linear and not self.postjoin_linear:
        assert enc_output_size == pred_output_size == join_dim
    # torchscript compatibility
    self.enc_ffn: Optional[nn.Linear] = None
    self.pred_ffn: Optional[nn.Linear] = None
    if self.prejoin_linear:
        self.enc_ffn = nn.Linear(enc_output_size, join_dim)
        self.pred_ffn = nn.Linear(pred_output_size, join_dim)
    # torchscript compatibility
    self.post_ffn: Optional[nn.Linear] = None
    if self.postjoin_linear:
        self.post_ffn = nn.Linear(enc_output_size, join_dim)

    self.ffn_out = nn.Linear(join_dim, voca_size)
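# Usage sketch for the joint network above. The class name TransducerJoint
# and the broadcast-add shapes below are assumptions based on the standard
# RNN-T joint; the actual forward pass is defined elsewhere in this class.
# With prejoin_linear=True (the default) both inputs are projected to
# join_dim before the additive join:
import torch

joint = TransducerJoint(voca_size=4233,
                        enc_output_size=256,
                        pred_output_size=256,
                        join_dim=512)
enc_out = torch.randn(4, 100, 1, 256)   # (B, T, 1, D_enc) encoder frames
pred_out = torch.randn(4, 1, 10, 256)   # (B, 1, U, D_pred) predictor states
# The 'add' joint, written out explicitly: project both sides, broadcast
# over the (T, U) grid, apply the activation, then map to vocab logits.
x = joint.enc_ffn(enc_out) + joint.pred_ffn(pred_out)  # (B, T, U, join_dim)
logits = joint.ffn_out(joint.activation(x))            # (B, T, U, voca_size)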
def __init__(
    self,
    input_size: int,
    output_size: int = 256,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    attention_dropout_rate: float = 0.0,
    input_layer: str = "conv2d",
    pos_enc_layer_type: str = "rel_pos",
    normalize_before: bool = True,
    concat_after: bool = False,
    static_chunk_size: int = 0,
    use_dynamic_chunk: bool = False,
    global_cmvn: Optional[torch.nn.Module] = None,
    use_dynamic_left_chunk: bool = False,
    positionwise_conv_kernel_size: int = 1,
    macaron_style: bool = True,
    selfattention_layer_type: str = "rel_selfattn",
    activation_type: str = "swish",
    use_cnn_module: bool = True,
    cnn_module_kernel: int = 15,
    causal: bool = False,
    cnn_module_norm: str = "batch_norm",
):
    """Construct ConformerEncoder.

    Args:
        input_size to use_dynamic_left_chunk: see the documentation of
            BaseEncoder.
        positionwise_conv_kernel_size (int): Kernel size of positionwise
            conv1d layer.
        macaron_style (bool): Whether to use macaron style for
            positionwise layer.
        selfattention_layer_type (str): Encoder attention layer type.
            This parameter currently has no effect; it is kept only for
            configuration compatibility.
        activation_type (str): Encoder activation function type.
        use_cnn_module (bool): Whether to use the convolution module.
        cnn_module_kernel (int): Kernel size of the convolution module.
        causal (bool): Whether to use causal convolution.
        cnn_module_norm (str): Normalization used inside the convolution
            module, e.g. "batch_norm" or "layer_norm".
    """
    assert check_argument_types()
    super().__init__(input_size, output_size, attention_heads, linear_units,
                     num_blocks, dropout_rate, positional_dropout_rate,
                     attention_dropout_rate, input_layer, pos_enc_layer_type,
                     normalize_before, concat_after, static_chunk_size,
                     use_dynamic_chunk, global_cmvn, use_dynamic_left_chunk)
    activation = get_activation(activation_type)

    # self-attention module definition
    if pos_enc_layer_type == "no_pos":
        encoder_selfattn_layer = MultiHeadedAttention
    else:
        encoder_selfattn_layer = RelPositionMultiHeadedAttention
    encoder_selfattn_layer_args = (
        attention_heads,
        output_size,
        attention_dropout_rate,
    )
    # feed-forward module definition
    positionwise_layer = PositionwiseFeedForward
    positionwise_layer_args = (
        output_size,
        linear_units,
        dropout_rate,
        activation,
    )
    # convolution module definition
    convolution_layer = ConvolutionModule
    convolution_layer_args = (output_size, cnn_module_kernel, activation,
                              cnn_module_norm, causal)

    self.encoders = torch.nn.ModuleList([
        ConformerEncoderLayer(
            output_size,
            encoder_selfattn_layer(*encoder_selfattn_layer_args),
            positionwise_layer(*positionwise_layer_args),
            positionwise_layer(
                *positionwise_layer_args) if macaron_style else None,
            convolution_layer(
                *convolution_layer_args) if use_cnn_module else None,
            dropout_rate,
            normalize_before,
            concat_after,
        ) for _ in range(num_blocks)
    ])
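# Usage sketch for the encoder above. The class name ConformerEncoder and
# the forward signature are assumptions based on WeNet's BaseEncoder, which
# this constructor calls into; the forward pass itself is not shown here.
import torch

encoder = ConformerEncoder(input_size=80, output_size=256, num_blocks=6)
xs = torch.randn(2, 200, 80)        # (B, T, feat_dim) fbank features
xs_lens = torch.tensor([200, 160])  # valid frame count per utterance
# Assumed BaseEncoder-style forward: returns encoded frames and a mask of
# valid positions; T' is roughly T/4 after the conv2d subsampling layer.
out, masks = encoder(xs, xs_lens)   # out: (B, T', 256)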
def __init__(
    self,
    input_size: int,
    output_size: int = 256,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    attention_dropout_rate: float = 0.0,
    input_layer: Optional[str] = "conv2d",
    normalize_before: bool = True,
    concat_after: bool = False,
    positionwise_layer_type: Optional[str] = "linear",
    positionwise_conv_kernel_size: int = 1,
    macaron_style: bool = True,
    pos_enc_layer_type: Optional[str] = "rel_pos",
    selfattention_layer_type: Optional[str] = "rel_selfattn",
    activation_type: Optional[str] = "swish",
    use_cnn_module: bool = True,
    cnn_module_kernel: int = 15,
    causal: bool = False,
    static_chunk_size: int = 0,
    use_dynamic_chunk: bool = False,
):
    """Construct an Encoder object."""
    assert check_argument_types()
    super().__init__()
    self._output_size = output_size
    activation = get_activation(activation_type)

    # positional encoding definition
    if pos_enc_layer_type == "abs_pos":
        pos_enc_class = PositionalEncoding
    elif pos_enc_layer_type == "scaled_abs_pos":
        pos_enc_class = ScaledPositionalEncoding
    elif pos_enc_layer_type == "rel_pos":
        assert selfattention_layer_type == "rel_selfattn"
        pos_enc_class = RelPositionalEncoding
    else:
        raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)

    # input layer (subsampling) definition
    if input_layer == "linear":
        self.embed = InputLinear(
            input_size,
            output_size,
            dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d":
        self.embed = RelConv2dSubsampling(
            input_size,
            output_size,
            dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d6":
        self.embed = RelConv2dSubsampling6(
            input_size,
            output_size,
            dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d8":
        self.embed = RelConv2dSubsampling8(
            input_size,
            output_size,
            dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    else:
        raise ValueError("unknown input_layer: " + input_layer)

    self.normalize_before = normalize_before

    # self-attention module definition
    if selfattention_layer_type == "selfattn":
        logging.info("encoder self-attention layer type = self-attention")
        encoder_selfattn_layer = MultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads,
            output_size,
            attention_dropout_rate,
        )
    elif selfattention_layer_type == "rel_selfattn":
        assert pos_enc_layer_type == "rel_pos"
        encoder_selfattn_layer = RelPositionMultiHeadedAttention
        encoder_selfattn_layer_args = (
            attention_heads,
            output_size,
            attention_dropout_rate,
        )
    else:
        raise ValueError("unknown encoder_attn_layer: " +
                         selfattention_layer_type)

    # feed-forward module definition
    if positionwise_layer_type == "linear":
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (
            output_size,
            linear_units,
            dropout_rate,
            activation,
        )
    elif positionwise_layer_type == "conv1d":
        positionwise_layer = MultiLayeredConv1d
        positionwise_layer_args = (
            output_size,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    elif positionwise_layer_type == "conv1d-linear":
        positionwise_layer = Conv1dLinear
        positionwise_layer_args = (
            output_size,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    else:
        raise NotImplementedError("Support only linear or conv1d.")

    # convolution module definition
    convolution_layer = ConvolutionModule
    convolution_layer_args = (output_size, cnn_module_kernel, activation,
                              causal)

    self.encoders = torch.nn.ModuleList([
        ConformerEncoderLayer(
            output_size,
            encoder_selfattn_layer(*encoder_selfattn_layer_args),
            positionwise_layer(*positionwise_layer_args),
            positionwise_layer(
                *positionwise_layer_args) if macaron_style else None,
            convolution_layer(
                *convolution_layer_args) if use_cnn_module else None,
            dropout_rate,
            normalize_before,
            concat_after,
        ) for _ in range(num_blocks)
    ])
    if self.normalize_before:
        self.after_norm = torch.nn.LayerNorm(output_size)
    self.static_chunk_size = static_chunk_size
    self.use_dynamic_chunk = use_dynamic_chunk
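# Usage sketch for this encoder variant (constructor only; the class name
# ConformerEncoder is an assumption, as only __init__ is shown here). The
# asserts above tie "rel_pos" to "rel_selfattn", so the positional-encoding
# and self-attention options must be switched together:
encoder_abs = ConformerEncoder(input_size=80,
                               pos_enc_layer_type="abs_pos",
                               selfattention_layer_type="selfattn")
encoder_rel = ConformerEncoder(input_size=80,
                               pos_enc_layer_type="rel_pos",
                               selfattention_layer_type="rel_selfattn")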