def __init__( self, idim, attention_dim=256, attention_heads=4, linear_units=2048, num_blocks=6, dropout_rate=0.1, positional_dropout_rate=0.1, attention_dropout_rate=0.0, input_layer="conv2d", normalize_before=True, concat_after=False, positionwise_layer_type="linear", positionwise_conv_kernel_size=1, macaron_style=False, pos_enc_layer_type="abs_pos", selfattention_layer_type="hopfield", activation_type="swish", use_cnn_module=False, cnn_module_kernel=31, padding_idx=-1, ): """Construct an Encoder object.""" super(Encoder, self).__init__() activation = get_activation(activation_type) if pos_enc_layer_type == "abs_pos": pos_enc_class = PositionalEncoding elif pos_enc_layer_type == "scaled_abs_pos": pos_enc_class = ScaledPositionalEncoding elif pos_enc_layer_type == "rel_pos": assert selfattention_layer_type == "rel_selfattn" pos_enc_class = RelPositionalEncoding else: raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) if input_layer == "linear": self.embed = torch.nn.Sequential( torch.nn.Linear(idim, attention_dim), torch.nn.LayerNorm(attention_dim), torch.nn.Dropout(dropout_rate), pos_enc_class(attention_dim, positional_dropout_rate), ) elif input_layer == "conv2d": self.embed = Conv2dSubsampling( idim, attention_dim, dropout_rate, pos_enc_class(attention_dim, positional_dropout_rate), ) elif input_layer == "vgg2l": self.embed = VGG2L(idim, attention_dim) elif input_layer == "embed": self.embed = torch.nn.Sequential( torch.nn.Embedding(idim, attention_dim, padding_idx=padding_idx), pos_enc_class(attention_dim, positional_dropout_rate), ) elif isinstance(input_layer, torch.nn.Module): self.embed = torch.nn.Sequential( input_layer, pos_enc_class(attention_dim, positional_dropout_rate), ) elif input_layer is None: self.embed = torch.nn.Sequential( pos_enc_class(attention_dim, positional_dropout_rate)) else: raise ValueError("unknown input_layer: " + input_layer) self.normalize_before = normalize_before if positionwise_layer_type == "linear": positionwise_layer = PositionwiseFeedForward positionwise_layer_args = ( attention_dim, linear_units, dropout_rate, activation, ) elif positionwise_layer_type == "conv1d": positionwise_layer = MultiLayeredConv1d positionwise_layer_args = ( attention_dim, linear_units, positionwise_conv_kernel_size, dropout_rate, ) elif positionwise_layer_type == "conv1d-linear": positionwise_layer = Conv1dLinear positionwise_layer_args = ( attention_dim, linear_units, positionwise_conv_kernel_size, dropout_rate, ) else: raise NotImplementedError("Support only linear or conv1d.") if selfattention_layer_type == "selfattn": logging.info("encoder self-attention layer type = self-attention") encoder_selfattn_layer = MultiHeadedAttention encoder_selfattn_layer_args = ( attention_heads, attention_dim, attention_dropout_rate, ) elif selfattention_layer_type == "rel_selfattn": assert pos_enc_layer_type == "rel_pos" encoder_selfattn_layer = RelPositionMultiHeadedAttention encoder_selfattn_layer_args = ( attention_heads, attention_dim, attention_dropout_rate, ) elif selfattention_layer_type == "hopfield": logging.info("encoder self-attention layer type = hopfield layer") self.encoders = repeat( num_blocks, lambda lnum: HopfieldEncoderLayer( Hopfield(attention_dim, num_heads=attention_heads, dropout=attention_dropout_rate)), ) else: raise ValueError("unknown encoder_attn_layer: " + selfattention_layer_type) convolution_layer = ConvolutionModule convolution_layer_args = (attention_dim, cnn_module_kernel, activation) if selfattention_layer_type != "hopfield": self.encoders = repeat( num_blocks, lambda lnum: EncoderLayer( attention_dim, encoder_selfattn_layer(*encoder_selfattn_layer_args), positionwise_layer(*positionwise_layer_args), positionwise_layer(*positionwise_layer_args) if macaron_style else None, convolution_layer(*convolution_layer_args) if use_cnn_module else None, dropout_rate, normalize_before, concat_after, ), ) if self.normalize_before: self.after_norm = LayerNorm(attention_dim)
def __init__( self, input_size: int, output_size: int = 256, attention_heads: int = 4, linear_units: int = 2048, num_blocks: int = 6, dropout_rate: float = 0.1, positional_dropout_rate: float = 0.1, attention_dropout_rate: float = 0.0, input_layer: str = "conv2d", normalize_before: bool = True, concat_after: bool = False, positionwise_layer_type: str = "linear", positionwise_conv_kernel_size: int = 3, macaron_style: bool = False, rel_pos_type: str = "legacy", pos_enc_layer_type: str = "rel_pos", selfattention_layer_type: str = "rel_selfattn", activation_type: str = "swish", use_cnn_module: bool = True, zero_triu: bool = False, cnn_module_kernel: int = 31, padding_idx: int = -1, interctc_layer_idx: List[int] = [], interctc_use_conditioning: bool = False, stochastic_depth_rate: Union[float, List[float]] = 0.0, ): assert check_argument_types() super().__init__() self._output_size = output_size if rel_pos_type == "legacy": if pos_enc_layer_type == "rel_pos": pos_enc_layer_type = "legacy_rel_pos" if selfattention_layer_type == "rel_selfattn": selfattention_layer_type = "legacy_rel_selfattn" elif rel_pos_type == "latest": assert selfattention_layer_type != "legacy_rel_selfattn" assert pos_enc_layer_type != "legacy_rel_pos" else: raise ValueError("unknown rel_pos_type: " + rel_pos_type) activation = get_activation(activation_type) if pos_enc_layer_type == "abs_pos": pos_enc_class = PositionalEncoding elif pos_enc_layer_type == "scaled_abs_pos": pos_enc_class = ScaledPositionalEncoding elif pos_enc_layer_type == "rel_pos": assert selfattention_layer_type == "rel_selfattn" pos_enc_class = RelPositionalEncoding elif pos_enc_layer_type == "legacy_rel_pos": assert selfattention_layer_type == "legacy_rel_selfattn" pos_enc_class = LegacyRelPositionalEncoding logging.warning( "Using legacy_rel_pos and it will be deprecated in the future." ) else: raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) if input_layer == "linear": self.embed = torch.nn.Sequential( torch.nn.Linear(input_size, output_size), torch.nn.LayerNorm(output_size), torch.nn.Dropout(dropout_rate), pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer == "conv2d": self.embed = Conv2dSubsampling( input_size, output_size, dropout_rate, pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer == "conv2d2": self.embed = Conv2dSubsampling2( input_size, output_size, dropout_rate, pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer == "conv2d6": self.embed = Conv2dSubsampling6( input_size, output_size, dropout_rate, pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer == "conv2d8": self.embed = Conv2dSubsampling8( input_size, output_size, dropout_rate, pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer == "embed": self.embed = torch.nn.Sequential( torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx), pos_enc_class(output_size, positional_dropout_rate), ) elif isinstance(input_layer, torch.nn.Module): self.embed = torch.nn.Sequential( input_layer, pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer is None: self.embed = torch.nn.Sequential( pos_enc_class(output_size, positional_dropout_rate)) else: raise ValueError("unknown input_layer: " + input_layer) self.normalize_before = normalize_before if positionwise_layer_type == "linear": positionwise_layer = PositionwiseFeedForward positionwise_layer_args = ( output_size, linear_units, dropout_rate, activation, ) elif positionwise_layer_type == "conv1d": positionwise_layer = MultiLayeredConv1d positionwise_layer_args = ( output_size, linear_units, positionwise_conv_kernel_size, dropout_rate, ) elif positionwise_layer_type == "conv1d-linear": positionwise_layer = Conv1dLinear positionwise_layer_args = ( output_size, linear_units, positionwise_conv_kernel_size, dropout_rate, ) else: raise NotImplementedError("Support only linear or conv1d.") if selfattention_layer_type == "selfattn": encoder_selfattn_layer = MultiHeadedAttention encoder_selfattn_layer_args = ( attention_heads, output_size, attention_dropout_rate, ) elif selfattention_layer_type == "legacy_rel_selfattn": assert pos_enc_layer_type == "legacy_rel_pos" encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention encoder_selfattn_layer_args = ( attention_heads, output_size, attention_dropout_rate, ) logging.warning( "Using legacy_rel_selfattn and it will be deprecated in the future." ) elif selfattention_layer_type == "rel_selfattn": assert pos_enc_layer_type == "rel_pos" encoder_selfattn_layer = RelPositionMultiHeadedAttention encoder_selfattn_layer_args = ( attention_heads, output_size, attention_dropout_rate, zero_triu, ) else: raise ValueError("unknown encoder_attn_layer: " + selfattention_layer_type) convolution_layer = ConvolutionModule convolution_layer_args = (output_size, cnn_module_kernel, activation) if isinstance(stochastic_depth_rate, float): stochastic_depth_rate = [stochastic_depth_rate] * num_blocks if len(stochastic_depth_rate) != num_blocks: raise ValueError( f"Length of stochastic_depth_rate ({len(stochastic_depth_rate)}) " f"should be equal to num_blocks ({num_blocks})") self.encoders = repeat( num_blocks, lambda lnum: EncoderLayer( output_size, encoder_selfattn_layer(*encoder_selfattn_layer_args), positionwise_layer(*positionwise_layer_args), positionwise_layer(*positionwise_layer_args) if macaron_style else None, convolution_layer(*convolution_layer_args) if use_cnn_module else None, dropout_rate, normalize_before, concat_after, stochastic_depth_rate[lnum], ), ) if self.normalize_before: self.after_norm = LayerNorm(output_size) self.interctc_layer_idx = interctc_layer_idx if len(interctc_layer_idx) > 0: assert 0 < min(interctc_layer_idx) and max( interctc_layer_idx) < num_blocks self.interctc_use_conditioning = interctc_use_conditioning self.conditioning_layer = None
def __init__( self, input_size: int, output_size: int = 256, use_attn: bool = True, attention_heads: int = 4, attention_layer_type: str = "rel_selfattn", pos_enc_layer_type: str = "rel_pos", rel_pos_type: str = "latest", use_cgmlp: bool = True, cgmlp_linear_units: int = 2048, cgmlp_conv_kernel: int = 31, use_linear_after_conv: bool = False, gate_activation: str = "identity", merge_method: str = "concat", cgmlp_weight: Union[float, List[float]] = 0.5, attn_branch_drop_rate: Union[float, List[float]] = 0.0, num_blocks: int = 12, dropout_rate: float = 0.1, positional_dropout_rate: float = 0.1, attention_dropout_rate: float = 0.0, input_layer: Optional[str] = "conv2d", zero_triu: bool = False, padding_idx: int = -1, stochastic_depth_rate: Union[float, List[float]] = 0.0, ): assert check_argument_types() super().__init__() self._output_size = output_size if rel_pos_type == "legacy": if pos_enc_layer_type == "rel_pos": pos_enc_layer_type = "legacy_rel_pos" if attention_layer_type == "rel_selfattn": attention_layer_type = "legacy_rel_selfattn" elif rel_pos_type == "latest": assert attention_layer_type != "legacy_rel_selfattn" assert pos_enc_layer_type != "legacy_rel_pos" else: raise ValueError("unknown rel_pos_type: " + rel_pos_type) if pos_enc_layer_type == "abs_pos": pos_enc_class = PositionalEncoding elif pos_enc_layer_type == "scaled_abs_pos": pos_enc_class = ScaledPositionalEncoding elif pos_enc_layer_type == "rel_pos": assert attention_layer_type == "rel_selfattn" pos_enc_class = RelPositionalEncoding elif pos_enc_layer_type == "legacy_rel_pos": assert attention_layer_type == "legacy_rel_selfattn" pos_enc_class = LegacyRelPositionalEncoding logging.warning( "Using legacy_rel_pos and it will be deprecated in the future." ) else: raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) if input_layer == "linear": self.embed = torch.nn.Sequential( torch.nn.Linear(input_size, output_size), torch.nn.LayerNorm(output_size), torch.nn.Dropout(dropout_rate), pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer == "conv2d": self.embed = Conv2dSubsampling( input_size, output_size, dropout_rate, pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer == "conv2d2": self.embed = Conv2dSubsampling2( input_size, output_size, dropout_rate, pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer == "conv2d6": self.embed = Conv2dSubsampling6( input_size, output_size, dropout_rate, pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer == "conv2d8": self.embed = Conv2dSubsampling8( input_size, output_size, dropout_rate, pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer == "embed": self.embed = torch.nn.Sequential( torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx), pos_enc_class(output_size, positional_dropout_rate), ) elif isinstance(input_layer, torch.nn.Module): self.embed = torch.nn.Sequential( input_layer, pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer is None: if input_size == output_size: self.embed = None else: self.embed = torch.nn.Linear(input_size, output_size) else: raise ValueError("unknown input_layer: " + input_layer) if attention_layer_type == "selfattn": encoder_selfattn_layer = MultiHeadedAttention encoder_selfattn_layer_args = ( attention_heads, output_size, attention_dropout_rate, ) elif attention_layer_type == "legacy_rel_selfattn": assert pos_enc_layer_type == "legacy_rel_pos" encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention encoder_selfattn_layer_args = ( attention_heads, output_size, attention_dropout_rate, ) logging.warning( "Using legacy_rel_selfattn and it will be deprecated in the future." ) elif attention_layer_type == "rel_selfattn": assert pos_enc_layer_type == "rel_pos" encoder_selfattn_layer = RelPositionMultiHeadedAttention encoder_selfattn_layer_args = ( attention_heads, output_size, attention_dropout_rate, zero_triu, ) elif attention_layer_type == "fast_selfattn": assert pos_enc_layer_type in ["abs_pos", "scaled_abs_pos"] encoder_selfattn_layer = FastSelfAttention encoder_selfattn_layer_args = ( output_size, attention_heads, attention_dropout_rate, ) else: raise ValueError("unknown encoder_attn_layer: " + attention_layer_type) cgmlp_layer = ConvolutionalGatingMLP cgmlp_layer_args = ( output_size, cgmlp_linear_units, cgmlp_conv_kernel, dropout_rate, use_linear_after_conv, gate_activation, ) if isinstance(stochastic_depth_rate, float): stochastic_depth_rate = [stochastic_depth_rate] * num_blocks if len(stochastic_depth_rate) != num_blocks: raise ValueError( f"Length of stochastic_depth_rate ({len(stochastic_depth_rate)}) " f"should be equal to num_blocks ({num_blocks})") if isinstance(cgmlp_weight, float): cgmlp_weight = [cgmlp_weight] * num_blocks if len(cgmlp_weight) != num_blocks: raise ValueError( f"Length of cgmlp_weight ({len(cgmlp_weight)}) should be equal to " f"num_blocks ({num_blocks})") if isinstance(attn_branch_drop_rate, float): attn_branch_drop_rate = [attn_branch_drop_rate] * num_blocks if len(attn_branch_drop_rate) != num_blocks: raise ValueError( f"Length of attn_branch_drop_rate ({len(attn_branch_drop_rate)}) " f"should be equal to num_blocks ({num_blocks})") self.encoders = repeat( num_blocks, lambda lnum: BranchformerEncoderLayer( output_size, encoder_selfattn_layer(*encoder_selfattn_layer_args) if use_attn else None, cgmlp_layer(*cgmlp_layer_args) if use_cgmlp else None, dropout_rate, merge_method, cgmlp_weight[lnum], attn_branch_drop_rate[lnum], stochastic_depth_rate[lnum], ), ) self.after_norm = LayerNorm(output_size)
def build_input_layer( input_layer, idim, odim, pos_enc_class, dropout_rate_embed, dropout_rate, pos_dropout_rate, padding_idx, ): """Build input layer. Args: input_layer (str): input layer type idim (int): input dimension odim (int): output dimension pos_enc_class (class): positional encoding class dropout_rate_embed (float): dropout rate for embedding layer dropout_rate (float): dropout rate for input layer pos_dropout_rate (float): dropout rate for positional encoding padding_idx (int): padding index for embedding input layer (if specified) Returns: (torch.nn.*): input layer module subsampling_factor (int): subsampling factor """ if pos_enc_class.__name__ == "RelPositionalEncoding": pos_enc_class_subsampling = pos_enc_class(odim, pos_dropout_rate) else: pos_enc_class_subsampling = None if input_layer == "linear": return ( torch.nn.Sequential( torch.nn.Linear(idim, odim), torch.nn.LayerNorm(odim), torch.nn.Dropout(dropout_rate), torch.nn.ReLU(), pos_enc_class(odim, pos_dropout_rate), ), 1, ) elif input_layer == "conv2d": return Conv2dSubsampling(idim, odim, dropout_rate, pos_enc_class_subsampling), 4 elif input_layer == "vgg2l": return VGG2L(idim, odim, pos_enc_class_subsampling), 4 elif input_layer == "embed": return ( torch.nn.Sequential( torch.nn.Embedding(idim, odim, padding_idx=padding_idx), pos_enc_class(odim, pos_dropout_rate), ), 1, ) elif input_layer == "c-embed": return ( torch.nn.Sequential( torch.nn.Embedding(idim, odim, padding_idx=padding_idx), torch.nn.Dropout(dropout_rate_embed), ), 1, ) else: raise NotImplementedError("Support: linear, conv2d, vgg2l and embed")
def build_input_layer( block: Dict[str, Any], pos_enc_class: torch.nn.Module, padding_idx: int, ) -> Tuple[Union[Conv2dSubsampling, VGG2L, torch.nn.Sequential], int]: """Build input layer. Args: block: Architecture definition of input layer. pos_enc_class: Positional encoding class. padding_idx: Padding symbol ID for embedding layer (if provided). Returns: : Input layer module. subsampling_factor: Subsampling factor. """ input_type = block["type"] idim = block["idim"] odim = block["odim"] dropout_rate = block["dropout-rate"] pos_dropout_rate = block["pos-dropout-rate"] if pos_enc_class.__name__ == "RelPositionalEncoding": pos_enc_class_subsampling = pos_enc_class(odim, pos_dropout_rate) else: pos_enc_class_subsampling = None if input_type == "linear": return ( torch.nn.Sequential( torch.nn.Linear(idim, odim), torch.nn.LayerNorm(odim), torch.nn.Dropout(dropout_rate), torch.nn.ReLU(), pos_enc_class(odim, pos_dropout_rate), ), 1, ) elif input_type == "conv2d": return Conv2dSubsampling(idim, odim, dropout_rate, pos_enc_class_subsampling), 4 elif input_type == "vgg2l": return VGG2L(idim, odim, pos_enc_class_subsampling), 4 elif input_type == "embed": return ( torch.nn.Sequential( torch.nn.Embedding(idim, odim, padding_idx=padding_idx), pos_enc_class(odim, pos_dropout_rate), ), 1, ) elif input_type == "c-embed": return ( torch.nn.Sequential( torch.nn.Embedding(idim, odim, padding_idx=padding_idx), torch.nn.Dropout(dropout_rate), ), 1, ) else: raise NotImplementedError( "Invalid input layer: %s. Supported: linear, conv2d, vgg2l and embed" % input_type)
def __init__( self, input_size: int, output_size: int = 256, attention_heads: int = 4, linear_units: int = 2048, num_blocks: int = 6, dropout_rate: float = 0.1, positional_dropout_rate: float = 0.1, attention_dropout_rate: float = 0.0, input_layer: str = "conv2d", normalize_before: bool = True, concat_after: bool = False, positionwise_layer_type: str = "linear", positionwise_conv_kernel_size: int = 3, macaron_style: bool = False, pos_enc_layer_type: str = "rel_pos", selfattention_layer_type: str = "rel_selfattn", activation_type: str = "swish", use_cnn_module: bool = True, cnn_module_kernel: int = 31, padding_idx: int = -1, ): assert check_argument_types() super().__init__() self._output_size = output_size activation = get_activation(activation_type) if pos_enc_layer_type == "abs_pos": pos_enc_class = PositionalEncoding elif pos_enc_layer_type == "scaled_abs_pos": pos_enc_class = ScaledPositionalEncoding elif pos_enc_layer_type == "rel_pos": assert selfattention_layer_type == "rel_selfattn" pos_enc_class = RelPositionalEncoding else: raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) if input_layer == "linear": self.embed = torch.nn.Sequential( torch.nn.Linear(input_size, output_size), torch.nn.LayerNorm(output_size), torch.nn.Dropout(dropout_rate), pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer == "conv2d": self.embed = Conv2dSubsampling( input_size, output_size, dropout_rate, pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer == "conv2d6": self.embed = Conv2dSubsampling6( input_size, output_size, dropout_rate, pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer == "conv2d8": self.embed = Conv2dSubsampling8( input_size, output_size, dropout_rate, pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer == "embed": self.embed = torch.nn.Sequential( torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx), pos_enc_class(output_size, positional_dropout_rate), ) elif isinstance(input_layer, torch.nn.Module): self.embed = torch.nn.Sequential( input_layer, pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer is None: self.embed = torch.nn.Sequential( pos_enc_class(output_size, positional_dropout_rate) ) else: raise ValueError("unknown input_layer: " + input_layer) self.normalize_before = normalize_before if positionwise_layer_type == "linear": positionwise_layer = PositionwiseFeedForward positionwise_layer_args = ( output_size, linear_units, dropout_rate, activation, ) elif positionwise_layer_type == "conv1d": positionwise_layer = MultiLayeredConv1d positionwise_layer_args = ( output_size, linear_units, positionwise_conv_kernel_size, dropout_rate, ) elif positionwise_layer_type == "conv1d-linear": positionwise_layer = Conv1dLinear positionwise_layer_args = ( output_size, linear_units, positionwise_conv_kernel_size, dropout_rate, ) else: raise NotImplementedError("Support only linear or conv1d.") if selfattention_layer_type == "selfattn": encoder_selfattn_layer = MultiHeadedAttention encoder_selfattn_layer_args = ( attention_heads, output_size, attention_dropout_rate, ) elif selfattention_layer_type == "rel_selfattn": assert pos_enc_layer_type == "rel_pos" encoder_selfattn_layer = RelPositionMultiHeadedAttention encoder_selfattn_layer_args = ( attention_heads, output_size, attention_dropout_rate, ) else: raise ValueError("unknown encoder_attn_layer: " + selfattention_layer_type) convolution_layer = ConvolutionModule convolution_layer_args = (output_size, cnn_module_kernel, activation) self.encoders = repeat( num_blocks, lambda lnum: EncoderLayer( output_size, encoder_selfattn_layer(*encoder_selfattn_layer_args), positionwise_layer(*positionwise_layer_args), positionwise_layer(*positionwise_layer_args) if macaron_style else None, convolution_layer(*convolution_layer_args) if use_cnn_module else None, dropout_rate, normalize_before, concat_after, ), ) if self.normalize_before: self.after_norm = LayerNorm(output_size)
def __init__(self, idim, attention_dim=256, attention_heads=4, linear_units=2048, num_blocks=6, dropout_rate=0.1, positional_dropout_rate=0.1, attention_dropout_rate=0.0, input_layer="conv2d", pos_enc_class=PositionalEncoding, normalize_before=True, concat_after=False, positionwise_layer_type="linear", positionwise_conv_kernel_size=1, padding_idx=-1): """Construct an Encoder object.""" super(Encoder, self).__init__() if input_layer == "linear": self.embed = torch.nn.Sequential( torch.nn.Linear(idim, attention_dim), torch.nn.LayerNorm(attention_dim), torch.nn.Dropout(dropout_rate), torch.nn.ReLU(), pos_enc_class(attention_dim, positional_dropout_rate)) elif input_layer == "conv2d": self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate) elif input_layer == "embed": self.embed = torch.nn.Sequential( torch.nn.Embedding(idim, attention_dim, padding_idx=padding_idx), pos_enc_class(attention_dim, positional_dropout_rate)) elif isinstance(input_layer, torch.nn.Module): self.embed = torch.nn.Sequential( input_layer, pos_enc_class(attention_dim, positional_dropout_rate), ) elif input_layer is None: self.embed = torch.nn.Sequential( pos_enc_class(attention_dim, positional_dropout_rate)) else: raise ValueError("unknown input_layer: " + input_layer) self.normalize_before = normalize_before if positionwise_layer_type == "linear": positionwise_layer = PositionwiseFeedForward positionwise_layer_args = (attention_dim, linear_units, dropout_rate) elif positionwise_layer_type == "conv1d": positionwise_layer = MultiLayeredConv1d positionwise_layer_args = (attention_dim, linear_units, positionwise_conv_kernel_size, dropout_rate) elif positionwise_layer_type == "conv1d-linear": positionwise_layer = Conv1dLinear positionwise_layer_args = (attention_dim, linear_units, positionwise_conv_kernel_size, dropout_rate) else: raise NotImplementedError("Support only linear or conv1d.") self.encoders = repeat( num_blocks, lambda: EncoderLayer( attention_dim, MultiHeadedAttention(attention_heads, attention_dim, attention_dropout_rate), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after)) if self.normalize_before: self.after_norm = LayerNorm(attention_dim)
def __init__( self, idim, selfattention_layer_type="selfattn", attention_dim=256, attention_heads=4, conv_wshare=4, conv_kernel_length=11, conv_usebias=False, linear_units=2048, num_blocks=6, dropout_rate=0.1, positional_dropout_rate=0.1, attention_dropout_rate=0.0, input_layer="conv2d", pos_enc_class=PositionalEncoding, normalize_before=True, concat_after=False, positionwise_layer_type="linear", positionwise_conv_kernel_size=1, padding_idx=-1, ): """Construct an Encoder object.""" super(Encoder, self).__init__() self._register_load_state_dict_pre_hook(_pre_hook) if input_layer == "linear": self.embed = torch.nn.Sequential( torch.nn.Linear(idim, attention_dim), torch.nn.LayerNorm(attention_dim), torch.nn.Dropout(dropout_rate), torch.nn.ReLU(), pos_enc_class(attention_dim, positional_dropout_rate), ) elif input_layer == "conv2d": self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate) elif input_layer == "vgg2l": self.embed = VGG2L(idim, attention_dim) elif input_layer == "embed": self.embed = torch.nn.Sequential( torch.nn.Embedding(idim, attention_dim, padding_idx=padding_idx), pos_enc_class(attention_dim, positional_dropout_rate), ) elif isinstance(input_layer, torch.nn.Module): self.embed = torch.nn.Sequential( input_layer, pos_enc_class(attention_dim, positional_dropout_rate), ) elif input_layer is None: self.embed = torch.nn.Sequential( pos_enc_class(attention_dim, positional_dropout_rate) ) else: raise ValueError("unknown input_layer: " + input_layer) self.normalize_before = normalize_before positionwise_layer, positionwise_layer_args = self.get_positionwise_layer( positionwise_layer_type, attention_dim, linear_units, dropout_rate, positionwise_conv_kernel_size, ) if selfattention_layer_type == "selfattn": logging.info("encoder self-attention layer type = self-attention") self.encoders = repeat( num_blocks, lambda lnum: EncoderLayer( attention_dim, MultiHeadedAttention( attention_heads, attention_dim, attention_dropout_rate ), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after, ), ) elif selfattention_layer_type == "lightconv": logging.info("encoder self-attention layer type = lightweight convolution") self.encoders = repeat( num_blocks, lambda lnum: EncoderLayer( attention_dim, LightweightConvolution( conv_wshare, attention_dim, attention_dropout_rate, conv_kernel_length, lnum, use_bias=conv_usebias, ), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after, ), ) elif selfattention_layer_type == "lightconv2d": logging.info( "encoder self-attention layer " "type = lightweight convolution 2-dimentional" ) self.encoders = repeat( num_blocks, lambda lnum: EncoderLayer( attention_dim, LightweightConvolution2D( conv_wshare, attention_dim, attention_dropout_rate, conv_kernel_length, lnum, use_bias=conv_usebias, ), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after, ), ) elif selfattention_layer_type == "dynamicconv": logging.info("encoder self-attention layer type = dynamic convolution") self.encoders = repeat( num_blocks, lambda lnum: EncoderLayer( attention_dim, DynamicConvolution( conv_wshare, attention_dim, attention_dropout_rate, conv_kernel_length, lnum, use_bias=conv_usebias, ), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after, ), ) elif selfattention_layer_type == "dynamicconv2d": logging.info( "encoder self-attention layer type = dynamic convolution 2-dimentional" ) self.encoders = repeat( num_blocks, lambda lnum: EncoderLayer( attention_dim, DynamicConvolution2D( conv_wshare, attention_dim, attention_dropout_rate, conv_kernel_length, lnum, use_bias=conv_usebias, ), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after, ), ) if self.normalize_before: self.after_norm = LayerNorm(attention_dim)
def __init__( self, input_size: int, output_size: int = 256, attention_heads: int = 4, linear_units: int = 2048, num_blocks: int = 6, dropout_rate: float = 0.1, positional_dropout_rate: float = 0.1, attention_dropout_rate: float = 0.0, input_layer: Optional[str] = "conv2d", pos_enc_class=PositionalEncoding, normalize_before: bool = True, concat_after: bool = False, positionwise_layer_type: str = "linear", positionwise_conv_kernel_size: int = 1, padding_idx: int = -1, interctc_layer_idx: List[int] = [], interctc_use_conditioning: bool = False, ): assert check_argument_types() super().__init__() self._output_size = output_size if input_layer == "linear": self.embed = torch.nn.Sequential( torch.nn.Linear(input_size, output_size), torch.nn.LayerNorm(output_size), torch.nn.Dropout(dropout_rate), torch.nn.ReLU(), pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer == "conv2d": self.embed = Conv2dSubsampling(input_size, output_size, dropout_rate) elif input_layer == "conv2d2": self.embed = Conv2dSubsampling2(input_size, output_size, dropout_rate) elif input_layer == "conv2d6": self.embed = Conv2dSubsampling6(input_size, output_size, dropout_rate) elif input_layer == "conv2d8": self.embed = Conv2dSubsampling8(input_size, output_size, dropout_rate) elif input_layer == "embed": self.embed = torch.nn.Sequential( torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx), pos_enc_class(output_size, positional_dropout_rate), ) elif input_layer is None: if input_size == output_size: self.embed = None else: self.embed = torch.nn.Linear(input_size, output_size) else: raise ValueError("unknown input_layer: " + input_layer) self.normalize_before = normalize_before if positionwise_layer_type == "linear": positionwise_layer = PositionwiseFeedForward positionwise_layer_args = ( output_size, linear_units, dropout_rate, ) elif positionwise_layer_type == "conv1d": positionwise_layer = MultiLayeredConv1d positionwise_layer_args = ( output_size, linear_units, positionwise_conv_kernel_size, dropout_rate, ) elif positionwise_layer_type == "conv1d-linear": positionwise_layer = Conv1dLinear positionwise_layer_args = ( output_size, linear_units, positionwise_conv_kernel_size, dropout_rate, ) else: raise NotImplementedError("Support only linear or conv1d.") self.encoders = repeat( num_blocks, lambda lnum: EncoderLayer( output_size, MultiHeadedAttention(attention_heads, output_size, attention_dropout_rate), positionwise_layer(*positionwise_layer_args), dropout_rate, normalize_before, concat_after, ), ) if self.normalize_before: self.after_norm = LayerNorm(output_size) self.interctc_layer_idx = interctc_layer_idx if len(interctc_layer_idx) > 0: assert 0 < min(interctc_layer_idx) and max( interctc_layer_idx) < num_blocks self.interctc_use_conditioning = interctc_use_conditioning self.conditioning_layer = None