def __init__(self, in_channels, out_channels, hidden_channels, params):
    """Encoder stack: 1x1 Conv1d+BN prenet followed by a relative-position Transformer.

    Args:
        in_channels (int): number of channels of the input tensor.
        out_channels (int): number of channels produced by the Transformer.
        hidden_channels (int): number of channels used inside the stack.
        params (dict): keyword arguments forwarded to ``RelativePositionTransformer``.
    """
    super().__init__()
    # Pointwise conv (kernel=1, stride=1) projecting input to the hidden width.
    self.prenet = Conv1dBN(in_channels, hidden_channels, 1, 1)
    # NOTE(review): the Transformer is built with `in_channels` even though the
    # prenet outputs `hidden_channels` — presumably callers use equal widths;
    # confirm before relying on differing in/hidden channel counts.
    self.rel_pos_transformer = RelativePositionTransformer(
        in_channels,
        out_channels,
        hidden_channels,
        **params,
    )
def __init__(self, in_channels, out_channels, hidden_channels, params):
    """Encoder stack: residual Conv1d+BN prenet followed by a relative-position Transformer.

    Args:
        in_channels (int): number of channels of the input tensor.
        out_channels (int): number of channels produced by the Transformer.
        hidden_channels (int): number of channels used inside the stack.
        params (dict): keyword arguments forwarded to ``RelativePositionTransformer``.
    """
    super().__init__()
    # 3 residual blocks of single undilated conv layers (kernel size 5),
    # mapping input width to the hidden width.
    self.prenet = ResidualConv1dBNBlock(
        in_channels,
        hidden_channels,
        hidden_channels,
        kernel_size=5,
        num_res_blocks=3,
        num_conv_blocks=1,
        dilations=[1, 1, 1],
    )
    self.rel_pos_transformer = RelativePositionTransformer(
        hidden_channels,
        out_channels,
        hidden_channels,
        **params,
    )
def __init__(
    self,
    n_vocab: int,
    out_channels: int,
    hidden_channels: int,
    hidden_channels_ffn: int,
    num_heads: int,
    num_layers: int,
    kernel_size: int,
    dropout_p: float,
    language_emb_dim: int = None,
):
    """Text Encoder for VITS model.

    Args:
        n_vocab (int): Number of characters for the embedding layer.
        out_channels (int): Number of channels for the output.
        hidden_channels (int): Number of channels for the hidden layers.
        hidden_channels_ffn (int): Number of channels for the convolutional layers.
        num_heads (int): Number of attention heads for the Transformer layers.
        num_layers (int): Number of Transformer layers.
        kernel_size (int): Kernel size for the FFN layers in Transformer network.
        dropout_p (float): Dropout rate for the Transformer layers.
        language_emb_dim (int, optional): If set, the encoder's internal width is
            widened by this amount to make room for a language embedding that is
            concatenated to the character embeddings. Defaults to None.
    """
    super().__init__()
    self.out_channels = out_channels
    self.hidden_channels = hidden_channels

    self.emb = nn.Embedding(n_vocab, hidden_channels)
    # Scale init by 1/sqrt(hidden_channels), the usual Transformer embedding init.
    nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)

    if language_emb_dim:
        # Widen the transformer to accept [char_emb ; lang_emb] concatenation.
        hidden_channels += language_emb_dim

    self.encoder = RelativePositionTransformer(
        in_channels=hidden_channels,
        out_channels=hidden_channels,
        hidden_channels=hidden_channels,
        hidden_channels_ffn=hidden_channels_ffn,
        num_heads=num_heads,
        num_layers=num_layers,
        kernel_size=kernel_size,
        dropout_p=dropout_p,
        layer_norm_type="2",
        rel_attn_window_size=4,
    )

    # Projects to 2*out_channels: mean and log-variance halves for the posterior.
    self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def __init__(
    self,
    num_chars,
    out_channels,
    hidden_channels,
    hidden_channels_dp,
    encoder_type,
    encoder_params,
    dropout_p_dp=0.1,
    mean_only=False,
    use_prenet=True,
    c_in_channels=0,
):
    """Glow-TTS style text encoder with a selectable encoder backbone.

    Args:
        num_chars (int): vocabulary size for the character embedding.
        out_channels (int): number of output channels of the projection layers.
        hidden_channels (int): number of channels used through the encoder.
        hidden_channels_dp (int): hidden width of the duration predictor.
        encoder_type (str): one of ``rel_pos_transformer``, ``gated_conv``,
            ``residual_conv_bn``, ``time_depth_separable`` (case-insensitive).
        encoder_params (dict): keyword arguments forwarded to the chosen encoder.
        dropout_p_dp (float): dropout rate of the duration predictor.
        mean_only (bool): if True, only a mean projection is created (no scale).
        use_prenet (bool): if True, prepend a prenet before the encoder.
        c_in_channels (int): extra conditioning channels fed to the duration predictor.

    Raises:
        ValueError: if ``encoder_type`` is not one of the supported types.
    """
    super().__init__()
    # class arguments
    self.num_chars = num_chars
    self.out_channels = out_channels
    self.hidden_channels = hidden_channels
    self.hidden_channels_dp = hidden_channels_dp
    self.dropout_p_dp = dropout_p_dp
    self.mean_only = mean_only
    self.use_prenet = use_prenet
    self.c_in_channels = c_in_channels
    self.encoder_type = encoder_type
    # embedding layer, Transformer-style 1/sqrt(d) init
    self.emb = nn.Embedding(num_chars, hidden_channels)
    nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
    # init encoder module
    if encoder_type.lower() == "rel_pos_transformer":
        if use_prenet:
            self.prenet = ResidualConv1dLayerNormBlock(
                hidden_channels, hidden_channels, hidden_channels, kernel_size=5, num_layers=3, dropout_p=0.5
            )
        self.encoder = RelativePositionTransformer(
            hidden_channels, hidden_channels, hidden_channels, **encoder_params
        )
    elif encoder_type.lower() == "gated_conv":
        self.encoder = GatedConvBlock(hidden_channels, **encoder_params)
    elif encoder_type.lower() == "residual_conv_bn":
        if use_prenet:
            self.prenet = nn.Sequential(nn.Conv1d(hidden_channels, hidden_channels, 1), nn.ReLU())
        self.encoder = ResidualConv1dBNBlock(hidden_channels, hidden_channels, hidden_channels, **encoder_params)
        self.postnet = nn.Sequential(
            nn.Conv1d(self.hidden_channels, self.hidden_channels, 1), nn.BatchNorm1d(self.hidden_channels)
        )
    elif encoder_type.lower() == "time_depth_separable":
        if use_prenet:
            self.prenet = ResidualConv1dLayerNormBlock(
                hidden_channels, hidden_channels, hidden_channels, kernel_size=5, num_layers=3, dropout_p=0.5
            )
        self.encoder = TimeDepthSeparableConvBlock(
            hidden_channels, hidden_channels, hidden_channels, **encoder_params
        )
    else:
        # Fixed typo ("Unkown") and include the offending value for debuggability.
        raise ValueError(f" [!] Unknown encoder type: '{encoder_type}'")
    # final projection layers: mean, and (optionally) scale
    self.proj_m = nn.Conv1d(hidden_channels, out_channels, 1)
    if not mean_only:
        self.proj_s = nn.Conv1d(hidden_channels, out_channels, 1)
    # duration predictor, optionally conditioned on extra channels
    self.duration_predictor = DurationPredictor(
        hidden_channels + c_in_channels, hidden_channels_dp, 3, dropout_p_dp
    )