def __init__(
    self,
    channels,
    hidden_channels,
    kernel_size,
    dilation_rate,
    num_layers,
    dropout_p=0,
    cond_channels=0,
    mean_only=False,
):
    """Build the layers of a coupling block that operates on half of the channels.

    Three submodules are created, in this order: a 1x1 convolution ``pre``
    (``channels // 2`` -> ``hidden_channels``), a ``WN`` stack ``enc`` and a
    zero-initialised 1x1 convolution ``post``.

    Args:
        channels: Total number of channels; must be even because only
            ``channels // 2`` of them are fed to ``pre``.
        hidden_channels: Width used by ``pre`` and passed to ``WN`` as both
            its first and second positional argument.
        kernel_size: Forwarded to ``WN``.
        dilation_rate: Forwarded to ``WN``.
        num_layers: Forwarded to ``WN``.
        dropout_p: Forwarded to ``WN`` as ``dropout_p``. Defaults to 0.
        cond_channels: Forwarded to ``WN`` as ``c_in_channels``. Defaults to 0.
        mean_only: When true, ``post`` emits ``channels // 2`` channels
            instead of twice that many. Defaults to False.

    Raises:
        AssertionError: If ``channels`` is not divisible by 2.
    """
    assert channels % 2 == 0, "channels should be divisible by 2"
    super().__init__()

    half = channels // 2
    self.half_channels = half
    self.mean_only = mean_only

    # ``mean_only`` is used arithmetically on purpose: False -> 2x the half
    # channels, True -> 1x.
    # NOTE(review): presumably the two halves are mean and log-scale --
    # confirm against forward().
    post_channels = half * (2 - mean_only)

    # Input projection: half of the channels -> hidden width.
    self.pre = nn.Conv1d(in_channels=half, out_channels=hidden_channels, kernel_size=1)

    # Coupling network.
    self.enc = WN(
        hidden_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        num_layers,
        dropout_p=dropout_p,
        c_in_channels=cond_channels,
    )

    # Output projection. Starting from all-zero weights and bias makes the
    # affine coupling layer do nothing at first, which helps training
    # stability.
    self.post = nn.Conv1d(in_channels=hidden_channels, out_channels=post_channels, kernel_size=1)
    for param in (self.post.weight, self.post.bias):
        param.data.zero_()
def __init__(
    self,
    in_channels: int,
    out_channels: int,
    hidden_channels: int,
    kernel_size: int,
    dilation_rate: int,
    num_layers: int,
    cond_channels=0,
):
    """Set up the Posterior Encoder of the VITS model.

    Data flow the layers are built for::

        x -> conv1x1() -> WaveNet() (non-causal) -> conv1x1() -> split() -> [m, s] -> sample(m, s) -> z

    The constructor stores every hyper-parameter on the instance and then
    creates, in order, ``pre`` (1x1 conv, ``in_channels`` ->
    ``hidden_channels``), ``enc`` (``WN`` stack) and ``proj`` (1x1 conv,
    ``hidden_channels`` -> ``2 * out_channels``).

    Args:
        in_channels (int): Number of input tensor channels.
        out_channels (int): Number of output tensor channels; ``proj``
            produces twice this many so its output can be split in two.
        hidden_channels (int): Number of hidden channels.
        kernel_size (int): Kernel size of the WaveNet convolution layers.
        dilation_rate (int): Dilation rate of the WaveNet layers.
        num_layers (int): Number of the WaveNet layers.
        cond_channels (int, optional): Number of conditioning tensor
            channels, handed to ``WN`` as ``c_in_channels``. Defaults to 0.
    """
    super().__init__()

    # Keep the configuration around as plain attributes.
    self.in_channels, self.out_channels = in_channels, out_channels
    self.hidden_channels = hidden_channels
    self.kernel_size, self.dilation_rate = kernel_size, dilation_rate
    self.num_layers = num_layers
    self.cond_channels = cond_channels

    # Submodules are created in pre -> enc -> proj order; keep it that way so
    # parameter initialisation and state_dict ordering stay the same.
    self.pre = nn.Conv1d(in_channels=in_channels, out_channels=hidden_channels, kernel_size=1)
    self.enc = WN(
        hidden_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        num_layers,
        c_in_channels=cond_channels,
    )
    # Twice ``out_channels`` so the projection can be split into [m, s].
    self.proj = nn.Conv1d(in_channels=hidden_channels, out_channels=out_channels * 2, kernel_size=1)