def __init__(self, n_cond, n_res, n_dil, n_skp, stride, dil, filter_sz=2, bias=True, parent_rf=None, name=None):
    """Gated residual unit with local conditioning, dilated in time.

    filter_sz: number of elements in the dilated kernels
    n_cond:    channels of the local-conditioning vectors
    n_res:     residual channels
    n_dil:     output channels of the dilated kernels
    n_skp:     channels emitted to the skip connections
    dil:       dilation factor of the two main convolutions
    """
    super(GatedResidualCondConv, self).__init__()
    conv1d = nn.Conv1d
    # Two parallel dilated convolutions over the residual input
    # (the "signal" and "gate" paths).
    self.conv_signal = conv1d(n_res, n_dil, filter_sz, dilation=dil, bias=bias)
    self.conv_gate = conv1d(n_res, n_dil, filter_sz, dilation=dil, bias=bias)
    # 1x1 projections that inject the conditioning input into each path.
    self.proj_signal = conv1d(n_cond, n_dil, kernel_size=1, bias=False)
    self.proj_gate = conv1d(n_cond, n_dil, kernel_size=1, bias=False)
    # 1x1 projections back out to residual- and skip-channel widths.
    self.dil_res = conv1d(n_dil, n_res, kernel_size=1, bias=False)
    self.dil_skp = conv1d(n_dil, n_skp, kernel_size=1, bias=False)
    # The dilated autoregressive convolution produces its output at the
    # right-most position of the receptive field; within a stack of these
    # units, outputs are right-aligned (at the very end of the stack the
    # output corresponds to the position just after the field).
    span = (filter_sz - 1) * dil + 1  # temporal extent of the dilated kernel
    self.rf = rfield.Rfield(filter_info=(span - 1, 0), parent=parent_rf, name=name)
    # Filled in later (by init_bound_rfs) once the whole stack is built.
    self.beg_rf = None
    self.end_rf = None
    self.apply(netmisc.xavier_init)
def __init__(self, n_in_chan, n_out_chan, filter_sz, stride=1, do_res=True, parent_rf=None, name=None):
    """1-D convolution followed by ReLU, optionally with a residual add.

    n_in_chan:  input channel count
    n_out_chan: output channel count
    filter_sz:  convolution kernel size
    stride:     convolution stride; must be 1 when do_res is True
    do_res:     whether a residual connection wraps the conv+ReLU
    parent_rf / name: receptive-field bookkeeping (rfield.Rfield)

    Raises:
        ValueError: if do_res is True and stride != 1.
    """
    super(ConvReLURes, self).__init__()
    self.do_res = do_res
    if self.do_res and stride != 1:
        # A residual add needs output positions aligned one-to-one with
        # input positions, which only holds for unit stride.
        raise ValueError('Stride must be 1 for residually connected convolution')
    self.n_in = n_in_chan
    self.n_out = n_out_chan
    self.conv = nn.Conv1d(n_in_chan, n_out_chan, filter_sz, stride, padding=0, bias=False)
    self.relu = nn.ReLU()
    self.rf = rfield.Rfield(filter_info=filter_sz, stride=stride, parent=parent_rf, name=name)
    netmisc.xavier_init(self.conv)
def __init__(self, sample_rate=16000, win_sz=400, hop_sz=160, n_mels=80, n_mfcc=13, name=None):
    """Record framing parameters and the matching receptive field.

    sample_rate: audio sample rate in Hz
    win_sz:      analysis window size in samples
    hop_sz:      hop between successive windows in samples
    n_mels:      number of mel bands
    n_mfcc:      number of MFCC coefficients
    """
    self.sample_rate = sample_rate
    self.window_sz = win_sz
    self.hop_sz = hop_sz
    self.n_mels = n_mels
    self.n_mfcc = n_mfcc
    # presumably MFCCs plus delta and delta-delta features — TODO confirm
    self.n_out = 3 * n_mfcc
    # One output frame per hop, each covering one analysis window.
    self.rf = rfield.Rfield(filter_info=win_sz, stride=hop_sz, parent=None, name=name)
def __init__(self, n_chan, filter_sz, stride, parent_rf, name=None):
    """Transposed-convolution upsampler that preserves channel count."""
    super(Upsampling, self).__init__()
    # See upsampling_notes.txt:
    #   padding     = filter_sz - stride
    #   left_offset = left_wing_sz - end_padding
    pad_end = stride - 1
    self.rf = rfield.Rfield(filter_info=filter_sz, stride=stride,
            padding=(pad_end, pad_end), is_downsample=False,
            parent=parent_rf, name=name)
    tconv_padding = filter_sz - stride
    self.tconv = nn.ConvTranspose1d(n_chan, n_chan, filter_sz, stride,
            padding=tconv_padding)
def __init__(self, filter_sz, n_lc_in, n_lc_out, lc_upsample_filt_sizes, lc_upsample_strides, n_res, n_dil, n_skp, n_post, n_quant, n_blocks, n_block_layers, jitter_prob, n_speakers, n_global_embed, bias=True, parent_rf=None):
    # Build the full WaveNet: jitter + conv over local conditioning,
    # a chain of upsampling stages, a stack of gated residual dilated
    # convolutions, and the 1x1 post-processing layers, while threading
    # a receptive-field (Rfield) chain through every stage via cur_rf.
    super(WaveNet, self).__init__()
    self.n_blocks = n_blocks
    self.n_block_layers = n_block_layers
    self.n_quant = n_quant
    self.quant_onehot = None
    self.bias = bias
    self.jitter = Jitter(jitter_prob)
    post_jitter_filt_sz = 3
    # NOTE(review): lc_input_stepsize is computed but not used in this
    # constructor — confirm whether it is needed or can be removed.
    lc_input_stepsize = np_prod(lc_upsample_strides)
    lc_conv_name = 'LC_Conv(filter_size={})'.format(post_jitter_filt_sz)
    # Convolution over the (jittered) local-conditioning vectors.
    self.lc_conv = Conv1dWrap(n_lc_in, n_lc_out, kernel_size=post_jitter_filt_sz, stride=1, bias=self.bias)
    cur_rf = rfield.Rfield(filter_info=post_jitter_filt_sz, stride=1, parent=parent_rf, name=lc_conv_name)
    self.beg_rf = cur_rf
    # This RF is the first processing of the local conditioning after the
    # Jitter. It is the starting point for the commitment loss aggregation
    self.pre_upsample_rf = cur_rf
    self.lc_upsample = nn.Sequential()
    # WaveNet is a stand-alone model, so parent_rf is None
    # The Autoencoder model in model.py will link parent_rfs together.
    # Chain of upsampling stages; each stage's rf becomes the parent of
    # the next, so cur_rf always points at the end of the chain.
    for i, (filt_sz, stride) in enumerate(zip(lc_upsample_filt_sizes, lc_upsample_strides)):
        name = 'Upsampling_{}(filter_sz={}, stride={})'.format(i, filt_sz, stride)
        mod = Upsampling(n_lc_out, filt_sz, stride, cur_rf, name=name)
        self.lc_upsample.add_module(str(i), mod)
        cur_rf = mod.rf
    # This rf describes the bounds of the input wav corresponding to the
    # local conditioning vectors
    self.last_upsample_rf = cur_rf
    self.cond = Conditioning(n_speakers, n_global_embed)
    # 1x1 projection from one-hot quantization channels to residual width.
    self.base_layer = Conv1dWrap(n_quant, n_res, kernel_size=1, stride=1, dilation=1, bias=self.bias)
    self.conv_layers = nn.ModuleList()
    # Conditioning channels = local conditioning + global speaker embedding.
    n_cond = n_lc_out + n_global_embed
    # n_blocks blocks of n_block_layers gated residual units; dilation
    # doubles with each layer within a block (1, 2, 4, ...).
    for b in range(self.n_blocks):
        for bl in range(self.n_block_layers):
            dil = 2**bl
            name = 'GRCC_{},{}(dil={})'.format(b, bl, dil)
            grc = GatedResidualCondConv(n_cond, n_res, n_dil, n_skp, 1, dil, filter_sz, bias, cur_rf, name)
            self.conv_layers.append(grc)
            cur_rf = grc.rf
    self.last_grcc_rf = cur_rf
    # Each module in the stack needs to know the dimensions of
    # the input and output of the overall stack, in order to trim
    # residual connections
    beg_grcc_rf = self.conv_layers[0].rf
    end_grcc_rf = self.conv_layers[-1].rf
    for mod in self.conv_layers.children():
        mod.init_bound_rfs(beg_grcc_rf, end_grcc_rf)
    # Post-processing: ReLU -> 1x1 -> 1x1 -> log-softmax over quant bins.
    self.relu = nn.ReLU()
    self.post1 = Conv1dWrap(n_skp, n_post, 1, bias=bias)
    self.post2 = Conv1dWrap(n_post, n_quant, 1, bias=bias)
    self.logsoftmax = nn.LogSoftmax(1) # (B, Q, N)
    # Overall receptive field of the entire network.
    self.rf = cur_rf