def _clip_quant_scale(x, quant_chann, use_mu_law):
    """Clip a signal, quantize it, and map it back to a scaled float signal.

    Args:
      x: signal tensor; values are clipped to [-1, 1 - 2 / quant_chann].
      quant_chann: number of quantization channels.
      use_mu_law: if true, the quantized values are decoded with
        `utils.inv_mu_law`; otherwise with `utils.inv_cast_quantize`.

    Returns:
      The result of the selected inverse transform applied to the
      quantized signal.
    """
    # Remove the values unseen in data.
    clipped = tf.clip_by_value(x, -1.0, 1.0 - 2.0 / quant_chann)
    quantized = utils.cast_quantize(clipped, quant_chann)

    if use_mu_law:
        # x is supposed to be a mu-law encoded audio signal in [-1, 1).
        return utils.inv_mu_law(quantized)

    # x is supposed to be a real audio signal in [-1, 1).
    return utils.inv_cast_quantize(quantized, quant_chann)
def gauss_sample(gauss_params, quant_chann, use_log_scales=True):
    """Draw a quantized sample from the predicted Gaussian.

    Args:
      gauss_params: network output, turned into a mean and a standard
        deviation by `mean_std_from_out_params`.
      quant_chann: number of quantization channels.
      use_log_scales: forwarded to `mean_std_from_out_params`.

    Returns:
      The sample, clipped to [-1, 1 - 2 / quant_chann] and passed through
      `utils.cast_quantize`.
    """
    loc, scale = mean_std_from_out_params(gauss_params, use_log_scales)
    sample = Normal(loc=loc, scale=scale).sample()
    # Keep the sample inside the range representable after quantization.
    sample = tf.clip_by_value(sample, -1., 1. - 2. / quant_chann)
    return utils.cast_quantize(sample, quant_chann)
def encode_signal(self, inputs):
    """Encode the source waveform into network inputs and training targets.

    Encode the source with 8-bit Mu-Law or just use 16-bit signal,
    depending on `self.use_mu_law`.

    Args:
      inputs: A dict of inputs; only 'wav' is read.

    Returns:
      A dict with:
        'wav_scaled': the float signal fed to the network.
        'real_targets': the same float signal, used as real-valued target.
        'cate_targets': int32 quantized values shifted by quant_chann / 2.
    """
    wav = inputs['wav']
    half_chann = self.quant_chann / 2.

    if self.use_mu_law:
        quantized = utils.mu_law(wav)
        # Rescale the quantized values by half the channel count.
        scaled = tf.cast(quantized, tf.float32) / half_chann
    else:
        quantized = utils.cast_quantize(wav, self.quant_chann)
        scaled = wav

    # Shift the quantized values by half the channel count to build the
    # categorical targets.
    cate_targets = (tf.cast(quantized, tf.int32) +
                    tf.cast(half_chann, tf.int32))

    return {
        'wav_scaled': scaled,
        'real_targets': scaled,
        'cate_targets': cate_targets,
    }
def mol_sample(mol_params, quant_chann, use_log_scales=True):
    """Draw a quantized sample from a mixture of logistics.

    Args:
      mol_params: [batch_size, 1, number of mixture * 3]
      quant_chann: quantization channels (2 ** 8 or 2 ** 16)
      use_log_scales: whether the scale parameters are in log scale or
        linear scale.

    Returns:
      x_quantized: [batch_size, 1], x_quantized is casted to
        [-quant_chann / 2, quant_chann / 2)
    """
    logit_probs, means, scale_params = tf.split(
        mol_params, num_or_size_splits=3, axis=2)
    nr_mix = mol_params.get_shape().as_list()[2] // 3

    # Pick one mixture component per position with the Gumbel-max trick.
    mix_noise = tf.random_uniform(
        tf.shape(logit_probs), minval=1e-5, maxval=1. - 1e-5)
    component = tf.argmax(logit_probs - tf.log(-tf.log(mix_noise)), axis=2)
    sel = tf.one_hot(component, depth=nr_mix, dtype=tf.float32)

    # Keep only the parameters of the selected component.
    loc = tf.reduce_sum(means * sel, axis=2)
    selected_scale = tf.reduce_sum(scale_params * sel, axis=2)
    if use_log_scales:
        scales = tf.exp(tf.clip_by_value(selected_scale, -7.0, 7.0))
    else:
        scales = tf.clip_by_value(
            selected_scale, tf.exp(-7.0), tf.exp(7.0))

    # Inverse-CDF sampling of the logistic distribution.
    logistic_noise = tf.random_uniform(
        tf.shape(loc), minval=1e-5, maxval=1. - 1e-5)
    sample = loc + scales * (
        tf.log(logistic_noise) - tf.log(1. - logistic_noise))

    sample = tf.clip_by_value(sample, -1., 1. - 2. / quant_chann)
    return utils.cast_quantize(sample, quant_chann)
def encode_signal(self, inputs, add_noise=False):
    """Encode the source waveform into network inputs and training targets.

    Encode the source with 8-bit Mu-Law or just use 16-bit signal,
    depending on `self.use_mu_law`.

    Args:
      inputs: A dict of inputs; only 'wav' is read.
      add_noise: if true, Gaussian noise (mean 0.0, stddev 0.1) is added to
        the returned 'wav_scaled' only; the targets stay noise free. Only
        used when the wavenet is trained as a teacher.

    Returns:
      A dict with:
        'wav_scaled': the float signal fed to the network (noisy when
          `add_noise` is set).
        'real_targets': the noise-free float signal.
        'cate_targets': int32 quantized values shifted by quant_chann / 2.
    """
    quant_chann = self.quant_chann
    use_mu_law = self.use_mu_law

    x = inputs['wav']
    if use_mu_law:
        x_quantized = utils.mu_law(x)
        x_scaled = tf.cast(x_quantized, tf.float32) / (quant_chann / 2.)
    else:
        x_quantized = utils.cast_quantize(x, quant_chann)
        x_scaled = x

    # Targets are bound before any noise is added, so they stay clean.
    real_targets = x_scaled
    cate_targets = (tf.cast(x_quantized, tf.int32) +
                    tf.cast(quant_chann / 2., tf.int32))

    if add_noise:
        # A static shape with unknown dimensions (e.g. a variable batch or
        # time axis) cannot be used as a random op shape, so fall back to
        # the dynamic shape in that case only.
        static_shape = x_scaled.get_shape()
        if static_shape.is_fully_defined():
            noise_shape = static_shape
        else:
            noise_shape = tf.shape(x_scaled)
        x_scaled += tf.random_normal(
            shape=noise_shape, mean=0.0, stddev=0.1)

    return {
        'wav_scaled': x_scaled,
        'real_targets': real_targets,
        'cate_targets': cate_targets,
    }
def mol_sample_(mol_params, quant_chann, use_log_scales=True):
    """Return the quantized mean of the highest-logit mixture component.

    Unlike a random draw, this uses a plain argmax over the mixture logits
    and the mean of the chosen component; no random op is created.

    Args:
      mol_params: mixture parameters, split into three equal parts
        (logits, means, scales) along axis 2.
      quant_chann: number of quantization channels.
      use_log_scales: unused here; the scale parameters are not read.

    Returns:
      The selected mean, clipped to [-1, 1 - 2 / quant_chann] and passed
      through `utils.cast_quantize`.
    """
    logit_probs, means, _ = tf.split(
        mol_params, num_or_size_splits=3, axis=2)
    nr_mix = mol_params.get_shape().as_list()[2] // 3

    best_component = tf.argmax(logit_probs, axis=2)
    sel = tf.one_hot(best_component, depth=nr_mix, dtype=tf.float32)

    chosen_mean = tf.reduce_sum(means * sel, axis=2)
    chosen_mean = tf.clip_by_value(chosen_mean, -1., 1. - 2. / quant_chann)
    return utils.cast_quantize(chosen_mean, quant_chann)
def mog_sample(mog_params, quant_chann, use_log_scales=True):
    """Draw a quantized sample from the predicted mixture distribution.

    Args:
      mog_params: network output, turned into a distribution object by
        `mog_from_out_params`.
      quant_chann: number of quantization channels.
      use_log_scales: forwarded to `mog_from_out_params`.

    Returns:
      The sample, clipped to [-1, 1 - 2 / quant_chann] and passed through
      `utils.cast_quantize`.
    """
    mixture = mog_from_out_params(mog_params, use_log_scales)
    sample = mixture.sample()
    # Keep the sample inside the range representable after quantization.
    sample = tf.clip_by_value(sample, -1., 1. - 2. / quant_chann)
    return utils.cast_quantize(sample, quant_chann)
def feed_forward(self, inputs):
    """Build the mel-conditioned WaveNet graph for this configuration.

    Args:
      inputs: A dict of inputs. Both 'mel' and 'wav' are read.

    Returns:
      A dict of outputs with:
        'real_targets': the scaled float waveform.
        'cate_targets': int32 quantized waveform shifted by quant_chann / 2.
        'encoding': the 'encoding' output of `self.deconv_stack` for 'mel'.
        'out_params': output of the final 'out2' convolution.
    """
    hparams = self.hparams
    num_stages = hparams.num_stages
    num_layers = hparams.num_layers
    filter_length = hparams.filter_length
    width = hparams.width
    skip_width = hparams.skip_width
    half_chann = self.quant_chann / 2.

    ###
    # The Transpose Convolution Stack for mel feature.
    ###
    # wavenet inputs <- trans_conv (l2, s2) <- trans_conv (l1, s1) <- mel_ceps
    # win_len: l1 * s2 + (l2 - s2); win_shift: s1 * s2
    # (l1, s1) = (40, 10), (l2, s2) = (80, 20) is a proper configuration.
    # it is almost consistent with mel analysis frame shift (200) and
    # frame length (800).
    mel_en = self.deconv_stack({'mel': inputs['mel']})['encoding']

    ###
    # Encode the source with 8-bit Mu-Law or just use 16-bit signal.
    ###
    wav = inputs['wav']
    if self.use_mu_law:
        wav_quantized = utils.mu_law(wav)
        wav_scaled = tf.cast(wav_quantized, tf.float32) / half_chann
    else:
        wav_quantized = utils.cast_quantize(wav, self.quant_chann)
        wav_scaled = wav
    real_targets = wav_scaled
    cate_targets = (tf.cast(wav_quantized, tf.int32) +
                    tf.cast(half_chann, tf.int32))
    # Add a trailing axis before feeding the convolution stack.
    net_input = tf.expand_dims(wav_scaled, 2)

    ###
    # The WaveNet Decoder.
    ###
    residual = masked.shift_right(net_input)
    residual = masked.conv1d(residual,
                             num_filters=width,
                             filter_length=filter_length,
                             name='startconv')

    # Set up skip connections.
    skip = masked.conv1d(residual,
                         num_filters=skip_width,
                         filter_length=1,
                         name='skip_start')

    # Residual blocks with skip connections.
    for layer_idx in range(num_layers):
        layer_no = layer_idx + 1
        # Dilation doubles within a stage and restarts every num_stages.
        dilation = 2 ** (layer_idx % num_stages)

        gate_in = masked.conv1d(residual,
                                num_filters=2 * width,
                                filter_length=filter_length,
                                dilation=dilation,
                                name='dilated_conv_%d' % layer_no)
        mel_cond = masked.conv1d(mel_en,
                                 num_filters=2 * width,
                                 filter_length=1,
                                 name='mel_cond_%d' % layer_no)
        gate_in = _condition(gate_in, mel_cond)

        # Gated activation: first half of the channels goes through a
        # sigmoid, second half through a tanh.
        num_channels = gate_in.get_shape().as_list()[2]
        assert num_channels % 2 == 0
        half = num_channels // 2
        gate = tf.sigmoid(gate_in[:, :, :half])
        content = tf.tanh(gate_in[:, :, half:])
        gated = gate * content

        residual = residual + masked.conv1d(gated,
                                            num_filters=width,
                                            filter_length=1,
                                            name='res_%d' % layer_no)
        skip = skip + masked.conv1d(gated,
                                    num_filters=skip_width,
                                    filter_length=1,
                                    name='skip_%d' % layer_no)

    skip = tf.nn.relu(skip)
    skip = masked.conv1d(skip,
                         num_filters=skip_width,
                         filter_length=1,
                         name='out1')
    out_cond = masked.conv1d(mel_en,
                             num_filters=skip_width,
                             filter_length=1,
                             name='mel_cond_out1')
    skip = _condition(skip, out_cond)
    skip = tf.nn.relu(skip)

    # when using mol loss, the model always predicts log_scale, the
    # initializer makes the log_scale in a reasonable small range to speed
    # up convergence.
    if self.loss_type == 'mol':
        final_kernel_init = tf.truncated_normal_initializer(0.0, 0.01)
    else:
        final_kernel_init = tf.uniform_unit_scaling_initializer(1.0)

    out_params = masked.conv1d(skip,
                               num_filters=self.out_width,
                               filter_length=1,
                               name='out2',
                               kernel_initializer=final_kernel_init)

    return {
        'real_targets': real_targets,
        'cate_targets': cate_targets,
        'encoding': mel_en,
        'out_params': out_params,
    }