def _convert_type(self, inputs):
    if utils.is_mulaw_quantize(self.hp.input_type):
        inputs = utils.mulaw_quantize(inputs, self.hp.quantize_channels)
        inputs = tf.one_hot(tf.cast(inputs, tf.int32),
                            self.hp.quantize_channels)
    else:
        inputs = tf.expand_dims(inputs, axis=-1)
    return inputs
def __init__(self, hp):
    self.hp = hp
    self.receptive_field = 2**hp.n_layers

    # fft layers
    self.fft_layers = []
    if utils.is_mulaw_quantize(self.hp.input_type):
        pad_value = 128
        self.in_channels = 256
    else:
        pad_value = 0
        self.in_channels = 1
    for idx in range(0, hp.n_layers):
        layer_index = hp.n_layers - idx
        if idx == 0:
            self.fft_layers += [
                FFTLayer(self.in_channels,
                         hp.hidden_channels,
                         layer_index,
                         hp.cin_channels,
                         pad_value,
                         name='fft_layer_{}'.format(idx))
            ]
        else:
            self.fft_layers += [
                FFTLayer(hp.hidden_channels,
                         hp.hidden_channels,
                         layer_index,
                         hp.cin_channels,
                         pad_value,
                         name='fft_layer_{}'.format(idx))
            ]
    self.out_layer = tf.layers.Dense(units=hp.quantize_channels,
                                     name='out_dense')

    # upsampling convolutions for the local condition
    if hp.upsample_conditional_features:
        self.upsample_conv = []
        for i, s in enumerate(hp.upsample_scales):
            convt = ConvTransposed2d(
                1, s, hp.freq_axis_kernel_size,
                padding='same',
                strides=(s, 1),
                scope='local_conditioning_upsample_{}'.format(i + 1))
            self.upsample_conv.append(convt)
    else:
        self.upsample_conv = None

    print('Receptive Field: %i samples' % self.receptive_field)
    print('pad value: {}'.format(pad_value))
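
# For example, with a hypothetical n_layers = 11 the receptive field is
# 2**11 = 2048 samples, i.e. roughly 128 ms of context at a 16 kHz
# sampling rate (2048 / 16000 = 0.128 s).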
def get_one_example(self):
    for meta in self._metadata:
        audio_file = meta[0]
        input_data = np.load(os.path.join(self.data_dir, audio_file))
        if self.use_local:
            mel_file = meta[1]
            local_feature = np.load(os.path.join(self.data_dir, mel_file))
        else:
            local_feature = False
        # ===== To Do: support global conditions =====
        global_feature = False

        # adjust time steps for the local condition
        max_time_step = self._limit_time()
        input_data, local_feature = self._adjust_time_step(
            input_data, local_feature, max_time_step)

        # make sure the target is mu-law encoded
        if utils.is_mulaw_quantize(self._hparams.input_type):
            target_data = input_data
        else:
            target_data = utils.mulaw_quantize(
                input_data, self._hparams.quantize_channels)
        input_length = len(input_data)
        yield input_data, target_data, input_length, local_feature, global_feature
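
# A minimal sketch of how `get_one_example` could feed a tf.data pipeline
# (an illustration, not part of the original code: the feeder object, the
# dtypes, and the mel dimension `hparams.num_mels` are assumptions, and it
# presumes use_local=True with mu-law quantized inputs):
def _make_dataset_sketch(feeder, hparams):
    dataset = tf.data.Dataset.from_generator(
        feeder.get_one_example,
        output_types=(tf.int32, tf.int32, tf.int32, tf.float32, tf.bool),
        output_shapes=([None], [None], [], [None, hparams.num_mels], []))
    # pad variable-length audio and mel sequences within each batch
    dataset = dataset.padded_batch(
        batch_size=hparams.batch_size,
        padded_shapes=([None], [None], [], [None, hparams.num_mels], []))
    return dataset.make_one_shot_iterator().get_next()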
def forward(self, inputs, targets=None, c=None, g=None):
    if g is not None:
        raise NotImplementedError("global condition is not supported yet!")

    # inputs has rank 2: [B T]
    if utils.is_mulaw_quantize(self.hp.input_type):
        inputs = tf.one_hot(tf.cast(inputs, tf.int32),
                            self.hp.quantize_channels)
    else:
        inputs = tf.expand_dims(inputs, axis=-1)
    with tf.control_dependencies([tf.assert_equal(tf.rank(inputs), 3)]):
        outputs = tf.identity(inputs)
    if targets is not None:
        self.targets = tf.cast(targets, tf.int32)

    # check whether the local condition needs to be upsampled
    if c is not None and self.upsample_conv is not None:
        c = tf.expand_dims(c, axis=-1)  # [B T cin_channels 1]
        for transposed_conv in self.upsample_conv:
            c = transposed_conv(c)
        c = tf.squeeze(c, axis=-1)  # [B new_T cin_channels]

    # for training, use the previous samples and the condition of the
    # next sample (shift by one)
    outputs = outputs[:, :-1, :]
    if c is not None:
        c = c[:, 1:, :]
        with tf.control_dependencies(
            [tf.assert_equal(tf.shape(outputs)[1], tf.shape(c)[1])]):
            c = tf.identity(c)

    for layer in self.fft_layers:
        outputs = layer(outputs, c=c)
    outputs = self.out_layer(outputs)
    self.outputs = outputs
    self.log_outputs = tf.argmax(tf.nn.softmax(self.outputs, axis=-1),
                                 axis=-1)
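
# `forward` only builds the logits (self.outputs); the training objective is
# not defined in this snippet. A plausible sketch, assuming the usual
# categorical cross-entropy over mu-law bins: since the inputs were shifted
# by one inside forward, the logits at step t predict the target at t + 1.
def _loss_sketch(model):
    labels = model.targets[:, 1:]  # align targets with the shifted logits
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=model.outputs)
    return tf.reduce_mean(cross_entropy)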
def _process_utterance(out_dir, index, speaker_id, wav_path, text, hparams):
    # Load the audio to a numpy array. Resample if needed
    wav = audio.load_wav(wav_path)
    if hparams.use_injected_noise:
        noise = np.random.normal(0.0, 1.0 / hparams.quantize_channels,
                                 wav.shape)
        wav += noise
    wav, _ = librosa.effects.trim(wav, top_db=20)
    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav: (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant",
                 constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment: ensure the length of the raw audio is a
    # multiple of hop_size so that we can use transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'cmu_arctic-audio-%05d.npy' % index
    mel_filename = 'cmu_arctic-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32),
            allow_pickle=False)

    # Return a tuple describing this training example:
    return audio_filename, mel_filename, timesteps, text, speaker_id
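
# For reference, the mu-law companding used above (`P` is presumably
# nnmnkwii.preprocessing); a minimal numpy sketch under the assumption
# quantize_channels = 256, i.e. mu = 255. Exact rounding behavior may
# differ from the library implementation.
def _mulaw_sketch(x, mu=255):
    # compand x in [-1, 1]: f(x) = sign(x) * ln(1 + mu|x|) / ln(1 + mu)
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def _mulaw_quantize_sketch(x, quantize_channels=256):
    mu = quantize_channels - 1
    y = _mulaw_sketch(x, mu)
    # map [-1, 1] onto integer bins [0, quantize_channels)
    return ((y + 1) / 2 * mu).astype(np.int32)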
def incremental_forward(self, c=None, g=None, test_inputs=None,
                        targets=None):
    if g is not None:
        raise NotImplementedError("global condition is not supported yet!")

    # use zeros as the initial input
    inputs = tf.zeros([1, 1], dtype=tf.float32)
    if utils.is_mulaw_quantize(self.hp.input_type):
        inputs = utils.mulaw_quantize(inputs, self.hp.quantize_channels)
        inputs = tf.one_hot(tf.cast(inputs, tf.int32),
                            self.hp.quantize_channels)
    else:
        inputs = tf.expand_dims(inputs, axis=-1)

    # check whether the condition needs to be upsampled
    if c is not None and self.upsample_conv is not None:
        c = tf.expand_dims(c, axis=-1)  # [B T cin_channels 1]
        for transposed_conv in self.upsample_conv:
            c = transposed_conv(c)
        c = tf.squeeze(c, axis=-1)  # [B new_T cin_channels]

    # apply zero padding to the condition
    if c is not None:
        c_shape = tf.shape(c)
        padding_c = tf.zeros(
            [c_shape[0], self.receptive_field, c_shape[-1]])
        c = tf.concat([padding_c, c], axis=1)

        # create c_buffers
        c_buffers = [
            tf.zeros([1, 2**i // 2 + 1, self.hp.cin_channels])
            for i in range(self.hp.n_layers, 0, -1)
        ]
        synthesis_length = tf.shape(c)[1]

    initial_time = tf.constant(0, dtype=tf.int32)
    initial_outputs_ta = tf.TensorArray(dtype=tf.float32,
                                        size=0,
                                        dynamic_size=True)
    input_buffers = [
        self._convert_type(tf.zeros([1, 2**self.hp.n_layers // 2 + 1]))
    ]
    for i in range(self.hp.n_layers - 1, 0, -1):
        input_buffers.append(
            self._convert_type(tf.zeros([1, 2**i // 2 + 1])))

    def condition(time, unused_initial_input, unused_final_outputs,
                  unused_input_buffers, unused_c_buffers):
        return tf.less(time, synthesis_length)

    def body(time, current_inputs, final_outputs, current_input_buffers,
             current_c_buffers):
        # the condition is shifted by one step
        current_c = c[:, time:time + 1, :] if c is not None else None
        current_outputs = current_inputs
        new_input_buffers = []
        new_c_buffers = []
        for layer, current_input_buffer, current_c_buffer in zip(
                self.fft_layers, current_input_buffers, current_c_buffers):
            current_outputs, out_input_buffer, out_c_buffer = \
                layer.incremental_forward(
                    inputs=current_outputs,
                    c=current_c,
                    input_buffers=current_input_buffer,
                    c_buffers=current_c_buffer,
                )
            new_input_buffers.append(out_input_buffer)
            new_c_buffers.append(out_c_buffer)
        current_outputs = self.out_layer(current_outputs)
        posterior = tf.nn.softmax(tf.reshape(current_outputs, [1, -1]),
                                  axis=-1)
        # sample from the categorical posterior
        # (tf.distributions.Categorical(probs=posterior) would also work)
        sample = tf.py_func(np.random.choice, [
            np.arange(self.hp.quantize_channels), 1, True,
            tf.reshape(posterior, [-1])
        ], tf.int64)
        sample = tf.reshape(sample, [-1])
        # sample = tf.argmax(posterior, axis=-1)  # greedy alternative
        decode_sample = utils.inv_mulaw_quantize(sample,
                                                 self.hp.quantize_channels)
        final_outputs = final_outputs.write(time, decode_sample)
        if utils.is_mulaw_quantize(self.hp.input_type):
            next_sample = tf.one_hot(tf.cast(sample, tf.int32),
                                     self.hp.quantize_channels)
        else:
            next_sample = decode_sample
        next_time = time + 1
        next_inputs = current_inputs[:, 1:, :]
        if test_inputs is not None:
            next_sample = tf.reshape(test_inputs[:, next_time],
                                     [1, 1, self.in_channels])
        else:
            next_sample = tf.reshape(next_sample,
                                     [1, 1, self.in_channels])
        next_inputs = tf.concat(
            [next_inputs, tf.cast(next_sample, tf.float32)], axis=1)
        return (next_time, next_inputs, final_outputs, new_input_buffers,
                new_c_buffers)

    result = tf.while_loop(condition,
                           body,
                           loop_vars=[
                               initial_time, inputs, initial_outputs_ta,
                               input_buffers, c_buffers
                           ],
                           parallel_iterations=32,
                           swap_memory=True)
    outputs_ta = result[2]
    outputs = outputs_ta.stack()
    self.eval_outputs = outputs
    self.eval_targets = utils.inv_mulaw_quantize(
        targets, self.hp.quantize_channels) if targets is not None else None
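
# End-to-end synthesis usage sketch. The class name FFTNet is inferred from
# the fft_layers above; the placeholder shape and the weight-restoring step
# are assumptions, not the original author's script.
def _synthesize_sketch(hparams, mel, checkpoint_path):
    model = FFTNet(hparams)
    c_ph = tf.placeholder(tf.float32, [1, None, hparams.cin_channels])
    model.incremental_forward(c=c_ph)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, checkpoint_path)
        # mel: numpy array [T, cin_channels]; add the batch axis
        return sess.run(model.eval_outputs, feed_dict={c_ph: mel[None]})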