Example #1
def save_log(sess, global_step, model, plot_dir, wav_dir, hparams, model_name):
	log('\nSaving intermediate states at step {}'.format(global_step))
	idx = 0
	y_hat, y, loss, length, input_mel, upsampled_features = sess.run([model.tower_y_hat_log[0][idx], 
		model.tower_y_log[0][idx], 
		model.loss,
		model.tower_input_lengths[0][idx], 
		model.tower_c[0][idx], model.tower_upsampled_local_features[0][idx]])

	#mask by length
	y_hat[length:] = 0
	y[length:] = 0

	#Make audio and plot paths
	pred_wav_path = os.path.join(wav_dir, 'step-{}-pred.wav'.format(global_step))
	target_wav_path = os.path.join(wav_dir, 'step-{}-real.wav'.format(global_step))
	plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step))
	mel_path = os.path.join(plot_dir, 'step-{}-reconstruction-mel-spectrogram.png'.format(global_step))
	upsampled_path = os.path.join(plot_dir, 'step-{}-upsampled-features.png'.format(global_step))

	#Save audio
	save_wavenet_wav(y_hat, pred_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis)
	save_wavenet_wav(y, target_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis)

	#Save figure
	util.waveplot(plot_path, y_hat, y, hparams, title='{}, {}, step={}, loss={:.5f}'.format(model_name, time_string(), global_step, loss))

	#Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance
	#Both mels should match on low-frequency information; the WaveNet mel should contain more high-frequency detail than the Tacotron mel.
	T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (0, hparams.max_abs_value)
	generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range)
	util.plot_spectrogram(generated_mel, mel_path, title='Local Condition vs Reconst. Mel-Spectrogram, step={}, loss={:.5f}'.format(
		global_step, loss), target_spectrogram=input_mel.T)
	util.plot_spectrogram(upsampled_features.T, upsampled_path, title='Upsampled Local Condition features, step={}, loss={:.5f}'.format(
		global_step, loss), auto_aspect=True)
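
This example calls an _interp helper that is not shown here; it rescales the mel spectrogram computed from the generated audio into T2_output_range before plotting it against the local-condition mel. A minimal, hypothetical sketch of such a rescaling (the project's actual helper may differ):

import numpy as np

def _interp(feats, target_range):
	#Hypothetical helper: linearly map feats from its own [min, max]
	#into target_range so both spectrograms share one value range.
	lo, hi = float(feats.min()), float(feats.max())
	return np.interp(feats, (lo, hi), target_range)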
Example #2
	def synthesize(self, mel_spectrograms, speaker_ids, basenames, out_dir, log_dir):
		hparams = self._hparams
		local_cond, global_cond = self._check_conditions()

		#Get true length of audio to be synthesized: audio_len = mel_len * hop_size
		audio_lengths = [len(x) * get_hop_size(self._hparams) for x in mel_spectrograms]

		#Prepare local condition batch
		maxlen = max([len(x) for x in mel_spectrograms])
		#[-max, max] or [0,max]
		T2_output_range = (-self._hparams.max_abs_value, self._hparams.max_abs_value) if self._hparams.symmetric_mels else (0, self._hparams.max_abs_value)

		if self._hparams.clip_for_wavenet:
			mel_spectrograms = [np.clip(x, T2_output_range[0], T2_output_range[1]) for x in mel_spectrograms]

		c_batch = np.stack([_pad_inputs(x, maxlen, _pad=T2_output_range[0]) for x in mel_spectrograms]).astype(np.float32)

		if self._hparams.normalize_for_wavenet:
			#rescale to [0, 1]
			c_batch = np.interp(c_batch, T2_output_range, (0, 1))

		g = None if speaker_ids is None else np.asarray(speaker_ids, dtype=np.int32).reshape(len(c_batch), 1)
		feed_dict = {}

		if local_cond:
			feed_dict[self.local_conditions] = c_batch
		else:
			feed_dict[self.synthesis_length] = 100

		if global_cond:
			feed_dict[self.global_conditions] = g

		#Generate wavs and clip extra padding to keep only the real speech parts
		generated_wavs = self.session.run(self.model.y_hat, feed_dict=feed_dict)
		generated_wavs = [generated_wav[:length] for generated_wav, length in zip(generated_wavs, audio_lengths)]

		audio_filenames = []
		for i, generated_wav in enumerate(generated_wavs):
			#Save wav to disk
			audio_filename = os.path.join(out_dir, 'wavenet-audio-{}.wav'.format(basenames[i]))
			save_wavenet_wav(generated_wav, audio_filename, sr=hparams.sample_rate)
			audio_filenames.append(audio_filename)

			#Save waveplot to disk
			if log_dir is not None:
				plot_filename = os.path.join(log_dir, 'wavenet-waveplot-{}.png'.format(basenames[i]))
				util.waveplot(plot_filename, generated_wav, None, hparams)

		return audio_filenames
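
When normalize_for_wavenet is set, the conditioning batch is remapped from T2_output_range to [0, 1] with np.interp. A tiny self-contained check of that mapping; the numbers below are illustrative and assume symmetric mels with max_abs_value = 4:

import numpy as np

#Illustrative values only: symmetric mels in [-4, 4]
T2_output_range = (-4.0, 4.0)
c_batch = np.array([[-4.0, 0.0, 4.0]], dtype=np.float32)

c_batch = np.interp(c_batch, T2_output_range, (0, 1))
print(c_batch)  #[[0.  0.5 1. ]]: -4 maps to 0, 0 to 0.5, 4 to 1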
Example #3
def eval_step(sess, global_step, model, plot_dir, wav_dir, summary_writer, hparams, model_name):
	'''Evaluate model during training.
	Assumes that model variables are averaged.
	'''
	start_time = time.time()
	y_hat, y_target, loss, input_mel, upsampled_features = sess.run([model.tower_y_hat[0], model.tower_y_target[0],
		model.eval_loss, model.tower_eval_c[0], model.tower_eval_upsampled_local_features[0]])
	duration = time.time() - start_time
	log('Time Evaluation: Generation of {} audio frames took {:.3f} sec ({:.3f} frames/sec)'.format(
		len(y_target), duration, len(y_target)/duration))

	#Make audio and plot paths
	pred_wav_path = os.path.join(wav_dir, 'step-{}-pred.wav'.format(global_step))
	target_wav_path = os.path.join(wav_dir, 'step-{}-real.wav'.format(global_step))
	plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step))
	mel_path = os.path.join(plot_dir, 'step-{}-reconstruction-mel-spectrogram.png'.format(global_step))
	upsampled_path = os.path.join(plot_dir, 'step-{}-upsampled-features.png'.format(global_step))

	#Save Audio
	save_wavenet_wav(y_hat, pred_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis)
	save_wavenet_wav(y_target, target_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis)

	#Save figure
	util.waveplot(plot_path, y_hat, y_target, model._hparams, title='{}, {}, step={}, loss={:.5f}'.format(model_name, time_string(), global_step, loss))
	log('Eval loss for global step {}: {:.3f}'.format(global_step, loss))

	#Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance
	#Both mels should match on low-frequency information; the WaveNet mel should contain more high-frequency detail than the Tacotron mel.
	T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (0, hparams.max_abs_value)
	generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range)
	util.plot_spectrogram(generated_mel, mel_path, title='Local Condition vs Reconst. Mel-Spectrogram, step={}, loss={:.5f}'.format(
		global_step, loss), target_spectrogram=input_mel.T)
	util.plot_spectrogram(upsampled_features.T, upsampled_path, title='Upsampled Local Condition features, step={}, loss={:.5f}'.format(
		global_step, loss), auto_aspect=True)

	#Write eval summary to tensorboard
	log('Writing eval summary!')
	add_test_stats(summary_writer, global_step, loss, hparams=hparams)
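
add_test_stats (defined elsewhere) pushes the eval loss to TensorBoard through summary_writer. A minimal sketch of what such a helper could look like with the TF1 summary API; the tag name is an assumption, and the real helper may log additional values:

import tensorflow as tf  #TF1.x, matching the session-based code above

def add_test_stats(summary_writer, global_step, eval_loss, hparams=None):
	#Hypothetical sketch: write the eval loss as a single scalar summary.
	summary = tf.Summary(value=[
		tf.Summary.Value(tag='Wavenet_eval_stats/eval_loss', simple_value=eval_loss)])
	summary_writer.add_summary(summary, global_step)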
Example #4
    def synthesize(self, mel_spectrograms, speaker_ids, basenames, out_dir,
                   log_dir):
        hparams = self._hparams
        local_cond, global_cond = self._check_conditions()

        #Switch mels when debugging
        # if self.synth_debug:
        # 	assert len(hparams.wavenet_debug_mels) == len(hparams.wavenet_debug_wavs)
        # 	mel_spectrograms = [np.load(mel_file) for mel_file in hparams.wavenet_debug_mels]

        #Get true length of audio to be synthesized: audio_len = mel_len * hop_size
        audio_lengths = [
            len(x) * get_hop_size(self._hparams) for x in mel_spectrograms
        ]

        #Prepare local condition batch
        maxlen = max([len(x) for x in mel_spectrograms])
        #[-max, max] or [0,max]
        T2_output_range = (
            -self._hparams.max_abs_value,
            self._hparams.max_abs_value) if self._hparams.symmetric_mels else (
                0, self._hparams.max_abs_value)

        if self._hparams.clip_for_wavenet:
            mel_spectrograms = [
                np.clip(x, T2_output_range[0], T2_output_range[1])
                for x in mel_spectrograms
            ]

        c_batch = np.stack([
            _pad_inputs(x, maxlen, _pad=T2_output_range[0])
            for x in mel_spectrograms
        ]).astype(np.float32)

        if self._hparams.normalize_for_wavenet:
            #rescale to [0, 1]
            c_batch = np.interp(c_batch, T2_output_range, (0, 1))

        g = None if speaker_ids is None else np.asarray(
            speaker_ids, dtype=np.int32).reshape(len(c_batch), 1)
        feed_dict = {}

        if local_cond:
            feed_dict[self.local_conditions] = c_batch
        else:
            feed_dict[self.synthesis_length] = 100

        if global_cond:
            feed_dict[self.global_conditions] = g

        # if self.synth_debug:
        # 	debug_wavs = hparams.wavenet_debug_wavs
        # 	assert len(debug_wavs) % hparams.wavenet_num_gpus == 0
        # 	test_wavs = [np.load(debug_wav).reshape(-1, 1) for debug_wav in debug_wavs]
        #
        # 	#pad wavs to same length
        # 	max_test_len = max([len(x) for x in test_wavs])
        # 	test_wavs = np.stack([_pad_inputs(x, max_test_len) for x in test_wavs]).astype(np.float32)
        #
        # 	assert len(test_wavs) == len(debug_wavs)
        # 	feed_dict[self.targets] = test_wavs.reshape(len(test_wavs), max_test_len, 1)
        # 	feed_dict[self.input_lengths] = np.asarray([test_wavs.shape[1]])

        #Generate wavs and clip extra padding to keep only the real speech parts
        #generated_wavs, upsampled_features = self.session.run([self.model.tower_y_hat, self.model.tower_synth_upsampled_local_features], feed_dict=feed_dict)

        #Linearize outputs (n_gpus -> 1D)
        #generated_wavs = [wav for gpu_wavs in generated_wavs for wav in gpu_wavs]
        #upsampled_features = [feat for gpu_feats in upsampled_features for feat in gpu_feats]

        #generated_wavs = [generated_wav[:length] for generated_wav, length in zip(generated_wavs, audio_lengths)]
        #upsampled_features = [upsampled_feature[:, :length] for upsampled_feature, length in zip(upsampled_features, audio_lengths)]

        generated_wavs = self.session.run(self.model.y_hat,
                                          feed_dict=feed_dict)
        generated_wavs = [
            generated_wav[:length]
            for generated_wav, length in zip(generated_wavs, audio_lengths)
        ]

        audio_filenames = []
        for i, generated_wav in enumerate(generated_wavs):
            #Save wav to disk
            audio_filename = os.path.join(
                out_dir, 'wavenet-audio-{}.wav'.format(basenames[i]))
            save_wavenet_wav(generated_wav,
                             audio_filename,
                             sr=hparams.sample_rate)
            audio_filenames.append(audio_filename)

            #Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance
            #Both mels should match on low-frequency information; the WaveNet mel should contain more high-frequency detail than the Tacotron mel.
            # generated_mel = melspectrogram(generated_wav, hparams).T
            # util.plot_spectrogram(generated_mel, os.path.join(log_dir, 'wavenet-mel-spectrogram-{}.png'.format(basenames[i])),
            # 	title='Local Condition vs Reconstructed Audio Mel-Spectrogram analysis', target_spectrogram=input_mel)
            # #Save upsampled features to visualize checkerboard artifacts.
            # util.plot_spectrogram(upsampled_feature.T, os.path.join(log_dir, 'wavenet-upsampled_features-{}.png'.format(basenames[i])),
            # 	title='Upsampled Local Condition features', auto_aspect=True)

            #Save waveplot to disk
            if log_dir is not None:
                plot_filename = os.path.join(
                    log_dir, 'wavenet-waveplot-{}.png'.format(basenames[i]))
                util.waveplot(plot_filename,
                              generated_wav,
                              None,
                              hparams,
                              title='WaveNet generated Waveform.')

        return audio_filenames
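
Both synthesize examples derive the expected audio length as mel_len * hop_size and later trim the generated waveforms to it. A hypothetical sketch of how get_hop_size might resolve the hop size from hparams (an assumption; the project may compute it differently):

def get_hop_size(hparams):
    #Hypothetical sketch: prefer an explicit hop_size and otherwise derive it
    #from the frame shift in milliseconds and the sample rate.
    hop_size = getattr(hparams, 'hop_size', None)
    if hop_size is None:
        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    return hop_size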