Example #1
def save_log(sess, global_step, model, plot_dir, wav_dir, hparams, model_name):
	log('\nSaving intermediate states at step {}'.format(global_step))
	idx = 0
	y_hat, y, loss, length, input_mel, upsampled_features = sess.run([model.tower_y_hat_log[0][idx], 
		model.tower_y_log[0][idx], 
		model.loss,
		model.tower_input_lengths[0][idx], 
		model.tower_c[0][idx], model.tower_upsampled_local_features[0][idx]])

	#mask by length
	y_hat[length:] = 0
	y[length:] = 0

	#Make audio and plot paths
	pred_wav_path = os.path.join(wav_dir, 'step-{}-pred.wav'.format(global_step))
	target_wav_path = os.path.join(wav_dir, 'step-{}-real.wav'.format(global_step))
	plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step))
	mel_path = os.path.join(plot_dir, 'step-{}-reconstruction-mel-spectrogram.png'.format(global_step))
	upsampled_path = os.path.join(plot_dir, 'step-{}-upsampled-features.png'.format(global_step))

	#Save audio
	save_wavenet_wav(y_hat, pred_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis)
	save_wavenet_wav(y, target_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis)

	#Save figure
	util.waveplot(plot_path, y_hat, y, hparams, title='{}, {}, step={}, loss={:.5f}'.format(model_name, time_string(), global_step, loss))

	#Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance.
	#Both mels should match on low-frequency information; the wavenet mel should contain more high-frequency detail than the Tacotron mel.
	T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (0, hparams.max_abs_value)
	generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range)
	util.plot_spectrogram(generated_mel, mel_path, title='Local Condition vs Reconst. Mel-Spectrogram, step={}, loss={:.5f}'.format(
		global_step, loss), target_spectrogram=input_mel.T)
	util.plot_spectrogram(upsampled_features.T, upsampled_path, title='Upsampled Local Condition features, step={}, loss={:.5f}'.format(
		global_step, loss), auto_aspect=True)
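
save_wavenet_wav is imported from the project's audio utilities and is not shown in this excerpt. For reference, here is a minimal, self-contained sketch of what such a helper could look like, assuming it undoes pre-emphasis and writes 16-bit PCM with scipy; the normalization constant and the inv_preemphasis filter are assumptions, not the repository's actual code.

import numpy as np
from scipy.io import wavfile
from scipy.signal import lfilter

def inv_preemphasis(wav, k, inv_preemphasize=True):
	#Hypothetical sketch: undo y[t] = x[t] - k * x[t-1] with the inverse IIR filter.
	return lfilter([1], [1, -k], wav) if inv_preemphasize else wav

def save_wavenet_wav(wav, path, sr, inv_preemphasize, k):
	#Hypothetical sketch: invert pre-emphasis, rescale to the int16 range, and write a PCM wav file.
	wav = inv_preemphasis(wav, k, inv_preemphasize)
	wav = wav * (32767 / max(0.01, np.max(np.abs(wav))))
	wavfile.write(path, sr, wav.astype(np.int16))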
Example #2
def eval_step(sess, global_step, model, plot_dir, wav_dir, summary_writer, hparams, model_name):
    '''Evaluate model during training.
    Assumes that model variables are averaged.
    '''
    start_time = time.time()
    y_hat, y_target, loss, input_mel, upsampled_features = sess.run([model.tower_y_hat[0], model.tower_y_target[0],
                                                                     model.eval_loss, model.tower_eval_c[0],
                                                                     model.tower_eval_upsampled_local_features[0]])
    duration = time.time() - start_time
    log('Time Evaluation: Generation of {} audio frames took {:.3f} sec ({:.3f} frames/sec)'.format(
        len(y_target), duration, len(y_target) / duration))

    # Make audio and plot paths
    pred_wav_path = os.path.join(wav_dir, 'step-{}-pred.wav'.format(global_step))
    target_wav_path = os.path.join(wav_dir, 'step-{}-real.wav'.format(global_step))
    plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step))
    mel_path = os.path.join(plot_dir, 'step-{}-reconstruction-mel-spectrogram.png'.format(global_step))
    upsampled_path = os.path.join(plot_dir, 'step-{}-upsampled-features.png'.format(global_step))

    # Save figure
    util.waveplot(plot_path, y_hat, y_target, model._hparams,
                  title='{}, {}, step={}, loss={:.5f}'.format(model_name, time_string(), global_step, loss))
    log('Eval loss for global step {}: {:.3f}'.format(global_step, loss))

    # Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance.
    # Both mels should match on low-frequency information; the wavenet mel should contain more high-frequency detail than the Tacotron mel.
    T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (
        0, hparams.max_abs_value)
    generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range)
    util.plot_spectrogram(generated_mel, mel_path,
                          title='Local Condition vs Reconst. Mel-Spectrogram, step={}, loss={:.5f}'.format(
                              global_step, loss), target_spectrogram=input_mel.T)
    util.plot_spectrogram(upsampled_features.T, upsampled_path,
                          title='Upsampled Local Condition features, step={}, loss={:.5f}'.format(
                              global_step, loss), auto_aspect=True)

    # Save Audio
    save_wavenet_wav(y_hat, pred_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize,
                     k=hparams.preemphasis)
    save_wavenet_wav(y_target, target_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize,
                     k=hparams.preemphasis)

    # Write eval summary to tensorboard
    log('Writing eval summary!')
    add_test_stats(summary_writer, global_step, loss, hparams=hparams)
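
add_test_stats is defined elsewhere in the training script and is not part of this excerpt. A minimal sketch of such a helper, assuming it only records the scalar eval loss as a TensorBoard summary (the tag name and the unused hparams parameter are assumptions):

import tensorflow as tf  # TF1-style API, matching the sess.run usage above

def add_test_stats(summary_writer, global_step, eval_loss, hparams=None):
    # Hypothetical sketch: wrap the eval loss in a scalar summary and flush it to the event file.
    values = [tf.Summary.Value(tag='Wavenet_eval_model/eval_stats/eval_loss', simple_value=eval_loss)]
    summary_writer.add_summary(tf.Summary(value=values), global_step)
    summary_writer.flush()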
Example #3
    def synthesize(self, mel_spectrograms, speaker_ids, basenames, out_dir,
                   log_dir):
        hparams = self._hparams
        local_cond, global_cond = self._check_conditions()

        #Use the debug mels instead when running in debug mode
        if self.synth_debug:
            assert len(hparams.wavenet_debug_mels) == len(
                hparams.wavenet_debug_wavs)
            mel_spectrograms = [
                np.load(mel_file) for mel_file in hparams.wavenet_debug_mels
            ]

        #Get true length of audio to be synthesized: audio_len = mel_len * hop_size
        audio_lengths = [
            len(x) * get_hop_size(self._hparams) for x in mel_spectrograms
        ]

        #Prepare local condition batch
        maxlen = max([len(x) for x in mel_spectrograms])
        #[-max, max] or [0,max]
        T2_output_range = (
            -self._hparams.max_abs_value,
            self._hparams.max_abs_value) if self._hparams.symmetric_mels else (
                0, self._hparams.max_abs_value)

        if self._hparams.clip_for_wavenet:
            mel_spectrograms = [
                np.clip(x, T2_output_range[0], T2_output_range[1])
                for x in mel_spectrograms
            ]

        c_batch = np.stack([
            _pad_inputs(x, maxlen, _pad=T2_output_range[0])
            for x in mel_spectrograms
        ]).astype(np.float32)

        if self._hparams.normalize_for_wavenet:
            #rerange to [0, 1]
            c_batch = _interp(c_batch, T2_output_range).astype(np.float32)

        g = None if speaker_ids is None else np.asarray(
            speaker_ids, dtype=np.int32).reshape(len(c_batch), 1)
        feed_dict = {}

        if local_cond:
            feed_dict[self.local_conditions] = c_batch
        else:
            feed_dict[self.synthesis_length] = 100

        if global_cond:
            feed_dict[self.global_conditions] = g

        if self.synth_debug:
            debug_wavs = hparams.wavenet_debug_wavs
            assert len(debug_wavs) % hparams.wavenet_num_gpus == 0
            test_wavs = [
                np.load(debug_wav).reshape(-1, 1) for debug_wav in debug_wavs
            ]

            #pad wavs to same length
            max_test_len = max([len(x) for x in test_wavs])
            test_wavs = np.stack([
                _pad_inputs(x, max_test_len) for x in test_wavs
            ]).astype(np.float32)

            assert len(test_wavs) == len(debug_wavs)
            feed_dict[self.targets] = test_wavs.reshape(
                len(test_wavs), max_test_len, 1)
            feed_dict[self.input_lengths] = np.asarray([test_wavs.shape[1]])

        #Generate wavs and clip extra padding to keep only the real speech parts
        generated_wavs, upsampled_features = self.session.run(
            [
                self.model.tower_y_hat,
                self.model.tower_synth_upsampled_local_features
            ],
            feed_dict=feed_dict)

        #Linearize outputs (n_gpus -> 1D)
        generated_wavs = [
            wav for gpu_wavs in generated_wavs for wav in gpu_wavs
        ]
        upsampled_features = [
            feat for gpu_feats in upsampled_features for feat in gpu_feats
        ]

        generated_wavs = [
            generated_wav[:length]
            for generated_wav, length in zip(generated_wavs, audio_lengths)
        ]
        upsampled_features = [
            upsampled_feature[:, :length] for upsampled_feature, length in zip(
                upsampled_features, audio_lengths)
        ]

        audio_filenames = []
        for i, (generated_wav, input_mel, upsampled_feature) in enumerate(
                zip(generated_wavs, mel_spectrograms, upsampled_features)):
            #Save wav to disk
            audio_filename = os.path.join(out_dir,
                                          '{}.wav'.format(basenames[i]))
            save_wavenet_wav(generated_wav,
                             audio_filename,
                             sr=hparams.sample_rate,
                             inv_preemphasize=hparams.preemphasize,
                             k=hparams.preemphasis)
            audio_filenames.append(audio_filename)

            #Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance.
            #Both mels should match on low-frequency information; the wavenet mel should contain more high-frequency detail than the Tacotron mel.
            generated_mel = melspectrogram(generated_wav, hparams).T
            util.plot_spectrogram(
                generated_mel,
                os.path.join(
                    log_dir,
                    'wavenet-mel-spectrogram-{}.png'.format(basenames[i])),
                title='Local Condition vs Reconstructed Audio Mel-Spectrogram analysis',
                target_spectrogram=input_mel)
            #Save upsampled features to visualize checkerboard artifacts.
            util.plot_spectrogram(
                upsampled_feature.T,
                os.path.join(
                    log_dir,
                    'wavenet-upsampled_features-{}.png'.format(basenames[i])),
                title='Upsampled Local Condition features',
                auto_aspect=True)

            #Save waveplot to disk
            if log_dir is not None:
                plot_filename = os.path.join(
                    log_dir, 'wavenet-waveplot-{}.png'.format(basenames[i]))
                util.waveplot(plot_filename,
                              generated_wav,
                              None,
                              hparams,
                              title='WaveNet generated Waveform.')

        return audio_filenames
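
The helpers _pad_inputs and _interp used above are not included in this excerpt. The sketch below shows one plausible reading of how they are called: padding a [time, channels] array along the time axis up to maxlen, and linearly reranging values from T2_output_range into [0, 1] (as the 'rerange to [0, 1]' comment suggests). Both are assumptions, not the repository's exact implementations.

import numpy as np

def _pad_inputs(x, maxlen, _pad=0):
    #Hypothetical sketch: pad the time axis of a [time, channels] array with the pad value up to maxlen.
    return np.pad(x, [(0, maxlen - len(x)), (0, 0)], mode='constant', constant_values=_pad)

def _interp(feats, in_range):
    #Hypothetical sketch: linearly rerange values from [in_range[0], in_range[1]] to [0, 1].
    return (feats - in_range[0]) / (in_range[1] - in_range[0])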