def save_current_model(args, checkpoint_path, global_step, hparams, loss, model, plot_dir, saver, sess, step, wav_dir):
    """Checkpoint the model and dump debug artifacts for the current step.

    Saves the TF checkpoint, then for the first sample of the current batch:
    Griffin-Lim inverted wavs (linear and mel), real-vs-predicted spectrogram
    plots, and one alignment plot per attention map.
    """
    # Persist weights together with the current global step.
    saver.save(sess, checkpoint_path, global_step=global_step)
    log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..')

    # Fetch one sample (batch index 0) of predictions and ground-truth targets.
    (seq_in, mel_pred, lin_pred, attn_sample,
     mel_target, tgt_len, lin_target) = sess.run([
        model.inputs[0],
        model.post_net_predictions[0],
        model.mag_pred[0],
        model.alignments[0],
        model.targets_mel[0],
        model.targets_length[0],
        model.targets_mag[0],
    ])
    align_imgs, align_names = get_alignments(attn_sample)

    # Shared plot-title template (model name, timestamp, step, loss).
    title_fmt = '{}, {}, step={}, loss={:.5f}'

    # Audible debug output: Griffin-Lim inversion of the predicted linear spectrogram.
    waveform = audio.inv_linear_spectrogram(lin_pred.T, hparams)
    audio.save_wav(waveform,
                   os.path.join(wav_dir, '{}-linear.wav'.format(step)),
                   sr=hparams.sample_rate)

    # Real vs. predicted linear spectrogram, for visual control.
    plot.plot_spectrogram(
        lin_pred,
        os.path.join(plot_dir, '{}-linear-spectrogram.png'.format(step)),
        title=title_fmt.format(args.model, time_string(), step, loss),
        target_spectrogram=lin_target,
        max_len=tgt_len,
        auto_aspect=True)

    # Audible debug output: Griffin-Lim inversion of the predicted mel spectrogram.
    waveform = audio.inv_mel_spectrogram(mel_pred.T, hparams)
    audio.save_wav(waveform,
                   os.path.join(wav_dir, '{}-mel.wav'.format(step)),
                   sr=hparams.sample_rate)

    # One alignment plot per attention map returned by get_alignments.
    for idx, align_img in enumerate(align_imgs):
        plot.plot_alignment(
            align_img,
            os.path.join(plot_dir, '{}_{}-align.png'.format(step, align_names[idx])),
            title=title_fmt.format(args.model, time_string(), step, loss),
            max_len=tgt_len // hparams.reduction_factor)

    # Real vs. predicted mel spectrogram, for visual control.
    plot.plot_spectrogram(
        mel_pred,
        os.path.join(plot_dir, '{}-mel-spectrogram.png'.format(step)),
        title=title_fmt.format(args.model, time_string(), step, loss),
        target_spectrogram=mel_target,
        max_len=tgt_len)

    log('Input at step {}: {}'.format(step, sequence_to_text(seq_in)))
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None,
                              end_of_sentence=None,
                              pre_word_num=0,
                              post_word_num=0,
                              pre_surplus_idx=0,
                              post_surplus_idx=1,
                              use_short_concat=False,
                              use_manual_attention=False,
                              save_alignment=False,
                              librosa_trim=False,
                              attention_trim=False,
                              time_str=None,
                              isKorean=True):
    """Plot the attention alignment, optionally trim the synthesized spectrogram,
    and save (or return) the Griffin-Lim inverted audio.

    `args` is a tuple ``(idx, (wav, alignment, path, text, sequence, mel))``
    where ``wav`` is a linear spectrogram.  Returns True when the audio was
    written to disk, otherwise the encoded wav bytes.
    """
    idx, (wav, alignment, path, text, sequence, mel) = args

    # Decide where the alignment plot goes; None disables plotting.
    if base_path:
        plot_path = "{}/{}.png".format(base_path, get_time())
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None

    # Guard on plot_path: add_postfix(None, ...) would raise when neither
    # base_path nor path was given.
    if use_manual_attention and plot_path:
        plot_path = add_postfix(plot_path, "manual")
    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)

    if use_short_concat:
        wav = short_concat(wav, alignment, text, start_of_sentence,
                           end_of_sentence, pre_word_num, post_word_num,
                           pre_surplus_idx, post_surplus_idx)

    if attention_trim and end_of_sentence:
        # Once attention has reached the last input token, drop the frames
        # synthesized after it (with a tolerance of up to max_counter frames
        # still attending to the final token).
        end_idx_counter = 0
        attention_argmax = alignment.argmax(0)
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        max_counter = min((attention_argmax == end_idx).sum(), 5)
        for jdx, attend_idx in enumerate(attention_argmax):
            if len(attention_argmax) > jdx + 1:
                if attend_idx == end_idx:
                    end_idx_counter += 1
                if attend_idx == end_idx and attention_argmax[jdx + 1] > end_idx:
                    break
                if end_idx_counter >= max_counter:
                    break
            else:
                break
        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]
        mel = mel[:spec_end_idx]

    audio_out = inv_linear_spectrogram(wav.T, hparams)

    if librosa_trim and end_of_sentence:
        # Only the trim indices are needed; the trimmed signal itself is unused.
        _, index = librosa.effects.trim(audio_out,
                                        frame_length=5120,
                                        hop_length=256,
                                        top_db=50)
        audio_out = audio_out[:index[-1]]
        mel = mel[:index[-1] // hparams.hop_size]

    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)

    if path or base_path:
        if path:
            current_path = add_postfix(path, idx)
        elif base_path:
            current_path = plot_path.replace(".png", ".wav")
        save_wav(audio_out, current_path, hparams.sample_rate)
        # Also keep the (possibly trimmed) mel next to the wav.  #hccho
        mel_path = current_path.replace(".wav", ".npy")
        np.save(mel_path, mel)
        return True
    else:
        # No target path: hand the encoded wav bytes back to the caller.
        io_out = io.BytesIO()
        save_wav(audio_out, io_out, hparams.sample_rate)
        return io_out.getvalue()
# -*- coding: utf-8 -*-
"""Reconstruct wav files from saved linear and mel spectrogram .npy dumps via Griffin-Lim."""
import os

import numpy as np

from utils import audio
from hparams import hparams as hps

linear_path = './data/linear-000001.npy'
# File stem (e.g. 'linear-000001'), used to name the output wav.
linear_name = os.path.splitext(os.path.basename(linear_path))[0]
linear_p = np.load(linear_path)

mel_path = './data/mel-000001.npy'
mel_name = os.path.splitext(os.path.basename(mel_path))[0]
mel_p = np.load(mel_path)

# Invert the linear spectrogram and save the waveform.
wav = audio.inv_linear_spectrogram(linear_p.T, hps)
audio.save_wav(wav, os.path.join("./data", "{}.wav".format(linear_name)), hps)

# Invert the mel spectrogram and save the waveform.
wav = audio.inv_mel_spectrogram(mel_p.T, hps)
audio.save_wav(wav, os.path.join("./data", "{}.wav".format(mel_name)), hps)
def run_eval(args, eval_dir, eval_model, eval_plot_dir, eval_wav_dir, feeder, hparams, sess, step, summary_writer):
    """Run the evaluation loop, average the losses over all test batches, and
    save eval wavs, spectrogram/alignment plots, and TensorBoard summaries.
    """
    # Run eval and save eval stats
    log('\nRunning evaluation at step {}'.format(step))
    # Loss accumulators, averaged over `count` batches at the end.
    sum_eval_loss = 0.0
    sum_mel_loss = 0.0
    sum_stop_token_loss = 0.0
    sum_linear_loss = 0.0
    count = 0.0
    # These keep the LAST batch's first sample; they are what gets plotted
    # and inverted to audio below.
    mel_p = None
    mel_t = None
    t_len = None
    attention_mask_sample = None
    lin_p = None
    lin_t = None
    for _ in tqdm(range(feeder.test_steps)):
        test_eloss, test_mel_loss, test_stop_token_loss, test_linear_loss, mel_p, mel_t, t_len, attention_mask_sample, lin_p, lin_t = sess.run(
            [
                eval_model.loss,
                eval_model.mel_loss,
                eval_model.stop_token_loss,
                eval_model.linear_loss,
                eval_model.post_net_predictions[0],
                eval_model.targets_mel[0],
                eval_model.targets_length[0],
                eval_model.alignments[0],
                eval_model.mag_pred[0],
                eval_model.targets_mag[0],
            ])
        sum_eval_loss += test_eloss
        sum_mel_loss += test_mel_loss
        sum_stop_token_loss += test_stop_token_loss
        sum_linear_loss += test_linear_loss
        count += 1.0
    # Griffin-Lim inversion of the last sample's predicted linear spectrogram.
    # NOTE(review): crashes if feeder.test_steps == 0 (lin_p stays None) — confirm
    # the feeder always reports at least one test step.
    wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
    audio.save_wav(wav,
                   os.path.join(eval_wav_dir, '{}-eval-linear.wav'.format(step)),
                   sr=hparams.sample_rate)
    # Average the accumulated losses (guard against division by zero).
    if count > 0.0:
        eval_loss = sum_eval_loss / count
        mel_loss = sum_mel_loss / count
        stop_token_loss = sum_stop_token_loss / count
        linear_loss = sum_linear_loss / count
    else:
        eval_loss = sum_eval_loss
        mel_loss = sum_mel_loss
        stop_token_loss = sum_stop_token_loss
        linear_loss = sum_linear_loss
    log('Saving eval log to {}..'.format(eval_dir))
    # Save some log to monitor model improvement on same unseen sequence
    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
    audio.save_wav(wav,
                   os.path.join(eval_wav_dir, '{}-eval-mel.wav'.format(step)),
                   sr=hparams.sample_rate)
    # One alignment plot per attention map of the sampled batch item.
    alignments, alignment_titles = get_alignments(attention_mask_sample)
    for i in range(len(alignments)):
        plot.plot_alignment(alignments[i],
                            os.path.join(eval_plot_dir,
                                         '{}_{}-eval-align.png'.format(step, alignment_titles[i])),
                            title='{}, {}, step={}, loss={:.5f}'.format(
                                args.model, time_string(), step, eval_loss),
                            max_len=t_len // hparams.reduction_factor)
    # Real vs. predicted mel spectrogram plot.
    plot.plot_spectrogram(
        mel_p,
        os.path.join(eval_plot_dir, '{}-eval-mel-spectrogram.png'.format(step)),
        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss),
        target_spectrogram=mel_t,
        max_len=t_len)
    # Real vs. predicted linear spectrogram plot.
    plot.plot_spectrogram(
        lin_p,
        os.path.join(eval_plot_dir, '{}-eval-linear-spectrogram.png'.format(step)),
        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss),
        target_spectrogram=lin_t,
        max_len=t_len,
        auto_aspect=True)
    log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss))
    log('Writing eval summary!')
    # Push averaged losses to TensorBoard.
    add_eval_stats(summary_writer, step, linear_loss, mel_loss, stop_token_loss, eval_loss)
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None,
                              end_of_sentence=None,
                              pre_word_num=0,
                              post_word_num=0,
                              pre_surplus_idx=0,
                              post_surplus_idx=1,
                              use_short_concat=False,
                              save_alignment=False,
                              librosa_trim=False,
                              attention_trim=False,
                              time_str=None,
                              isKorean=True):
    """Plot the attention alignment, optionally trim the synthesized spectrogram,
    and save the Griffin-Lim inverted audio.

    `args` is a tuple ``(idx, (wav, alignment, path, text, sequence, mel))``
    where ``wav`` is a linear spectrogram.  Returns the raw audio array.
    """
    idx, (wav, alignment, path, text, sequence, mel) = args

    # Decide where the alignment plot goes; None disables plotting.
    if base_path:
        plot_path = "{}/{}.png".format(base_path, get_time())
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None
    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)

    if use_short_concat:
        wav = short_concat(wav, alignment, text, start_of_sentence,
                           end_of_sentence, pre_word_num, post_word_num,
                           pre_surplus_idx, post_surplus_idx)

    if attention_trim and end_of_sentence:
        # If attention has reached the end of the text, discard the frames
        # synthesized after it.
        end_idx_counter = 0
        # alignment: (text length(encoder), target length(decoder))
        # ==> argmax over axis 0 gives, per decoder frame, the attended text index.
        attention_argmax = alignment.argmax(0)
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        # 2020-06-12: an earlier version capped this at 5 frames.  Korean tends
        # to lengthen the final sound, so instead keep every frame that actually
        # attends to the last token (why the original capped at 5 is unknown).
        max_counter = (attention_argmax == end_idx).sum()
        for jdx, attend_idx in enumerate(attention_argmax):
            if len(attention_argmax) > jdx + 1:
                if attend_idx == end_idx:
                    end_idx_counter += 1
                if attend_idx == end_idx and attention_argmax[jdx + 1] > end_idx:
                    break
                if end_idx_counter >= max_counter:
                    break
            else:
                break
        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]
        mel = mel[:spec_end_idx]

    audio_out = inv_linear_spectrogram(wav.T, hparams)

    if librosa_trim and end_of_sentence:
        # Only the trim indices are needed; the trimmed signal itself is unused.
        _, index = librosa.effects.trim(audio_out,
                                        frame_length=5120,
                                        hop_length=256,
                                        top_db=50)
        audio_out = audio_out[:index[-1]]
        mel = mel[:index[-1] // hparams.hop_size]

    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)

    if path or base_path:
        if path:
            current_path = add_postfix(path, idx)
        elif base_path:
            current_path = plot_path.replace(".png", ".wav")
        save_wav(audio_out, current_path, hparams.sample_rate)
        # Also keep the (possibly trimmed) mel next to the wav.  #hccho
        mel_path = current_path.replace(".wav", ".npy")
        np.save(mel_path, mel)
        return audio_out
    else:
        # NOTE(review): the encoded wav bytes written to io_out are discarded
        # and the raw audio array is returned instead; a sibling variant of
        # this function returns io_out.getvalue() — confirm which is intended.
        io_out = io.BytesIO()
        save_wav(audio_out, io_out, hparams.sample_rate)
        return audio_out
def synthesize(self, texts, basenames, log_dir, mel_filenames):
    """Synthesize audio for `texts`.

    When `basenames` is None, play the first synthesized sample through the
    OS audio player; otherwise save wavs and plots under `log_dir`.

    NOTE(review): pads `texts`/`basenames`/`mel_filenames` IN PLACE to a
    multiple of synthesis_batch_size — callers' lists are mutated.
    """
    hparams = self._hparams
    # Repeat last sample until number of samples is dividable by the number of GPUs (last run scenario)
    while len(texts) % hparams.synthesis_batch_size != 0:
        texts.append(texts[-1])
        basenames.append(basenames[-1])
        if mel_filenames is not None:
            mel_filenames.append(mel_filenames[-1])
    # Encode texts to integer id sequences and pad them to a common length.
    sequences = [np.asarray(text_to_sequence(text)) for text in texts]
    input_lengths = [len(seq) for seq in sequences]
    seqs, max_seq_len = self._prepare_inputs(sequences)
    feed_dict = {
        self.inputs: seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32)
    }
    linears, mels, alignments, audio_length = self.session.run(
        [self.linear_outputs, self.mel_outputs, self.alignments[0], self.audio_length],
        feed_dict=feed_dict)
    # Natural batch synthesis
    # Get Mel/Linear lengths for the entire batch from stop_tokens predictions
    target_lengths = audio_length
    if basenames is None:
        # Generate wav and read it
        wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
        audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way
        if platform.system() == 'Linux':
            # Linux wav reader
            os.system('aplay temp.wav')
        elif platform.system() == 'Windows':
            # windows wav reader
            os.system('start /min mplay32 /play /close temp.wav')
        else:
            raise RuntimeError(
                'Your OS type is not supported yet, please add it to "centaur/synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!')
        return
    for i, mel in enumerate(mels):
        if log_dir is not None:
            # save wav (mel -> wav)
            wav = audio.inv_mel_spectrogram(mel.T, hparams)
            audio.save_wav(wav,
                           os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])),
                           sr=hparams.sample_rate)
            # NOTE(review): alignment plot filenames depend only on the
            # alignment title, so each iteration of the outer loop overwrites
            # the previous text's plots — confirm this is intended.
            alignments_samples, alignment_titles = self.get_alignments(alignments)
            for idx in range(len(alignments_samples)):
                # save alignments
                plot.plot_alignment(alignments_samples[idx],
                                    os.path.join(log_dir,
                                                 'plots/{}.png'.format(alignment_titles[idx])),
                                    title='{}'.format(texts[i]),
                                    split_title=True,
                                    max_len=target_lengths[i])
            # save mel spectrogram plot
            plot.plot_spectrogram(mel,
                                  os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
                                  title='{}'.format(texts[i]),
                                  split_title=True)
            # save wav (linear -> wav)
            wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
            audio.save_wav(wav,
                           os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])),
                           sr=hparams.sample_rate)
            # save linear spectrogram plot
            plot.plot_spectrogram(linears[i],
                                  os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                                  title='{}'.format(texts[i]),
                                  split_title=True,
                                  auto_aspect=True)