def save_states(global_step, mel_outputs, linear_outputs, attn, y, checkpoint_dir=None):
    """Dump debug artifacts for one batch sample: alignment plot, predicted and
    target spectrogram plots, and a griffin-lim inverted wav."""
    sample = 1  # fixed sample index
    # idx = np.random.randint(0, len(mel_outputs))

    def _step_path(suffix):
        # "<checkpoint_dir>/step<global_step><suffix>"
        return os.path.join(checkpoint_dir, "step{}".format(global_step) + suffix)

    # Attention alignment plot
    align = attn[sample].cpu().data.numpy()
    # alignment = attn[idx].cpu().data.numpy()[:, :input_length]
    plot_alignment(align.T, _step_path("_alignment.png"),
                   info="tacotron, step={}".format(global_step))

    # Predicted linear spectrogram plot
    predicted = linear_outputs[sample].cpu().data.numpy()
    plot_spectrogram(predicted, _step_path("_predicted_spectrogram.png"))

    # Griffin-lim inversion of the prediction -> wav
    signal = audio.inv_spectrogram(predicted.T)
    audio.save_wav(signal, _step_path("_predicted.wav"))

    # Ground-truth spectrogram plot
    target = y[sample].cpu().data.numpy()
    plot_spectrogram(target, _step_path("_target_spectrogram.png"))
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    """Save griffin-lim audio and an alignment plot for one training sample.

    Korean cleaners toggle jamo recombination in the rendered text.
    """
    sample_idx, (seq, spec, align) = args
    audio_path = os.path.join(
        log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, sample_idx))
    align_path = os.path.join(
        log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, sample_idx))

    save_audio(inv_spectrogram(spec.T), audio_path)

    # One flag drives both the log line and the jamo handling.
    cleaner_names = [name.strip() for name in hparams.cleaners.split(',')]
    is_korean = 'korean_cleaners' in cleaner_names
    log('Training korean : Use jamo' if is_korean else 'Training non-korean : X use jamo')
    plot.plot_alignment(
        align,
        align_path,
        info='step={:d}, loss={:.5f}'.format(step, loss),
        text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=is_korean),
        isKorean=is_korean)
def save_current_model(args, checkpoint_path, global_step, hparams, loss, model,
                       plot_dir, saver, sess, step, wav_dir):
    """Checkpoint the model and dump debug artifacts for batch sample 0.

    Writes the TF checkpoint, then fetches the first sample's predictions and
    targets from the graph and saves griffin-lim wavs plus spectrogram and
    alignment plots to wav_dir / plot_dir.
    """
    # Save model and current global step
    saver.save(sess, checkpoint_path, global_step=global_step)
    log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..'
        )
    # Fetch sample 0 of the current batch: input text ids, post-net mel
    # prediction, linear (magnitude) prediction, attention, and targets.
    input_seq, mel_prediction, linear_prediction, attention_mask_sample, targets_mel, target_length, linear_target = sess.run(
        [
            model.inputs[0],
            model.post_net_predictions[0],
            model.mag_pred[0],
            model.alignments[0],
            model.targets_mel[0],
            model.targets_length[0],
            model.targets_mag[0],
        ])
    # One alignment matrix (and title) per attention component.
    alignments, alignment_titles = get_alignments(attention_mask_sample)
    # save griffin lim inverted wav for debug (linear -> wav)
    wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams)
    audio.save_wav(wav,
                   os.path.join(wav_dir, '{}-linear.wav'.format(step)),
                   sr=hparams.sample_rate)
    # Save real and predicted linear-spectrogram plot to disk (control purposes)
    plot.plot_spectrogram(
        linear_prediction,
        os.path.join(plot_dir, '{}-linear-spectrogram.png'.format(step)),
        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(),
                                                    step, loss),
        target_spectrogram=linear_target,
        max_len=target_length,
        auto_aspect=True)
    # save griffin lim inverted wav for debug (mel -> wav)
    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
    audio.save_wav(wav,
                   os.path.join(wav_dir, '{}-mel.wav'.format(step)),
                   sr=hparams.sample_rate)
    # save alignment plot to disk (control purposes); one file per title
    for i in range(len(alignments)):
        plot.plot_alignment(
            alignments[i],
            os.path.join(plot_dir,
                         '{}_{}-align.png'.format(step, alignment_titles[i])),
            title='{}, {}, step={}, loss={:.5f}'.format(
                args.model, time_string(), step, loss),
            # decoder emits reduction_factor frames per step
            max_len=target_length // hparams.reduction_factor)
    # save real and predicted mel-spectrogram plot to disk (control purposes)
    plot.plot_spectrogram(mel_prediction,
                          os.path.join(plot_dir,
                                       '{}-mel-spectrogram.png'.format(step)),
                          title='{}, {}, step={}, loss={:.5f}'.format(
                              args.model, time_string(), step, loss),
                          target_spectrogram=targets_mel,
                          max_len=target_length)
    log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))
def plot_result(self, mel_pred, mel_target, alig):
    """Write the mel-prediction (vs. target) and alignment plots for the
    current training step under <outdir>/plots."""
    plots_dir = os.path.join(self.config['outdir'], 'plots')
    os.makedirs(plots_dir, exist_ok=True)
    plot_spectrogram(
        mel_pred,
        os.path.join(plots_dir, 'mel-before-{}.png'.format(self.steps)),
        target_spectrogram=mel_target)
    plot_alignment(
        alig,
        os.path.join(plots_dir, 'alig-{}.png'.format(self.steps)))
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    """Save griffin-lim audio and an alignment plot for one sample.

    Fix: the alignment image was written to '...-audio{idx}.png', sharing its
    stem with the wav file; it is now written to '...-align{idx}.png',
    matching the sibling implementation of this helper.
    """
    idx, (seq, spec, align) = args
    audio_path = os.path.join(
        log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    # 'align' stem so the plot does not shadow the audio file's name.
    align_path = os.path.join(
        log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx))
    waveform = inv_spectrogram(spec.T)
    save_audio(waveform, audio_path)
    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    plot.plot_alignment(
        align,
        align_path,
        info=info_text,
        text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=True))
def train(log_dir, args):
    """Main Tacotron training loop.

    Builds the feeder and model graph, optionally restores a checkpoint, then
    trains until the coordinator stops — periodically writing summaries,
    checkpoints, a mel prediction .npy, and alignment/spectrogram plots.
    """
    save_dir = os.path.join(log_dir, 'pretrained/')
    checkpoint_path = os.path.join(save_dir, 'model.ckpt')
    input_path = os.path.join(args.base_dir, args.input)
    plot_dir = os.path.join(log_dir, 'plots')
    os.makedirs(plot_dir, exist_ok=True)
    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())
    #Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, input_path, hparams)
    #Set up model:
    step_count = 0
    try:
        #simple text file to keep count of global step
        with open(os.path.join(log_dir, 'step_counter.txt'), 'r') as file:
            step_count = int(file.read())
    except:
        print('no step_counter file found, assuming there is no saved checkpoint')
    global_step = tf.Variable(step_count, name='global_step', trainable=False)
    with tf.variable_scope('model') as scope:
        model = create_model(args.model, hparams)
        model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.token_targets)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_stats(model)
    #Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)
    #Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    #Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())
            #saved model restoring
            if args.restore:
                #Restore saved model if the user requested it, Default = True.
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e))
            # NOTE(review): 'checkpoint_state' is only bound inside the
            # 'if args.restore:' branch above — if args.restore is falsy this
            # check raises NameError. Confirm the intended call pattern.
            if (checkpoint_state and checkpoint_state.model_checkpoint_path):
                log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path))
                saver.restore(sess, checkpoint_state.model_checkpoint_path)
            else:
                if not args.restore:
                    log('Starting new training!')
                else:
                    log('No model to load at {}'.format(save_dir))
            #initiating feeder
            feeder.start_in_session(sess)
            #Training loop
            while not coord.should_stop():
                start_time = time.time()
                step, loss, opt = sess.run([global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end='\r')
                # Abort training on divergence.
                if loss > 100 or np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(loss, step))
                    raise Exception('Loss exploded')
                if step % args.summary_interval == 0:
                    log('\nWriting summary at step: {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)
                if step % args.checkpoint_interval == 0:
                    # Persist the step count so a restart resumes numbering.
                    with open(os.path.join(log_dir,'step_counter.txt'), 'w') as file:
                        file.write(str(step))
                    log('Saving checkpoint to: {}-{}'.format(checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    # Unlike the original tacotron, we won't save audio
                    # because we yet have to use wavenet as vocoder
                    log('Saving alignement and Mel-Spectrograms..')
                    input_seq, prediction, alignment, target = sess.run([model.inputs[0],
                                                                         model.mel_outputs[0],
                                                                         model.alignments[0],
                                                                         model.mel_targets[0],
                                                                         ])
                    #save predicted spectrogram to disk (for plot and manual evaluation purposes)
                    mel_filename = 'ljspeech-mel-prediction-step-{}.npy'.format(step)
                    np.save(os.path.join(log_dir, mel_filename), prediction, allow_pickle=False)
                    #save alignment plot to disk (control purposes)
                    plot.plot_alignment(alignment,
                                        os.path.join(plot_dir, 'step-{}-align.png'.format(step)),
                                        info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss))
                    #save real mel-spectrogram plot to disk (control purposes)
                    # NOTE(review): this format string has 3 placeholders but is
                    # given 4 args; the trailing 'loss' is silently ignored.
                    plot.plot_spectrogram(target,
                                          os.path.join(plot_dir, 'step-{}-real-mel-spectrogram.png'.format(step)),
                                          info='{}, {}, step={}, Real'.format(args.model, time_string(), step, loss))
                    #save predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(prediction,
                                          os.path.join(plot_dir, 'step-{}-pred-mel-spectrogram.png'.format(step)),
                                          info='{}, {}, step={}, loss={:.5}'.format(args.model, time_string(), step, loss))
                    log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))
        except Exception as e:
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
# NOTE(review): this span appears to be the body of a per-utterance loop —
# 'fname', 'line', 'phids', 'dst_dir', 'fname_original', 'model', etc. are
# bound outside this excerpt; confirm against the enclosing script.
# Copy the original recording alongside the generated outputs.
cmd = 'cp ' + 'vox/wav/' + '_'.join(k for k in fname) + '.wav ' + dst_dir + '/' + fname_original + '_original.wav'
os.system(cmd)
#text = ' '.join(k for k in line.decode("utf-8").split()[1:])
#text = '< ' + text + ' >'
#text = [phids[l] for l in text.split()]
# Phone-id sequence and quantized F0 contour for this utterance.
text, qF0s = get_textNqF0s(line, phids)
# Generating from original speaker
spk = speakers_dict[fname[0]]
waveform, alignment, _ = tts(model, text, spk, qF0s)
fname_generated = '_'.join(k for k in fname[1:])
fname_generated = fname_generated + '_generated'
dst_wav_path = join(dst_dir, "{}{}.wav".format(fname_generated, file_name_suffix))
dst_alignment_path = join(dst_dir, "{}_alignment.png".format(fname_generated))
plot_alignment(alignment.T, dst_alignment_path,
               info="tacotron, {}".format(checkpoint_path))
audio.save_wav(waveform, dst_wav_path)
# Generating from a different speaker
# Pick a random speaker id for voice transfer.
spk = np.random.randint(len(speakers))
#fname = fname.split('_')
#fname[0] = ids2speakers[spk]
fname_transferred = '_'.join(k for k in fname[1:])
fname_transferred = fname_transferred + '_transferred'
print("I picked a random number as ", spk,
      " the corresponding speaker from the dictionary is ", ids2speakers[spk],
      " the filename I am storing is ", fname_transferred)
print(text, fname_transferred)
# Re-synthesize the same text/F0 with the transferred speaker identity.
waveform, alignment, _ = tts(model, text, spk, qF0s)
dst_wav_path = join(dst_dir, "{}{}.wav".format(fname_transferred, file_name_suffix))
dst_alignment_path = join(dst_dir, "{}_alignment.png".format(fname_transferred))
plot_alignment(alignment.T, dst_alignment_path,
               info="tacotron, {}".format(checkpoint_path))
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None,
                              end_of_sentence=None,
                              pre_word_num=0,
                              post_word_num=0,
                              pre_surplus_idx=0,
                              post_surplus_idx=1,
                              use_short_concat=False,
                              use_manual_attention=False,
                              save_alignment=False,
                              librosa_trim=False,
                              attention_trim=False,
                              time_str=None,
                              isKorean=True):
    """Plot the attention alignment, optionally trim the spectrogram/audio,
    and save the synthesized audio.

    args is (idx, (wav, alignment, path, text, sequence)). Returns True when
    the audio was written to a file path, otherwise the wav bytes.
    """
    idx, (wav, alignment, path, text, sequence) = args
    # Decide where the alignment plot goes (or skip plotting entirely).
    if base_path:
        plot_path = "{}/{}.png".format(base_path, get_time())
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None
    #plot_path = add_prefix(plot_path, time_str)
    if use_manual_attention:
        plot_path = add_postfix(plot_path, "manual")
    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)
    if use_short_concat:
        wav = short_concat(wav, alignment, text, start_of_sentence,
                           end_of_sentence, pre_word_num, post_word_num,
                           pre_surplus_idx, post_surplus_idx)
    if attention_trim and end_of_sentence:
        # Once attention has reached the last text position, drop trailing frames.
        end_idx_counter = 0
        # Per decoder frame: index of the most-attended encoder position.
        attention_argmax = alignment.argmax(0)
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        # Allow at most 5 frames dwelling on the final position.
        max_counter = min((attention_argmax == end_idx).sum(), 5)
        for jdx, attend_idx in enumerate(attention_argmax):
            if len(attention_argmax) > jdx + 1:
                if attend_idx == end_idx:
                    end_idx_counter += 1
                if attend_idx == end_idx and attention_argmax[jdx + 1] > end_idx:
                    break
                if end_idx_counter >= max_counter:
                    break
            else:
                break
        # jdx frames of attention -> spectrogram frames (plus a small margin).
        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]
    audio_out = inv_spectrogram(wav.T)
    if librosa_trim and end_of_sentence:
        # Trim trailing silence from the inverted waveform.
        yt, index = librosa.effects.trim(audio_out, frame_length=5120,
                                         hop_length=256, top_db=50)
        audio_out = audio_out[:index[-1]]
    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)
    if path or base_path:
        if path:
            current_path = add_postfix(path, idx)
        elif base_path:
            current_path = plot_path.replace(".png", ".wav")
        save_audio(audio_out, current_path)
        return True
    else:
        # No destination: return the encoded audio as bytes.
        io_out = io.BytesIO()
        save_audio(audio_out, io_out)
        result = io_out.getvalue()
        return result
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None,
                              end_of_sentence=None,
                              pre_word_num=0,
                              post_word_num=0,
                              pre_surplus_idx=0,
                              post_surplus_idx=1,
                              save_alignment=False,
                              librosa_trim=False,
                              attention_trim=False,
                              time_str=None,
                              isKorean=True,
                              config=None):
    """Plot the attention alignment, optionally trim, and save audio + mel.

    args is (idx, (wav, alignment, path, text, sequence, mel)). Returns the
    written wav path when a destination exists, otherwise the wav bytes.

    Fix: the in-memory branch computed result = io_out.getvalue() but then
    returned the BytesIO object itself; it now returns the bytes, matching
    the sibling implementation of this function.
    """
    idx, (wav, alignment, path, text, sequence, mel) = args
    # Decide where the alignment plot goes (or skip plotting entirely).
    if base_path:
        plot_path = "{}/{}_{}.png".format(base_path, config.file.split('.')[0], idx)
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None
    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)
    if attention_trim and end_of_sentence:
        # If attention has reached the end of the text, drop trailing frames.
        end_idx_counter = 0
        # alignment: (encoder/text length, decoder/target length) -> per-frame argmax
        attention_argmax = alignment.argmax(0)
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        # Allow at most 5 frames dwelling on the final position.
        max_counter = min((attention_argmax == end_idx).sum(), 5)
        for jdx, attend_idx in enumerate(attention_argmax):
            if len(attention_argmax) > jdx + 1:
                if attend_idx == end_idx:
                    end_idx_counter += 1
                if attend_idx == end_idx and attention_argmax[jdx + 1] > end_idx:
                    break
                if end_idx_counter >= max_counter:
                    break
            else:
                break
        # jdx attention frames -> spectrogram frames (plus a small margin).
        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]
        mel = mel[:spec_end_idx]
    audio_out = inv_linear_spectrogram(wav.T, hparams)
    if librosa_trim and end_of_sentence:
        # Trim trailing silence; keep samples up to the detected end.
        yt, index = librosa.effects.trim(audio_out, frame_length=5120,
                                         hop_length=256, top_db=50)
        audio_out = audio_out[:index[-1]]
        mel = mel[:index[-1] // hparams.hop_size]
    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)
    if path or base_path:
        if path:
            current_path = add_postfix(path, idx)
        elif base_path:
            current_path = plot_path.replace(".png", ".wav")
        save_wav(audio_out, current_path, hparams.sample_rate)  # hccho
        mel_path = current_path.replace(".wav", ".npy")
        np.save(mel_path, mel)
        return current_path
    else:
        # No destination: return the encoded audio as bytes (not the buffer).
        io_out = io.BytesIO()
        save_wav(audio_out, io_out, hparams.sample_rate)
        result = io_out.getvalue()
        return result
def _train_epoch(self, dataloader=None):
    """Run one training epoch and return the epoch-averaged losses
    (total, l1, ssim, attention)."""
    self.model.train()
    ll = len(dataloader)  # number of batches, for epoch averages
    running_loss = 0.0
    running_l1_loss = 0.0
    running_ssim_loss = 0.0
    running_att_loss = 0.0
    pbar = tqdm(dataloader, unit="audios", unit_scale=dataloader.batch_size,
                disable=self.hparams.trainer.disable_progress_bar)
    for it, batch in enumerate(pbar, start=1):
        self.optimizer.zero_grad()
        # mels/texts plus their lengths; lengths arrive as (B, 1) -> squeeze.
        mels, mlens, texts, tlens = \
            batch['mels'], batch['mlens'].squeeze(1), batch['texts'].long(), batch['tlens'].squeeze(1)
        mels, mlens, texts, tlens = \
            mels.to(self.device), mlens.to(self.device), texts.to(self.device), tlens.to(self.device)
        # 's' is the (possibly augmented) decoder input; 'mels' stays the target.
        s = mels = self.normalizer(mels)
        # Spectrogram augmentation
        if self.hparams.duration.enable_augment:
            s = add_random_noise(mels, self.hparams.duration.noise)
            s = degrade_some(self.model, s, texts, tlens,
                             self.hparams.duration.feed_ratio,
                             repeat=self.hparams.duration.feed_repeat)
            s = frame_dropout(s, self.hparams.duration.replace_ratio)
        melspecs, attns = self.model((texts, tlens, s, True))
        outputs_and_targets = (melspecs, mels, attns, mlens, tlens)
        loss, l1_loss, ssim_loss, att_loss = self.compute_metrics(
            outputs_and_targets)
        loss.backward()
        # Clip gradients to unit norm for stability.
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
        self.optimizer.step()
        self.step += 1
        # Detach scalars for accumulation/logging.
        loss, l1_loss, ssim_loss, att_loss = loss.item(), l1_loss.item(), ssim_loss.item(), att_loss.item()
        running_loss += loss
        running_l1_loss += l1_loss
        running_ssim_loss += ssim_loss
        running_att_loss += att_loss
        # update the progress bar
        pbar.set_postfix({
            'l1': "%.05f" % (running_l1_loss / it),
            'ssim': "%.05f" % (running_ssim_loss / it),
            'att': "%.05f" % (running_att_loss / it)
        })
        # Log the last sample of the batch as spectrogram/alignment figures.
        mels, melspecs, attns = mels.cpu().detach(), melspecs.cpu().detach(), attns.cpu().detach()
        index = -1
        mlen, tlen = mlens[index].item(), tlens[index].item()
        mels_fig = plot_spectrogram(
            melspecs[index, :mlen, :],
            target_spectrogram=mels[index, :mlen, :])
        attn_fig = plot_alignment(attns[index, :mlen, :tlen])
        self.loggers.log_step(
            'train', self.step, {
                'step_l1_loss': l1_loss,
                'step_ssim_loss': ssim_loss,
                'step_att_loss': att_loss
            }, {
                'melspecs': mels_fig,
                'attention': attn_fig
            })
    epoch_loss = running_loss / ll
    epoch_l1_loss = running_l1_loss / ll
    epoch_ssim_loss = running_ssim_loss / ll
    epoch_att_loss = running_att_loss / ll
    return epoch_loss, epoch_l1_loss, epoch_ssim_loss, epoch_att_loss
def run_eval(args, eval_dir, eval_model, eval_plot_dir, eval_wav_dir, feeder,
             hparams, sess, step, summary_writer):
    """Run the eval split, log averaged losses, and save debug wavs/plots
    for the last evaluated sample."""
    # Run eval and save eval stats
    log('\nRunning evaluation at step {}'.format(step))
    sum_eval_loss = 0.0
    sum_mel_loss = 0.0
    sum_stop_token_loss = 0.0
    sum_linear_loss = 0.0
    count = 0.0
    # Sample-0 artifacts from the LAST eval batch (overwritten each iteration).
    mel_p = None
    mel_t = None
    t_len = None
    attention_mask_sample = None
    lin_p = None
    lin_t = None
    for _ in tqdm(range(feeder.test_steps)):
        test_eloss, test_mel_loss, test_stop_token_loss, test_linear_loss, mel_p, mel_t, t_len, attention_mask_sample, lin_p, lin_t = sess.run(
            [
                eval_model.loss,
                eval_model.mel_loss,
                eval_model.stop_token_loss,
                eval_model.linear_loss,
                eval_model.post_net_predictions[0],
                eval_model.targets_mel[0],
                eval_model.targets_length[0],
                eval_model.alignments[0],
                eval_model.mag_pred[0],
                eval_model.targets_mag[0],
            ])
        sum_eval_loss += test_eloss
        sum_mel_loss += test_mel_loss
        sum_stop_token_loss += test_stop_token_loss
        sum_linear_loss += test_linear_loss
        count += 1.0
    # Griffin-lim inversion of the last sample's linear prediction.
    wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
    audio.save_wav(wav,
                   os.path.join(eval_wav_dir, '{}-eval-linear.wav'.format(step)),
                   sr=hparams.sample_rate)
    # Average the losses over all eval batches (guard against empty eval set).
    if count > 0.0:
        eval_loss = sum_eval_loss / count
        mel_loss = sum_mel_loss / count
        stop_token_loss = sum_stop_token_loss / count
        linear_loss = sum_linear_loss / count
    else:
        eval_loss = sum_eval_loss
        mel_loss = sum_mel_loss
        stop_token_loss = sum_stop_token_loss
        linear_loss = sum_linear_loss
    log('Saving eval log to {}..'.format(eval_dir))
    # Save some log to monitor model improvement on same unseen sequence
    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
    audio.save_wav(wav,
                   os.path.join(eval_wav_dir, '{}-eval-mel.wav'.format(step)),
                   sr=hparams.sample_rate)
    # One alignment matrix (and title) per attention component.
    alignments, alignment_titles = get_alignments(attention_mask_sample)
    for i in range(len(alignments)):
        plot.plot_alignment(alignments[i],
                            os.path.join(
                                eval_plot_dir, '{}_{}-eval-align.png'.format(
                                    step, alignment_titles[i])),
                            title='{}, {}, step={}, loss={:.5f}'.format(
                                args.model, time_string(), step, eval_loss),
                            # decoder emits reduction_factor frames per step
                            max_len=t_len // hparams.reduction_factor)
    plot.plot_spectrogram(
        mel_p,
        os.path.join(eval_plot_dir, '{}-eval-mel-spectrogram.png'.format(step)),
        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(),
                                                    step, eval_loss),
        target_spectrogram=mel_t,
        max_len=t_len)
    plot.plot_spectrogram(
        lin_p,
        os.path.join(eval_plot_dir, '{}-eval-linear-spectrogram.png'.format(step)),
        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(),
                                                    step, eval_loss),
        target_spectrogram=lin_t,
        max_len=t_len,
        auto_aspect=True)
    log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss))
    log('Writing eval summary!')
    add_eval_stats(summary_writer, step, linear_loss, mel_loss,
                   stop_token_loss, eval_loss)
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None,
                              end_of_sentence=None,
                              pre_word_num=0,
                              post_word_num=0,
                              pre_surplus_idx=0,
                              post_surplus_idx=1,
                              use_short_concat=False,
                              save_alignment=False,
                              librosa_trim=False,
                              attention_trim=False,
                              time_str=None,
                              isKorean=True):
    """Plot the attention alignment, optionally trim, and save audio + mel.

    args is (idx, (wav, alignment, path, text, sequence, mel)). Returns the
    raw audio samples in both the file and in-memory branches.
    """
    idx, (wav, alignment, path, text, sequence, mel) = args
    # Decide where the alignment plot goes (or skip plotting entirely).
    if base_path:
        plot_path = "{}/{}.png".format(base_path, get_time())
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None
    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)
    if use_short_concat:
        wav = short_concat(wav, alignment, text, start_of_sentence,
                           end_of_sentence, pre_word_num, post_word_num,
                           pre_surplus_idx, post_surplus_idx)
    if attention_trim and end_of_sentence:
        # If attention has reached the end of the text, discard the frames after it.
        end_idx_counter = 0
        attention_argmax = alignment.argmax(
            0
        )  # alignment: text length(encoder), target length(decoder) ==> target length(decoder)
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        # max_counter = min((attention_argmax == end_idx).sum(), 5) + 1  # 20200612
        # The logic above counts the argmax frames equal to end_idx (the actual
        # end) and caps max_counter at 5. Korean speech tends to hold the final
        # sound, so instead of cutting at 5 we keep as many frames as were
        # actually spoken — hence the replacement logic below.
        # (Why the original author capped it at 5 is unknown.)
        max_counter = (attention_argmax == end_idx).sum()
        for jdx, attend_idx in enumerate(attention_argmax):
            if len(attention_argmax) > jdx + 1:
                if attend_idx == end_idx:
                    end_idx_counter += 1
                if attend_idx == end_idx and attention_argmax[jdx + 1] > end_idx:
                    break
                if end_idx_counter >= max_counter:
                    break
            else:
                break
        # jdx attention frames -> spectrogram frames (plus a small margin).
        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]
        mel = mel[:spec_end_idx]
    audio_out = inv_linear_spectrogram(wav.T, hparams)
    if librosa_trim and end_of_sentence:
        # Trim trailing silence; keep samples up to the detected end.
        yt, index = librosa.effects.trim(audio_out, frame_length=5120,
                                         hop_length=256, top_db=50)
        audio_out = audio_out[:index[-1]]
        mel = mel[:index[-1] // hparams.hop_size]
    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)
    if path or base_path:
        if path:
            current_path = add_postfix(path, idx)
        elif base_path:
            current_path = plot_path.replace(".png", ".wav")
        save_wav(audio_out, current_path, hparams.sample_rate)  #hccho
        mel_path = current_path.replace(".wav", ".npy")
        np.save(mel_path, mel)
        #return True
        return audio_out
    else:
        io_out = io.BytesIO()
        save_wav(audio_out, io_out, hparams.sample_rate)
        # NOTE(review): 'result' is computed but never used — the raw samples
        # (audio_out) are returned instead of the encoded bytes; confirm intent.
        result = io_out.getvalue()
        return audio_out
def train(log_dir, args):
    """Training loop for the sygst Tacotron2 model.

    Builds the TFRecord dataset and model graph, optionally restores a
    checkpoint, then trains for _max_step iterations, periodically writing
    summaries, checkpoints, debug audio and plots.
    """
    checkpoint_path = os.path.join(hdfs_ckpts, log_dir, 'model.ckpt')
    log(hp.to_string(), is_print=False)
    log('Loading training data from: %s' % args.tfr_dir)
    log('Checkpoint path: %s' % checkpoint_path)
    log('Using model: sygst tacotron2')
    tf_dset = TFDataSet(hp, args.tfr_dir)
    feats = tf_dset.get_train_next()
    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    # Allows flipping to inference behavior by feeding training=False.
    training = tf.placeholder_with_default(True, shape=(), name='training')
    with tf.name_scope('model'):
        model = Tacotron2SYGST(hp)
        # Mel targets double as the style-reference inputs.
        model(feats['inputs'],
              mel_inputs=feats['mel_targets'],
              spec_inputs=feats['linear_targets'],
              spec_lengths=feats['spec_lengths'],
              ref_inputs=feats['mel_targets'],
              ref_lengths=feats['spec_lengths'],
              arousal_labels=feats['soft_arousal_labels'],
              valence_labels=feats['soft_valance_labels'],
              training=training)
        """
        text_x, mel_x, spec_x, spec_len, aro, val = debug_data(2, 5, 10)
        model(text_x, mel_x, spec_x, spec_len, mel_x, spec_len, aro, val, training=training)
        """
        model.add_loss()
        model.add_optimizer(global_step)
        stats = model.add_stats()
    # Bookkeeping:
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=50, keep_checkpoint_every_n_hours=2)
    # Train!
    config = tf.ConfigProto(allow_soft_placement=True,
                            gpu_options=tf.GPUOptions(allow_growth=True))
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())
            if args.restore_step:
                # Restore from a checkpoint if the user requested it.
                restore_path = '%s-%s' % (checkpoint_path, args.restore_step)
                saver.restore(sess, restore_path)
                log('Resuming from checkpoint: %s' % restore_path, slack=True)
            else:
                log('Starting a new training run ...', slack=True)
            """
            fetches = [global_step, model.optimize, model.loss, model.mel_loss, model.spec_loss,
                       model.stop_loss, model.arousal_loss, model.valence_loss, model.mel_grad_norms_max,
                       model.spec_grad_norms_max, model.stop_grad_norms_max, model.aro_grad_norms_max, model.val_grad_norms_max]
            """
            fetches = [
                global_step, model.optimize, model.loss, model.mel_loss,
                model.spec_loss, model.stop_loss, model.arousal_loss,
                model.valence_loss
            ]
            for _ in range(_max_step):
                start_time = time.time()
                sess.run(debug.get_ops())
                # step, _, loss, mel_loss, spec_loss, stop_loss, aro_loss, val_loss, mel_g, spec_g, stop_g, aro_g, val_g = sess.run(fetches)
                step, _, loss, mel_loss, spec_loss, stop_loss, aro_loss, val_loss = sess.run(
                    fetches)
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                """
                message = 'Step %-7d [%.3f sec/step,ml=%.3f,spl=%.3f,sl=%.3f,al=%.3f,vl=%.3f,mg=%.4f,spg=%.4f,sg=%.4f,ag=%.4f,vg=%.4f]' % (
                    step, time_window.average, mel_loss, spec_loss, stop_loss, aro_loss, val_loss, mel_g, spec_g, stop_g, aro_g, val_g)
                """
                message = 'Step %-7d [%.3f sec/step,ml=%.3f,spl=%.3f,sl=%.3f,al=%.3f,vl=%.3f]' % (
                    step, time_window.average, mel_loss, spec_loss, stop_loss,
                    aro_loss, val_loss)
                log(message, slack=(step % args.checkpoint_interval == 0))
                # Abort on divergence.
                if loss > 100 or math.isnan(loss):
                    log('Loss exploded to %.5f at step %d!' % (loss, step),
                        slack=True)
                    raise Exception('Loss Exploded')
                if step % args.summary_interval == 0:
                    log('Writing summary at step: %d' % step)
                    try:
                        summary_writer.add_summary(sess.run(stats), step)
                    except Exception as e:
                        log(f'summary failed and ignored: {str(e)}')
                if step % args.checkpoint_interval == 0:
                    log('Saving checkpoint to: %s-%d' % (checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    log('Saving audio and alignment...')
                    # Fetch sample 0 of the batch for debug artifacts.
                    gt_mel, gt_spec, seq, mel, spec, align = sess.run([
                        model.mel_targets[0], model.spec_targets[0],
                        model.text_targets[0], model.mel_outputs[0],
                        model.spec_outputs[0], model.alignment_outputs[0]
                    ])
                    text = sequence_to_text(seq)
                    wav = audio.inv_spectrogram(hp, spec.T)
                    wav_path = os.path.join(log_dir, 'step-%d-audio.wav' % step)
                    mel_path = os.path.join(log_dir, 'step-%d-mel.png' % step)
                    spec_path = os.path.join(log_dir, 'step-%d-spec.png' % step)
                    align_path = os.path.join(log_dir, 'step-%d-align.png' % step)
                    info = '%s, %s, step=%d, loss=%.5f\n %s' % (
                        args.model, time_string(), step, loss, text)
                    plot.plot_alignment(align, align_path, info=info)
                    plot.plot_mel(mel, mel_path, info=info, gt_mel=gt_mel)
                    plot.plot_mel(spec, spec_path, info=info, gt_mel=gt_spec)
                    audio.save_wav(hp, wav, wav_path)
                    log('Input: %s' % text)
        except Exception as e:
            log('Exiting due to exception: %s' % e, slack=True)
            traceback.print_exc()
def synthesize(self, texts, basenames, log_dir, mel_filenames):
    """Synthesize a batch of texts; either play the first result back
    (basenames is None) or save per-sample wavs and plots under log_dir."""
    hparams = self._hparams
    # Repeat last sample until number of samples is dividable by the number of GPUs (last run scenario)
    while len(texts) % hparams.synthesis_batch_size != 0:
        texts.append(texts[-1])
        basenames.append(basenames[-1])
        if mel_filenames is not None:
            mel_filenames.append(mel_filenames[-1])
    sequences = [np.asarray(text_to_sequence(text)) for text in texts]
    input_lengths = [len(seq) for seq in sequences]
    seqs, max_seq_len = self._prepare_inputs(sequences)
    feed_dict = {
        self.inputs: seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32)
    }
    # NOTE(review): only self.alignments[0] is fetched — a single alignment
    # sample for the whole batch; confirm that is intended.
    linears, mels, alignments, audio_length = self.session.run(
        [self.linear_outputs, self.mel_outputs, self.alignments[0],
         self.audio_length],
        feed_dict=feed_dict)
    # Natural batch synthesis
    # Get Mel/Linear lengths for the entire batch from stop_tokens predictions
    target_lengths = audio_length
    if basenames is None:
        # Generate wav and read it
        wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
        audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way
        if platform.system() == 'Linux':
            # Linux wav reader
            os.system('aplay temp.wav')
        elif platform.system() == 'Windows':
            # windows wav reader
            os.system('start /min mplay32 /play /close temp.wav')
        else:
            raise RuntimeError(
                'Your OS type is not supported yet, please add it to "centaur/synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!')
        return
    for i, mel in enumerate(mels):
        if log_dir is not None:
            # save wav (mel -> wav)
            wav = audio.inv_mel_spectrogram(mel.T, hparams)
            audio.save_wav(wav,
                           os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])),
                           sr=hparams.sample_rate)
            alignments_samples, alignment_titles = self.get_alignments(alignments)
            for idx in range(len(alignments_samples)):
                # save alignments
                # NOTE(review): the plot filename only contains the alignment
                # title, not basenames[i], so each loop iteration overwrites
                # the previous sample's plots — confirm whether intended.
                plot.plot_alignment(alignments_samples[idx],
                                    os.path.join(log_dir, 'plots/{}.png'.format(
                                        alignment_titles[idx])),
                                    title='{}'.format(texts[i]),
                                    split_title=True,
                                    max_len=target_lengths[i])
            # save mel spectrogram plot
            plot.plot_spectrogram(mel,
                                  os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
                                  title='{}'.format(texts[i]),
                                  split_title=True)
            # save wav (linear -> wav)
            wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
            audio.save_wav(wav,
                           os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])),
                           sr=hparams.sample_rate)
            # save linear spectrogram plot
            plot.plot_spectrogram(linears[i],
                                  os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                                  title='{}'.format(texts[i]),
                                  split_title=True,
                                  auto_aspect=True)
def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
    """Synthesize a batch of texts to mel (and optionally linear) spectrograms.

    Saves mel .npy files to out_dir and, when log_dir is given, debug wavs and
    plots. Returns the list of saved mel paths, or None in play-back mode
    (basenames is None).

    Fix: the linear-output branch used `if linears:`. After np.clip, `linears`
    is an ndarray, and truth-testing a multi-element array raises
    `ValueError: The truth value of an array ... is ambiguous`, so the
    linear wav/plot saving crashed. Use an explicit `is not None` check.
    """
    hparams = self._hparams
    # [-max, max] or [0,max]
    t2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (
        0, hparams.max_abs_value)
    # Repeat last sample until number of samples is dividable by the number of GPUs (last run scenario)
    while len(texts) % hparams.synthesis_batch_size != 0:
        texts.append(texts[-1])
        basenames.append(basenames[-1])
        if mel_filenames is not None:
            mel_filenames.append(mel_filenames[-1])
    seqs = [np.asarray(text_to_sequence(text)) for text in texts]
    input_lengths = [len(seq) for seq in seqs]
    input_seqs, max_seq_len = self._prepare_inputs(seqs)
    feed_dict = {
        self.inputs: input_seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
    }
    if self.gta:
        # Ground-truth-aligned mode: feed reference mels as decoder targets.
        np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
        target_lengths = [len(np_target) for np_target in np_targets]
        target_seqs, max_target_len = self._prepare_targets(np_targets, self._hparams.outputs_per_step)
        feed_dict[self.targets] = target_seqs
        assert len(np_targets) == len(texts)
    linears = None
    if self.gta or not hparams.predict_linear:
        mels, alignments, stop_tokens = self.session.run(
            [self.mel_outputs, self.alignments, self.stop_token_prediction],
            feed_dict=feed_dict)
        # Natural batch synthesis
        # Get Mel lengths for the entire batch from stop_tokens predictions
        target_lengths = self._get_output_lengths(stop_tokens)
        # Take off the batch wise padding
        mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
        assert len(mels) == len(texts)
    else:
        linears, mels, alignments, stop_tokens = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignments, self.stop_token_prediction],
            feed_dict=feed_dict)
        # Natural batch synthesis
        # Get Mel/Linear lengths for the entire batch from stop_tokens predictions
        target_lengths = self._get_output_lengths(stop_tokens)
        # Take off the batch wise padding
        mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
        linears = [linear[:target_length, :] for linear, target_length in zip(linears, target_lengths)]
        linears = np.clip(linears, t2_output_range[0], t2_output_range[1])
        assert len(mels) == len(linears) == len(texts)
    mels = np.clip(mels, t2_output_range[0], t2_output_range[1])
    if basenames is None:
        # Generate wav and read it
        wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
        audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way
        if platform.system() == 'Linux':
            # Linux wav reader
            os.system('aplay temp.wav')
        elif platform.system() == 'Windows':
            # windows wav reader
            os.system('start /min mplay32 /play /close temp.wav')
        else:
            raise RuntimeError(
                'Your OS type is not supported yet, please add it to "synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!')
        return
    saved_mels_paths = []
    for i, mel in enumerate(mels):
        # Write the spectrogram to disk
        # Note: outputs mel-spectrogram files and target ones have same names, just different folders
        mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(basenames[i]))
        np.save(mel_filename, mel, allow_pickle=False)
        saved_mels_paths.append(mel_filename)
        if log_dir is not None:
            # save wav (mel -> wav)
            wav = audio.inv_mel_spectrogram(mel.T, hparams)
            audio.save_wav(wav,
                           os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])),
                           sr=hparams.sample_rate)
            # save alignments
            plot.plot_alignment(alignments[i],
                                os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])),
                                title='{}'.format(texts[i]),
                                split_title=True,
                                max_len=target_lengths[i])
            # save mel spectrogram plot
            plot.plot_spectrogram(mel,
                                  os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
                                  title='{}'.format(texts[i]),
                                  split_title=True)
            # `linears` is an ndarray here; a bare truthiness test would raise.
            if linears is not None:
                # save wav (linear -> wav)
                wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                audio.save_wav(wav,
                               os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])),
                               sr=hparams.sample_rate)
                # save linear spectrogram plot
                plot.plot_spectrogram(linears[i],
                                      os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                                      title='{}'.format(texts[i]),
                                      split_title=True,
                                      auto_aspect=True)
    return saved_mels_paths
# Synthesize every utterance listed in 'lines': copy the original recording,
# save the acoustic-model output + alignment plot, and the vocoder output.
for idx, line in enumerate(lines):
    # First whitespace-separated token is the (zero-padded) utterance id.
    fname = line.decode("utf-8").split()[0].zfill(8)
    # Copy the original recording next to the generated files for comparison.
    cmd = 'cp vox/wav/' + fname + '.wav ' + dst_dir + '/' + fname + '_original.wav'
    print(cmd)
    os.system(cmd)
    # Remaining tokens form the text; wrap with sentence delimiters.
    text = ' '.join(k for k in line.decode("utf-8").split()[1:])
    text = '< ' + text + ' >'
    print(text, fname)
    # Map each token to its phone id.
    text = [phids[l] for l in text.split()]
    waveform, alignment, mel = tts(acousticmodel, text)
    # Run the neural vocoder on the predicted mel.
    waveform_vocoder = vocoder(vocoder_model, mel)
    print(waveform_vocoder.shape)
    dst_wav_path = join(dst_dir, "{}{}.wav".format(fname, file_name_suffix))
    dst_alignment_path = join(dst_dir, "{}_alignment.png".format(fname))
    plot_alignment(
        alignment.T,
        dst_alignment_path,
        info="tacotron, {}".format(checkpoint_path_acousticmodel))
    audio.save_wav(waveform, dst_wav_path)
    dest_fname = fname + '_generated_vocoder'
    dst_wav_path = join(
        dst_dir, "{}{}.wav".format(dest_fname, file_name_suffix))
    # Write the vocoder output at 16 kHz.
    write(dst_wav_path, 16000, waveform_vocoder)
print(
    "Finished! Check out {} for generated audio samples.".format(dst_dir))
sys.exit(0)