def train(log_dir, args, hparams): save_dir = os.path.join(log_dir, 'taco_pretrained/') checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt') input_path = os.path.join(args.base_dir, args.tacotron_input) plot_dir = os.path.join(log_dir, 'plots') wav_dir = os.path.join(log_dir, 'wavs') mel_dir = os.path.join(log_dir, 'mel-spectrograms') eval_dir = os.path.join(log_dir, 'eval-dir') eval_plot_dir = os.path.join(eval_dir, 'plots') eval_wav_dir = os.path.join(eval_dir, 'wavs') os.makedirs(eval_dir, exist_ok=True) os.makedirs(plot_dir, exist_ok=True) os.makedirs(wav_dir, exist_ok=True) os.makedirs(mel_dir, exist_ok=True) os.makedirs(eval_plot_dir, exist_ok=True) os.makedirs(eval_wav_dir, exist_ok=True) if hparams.predict_linear: linear_dir = os.path.join(log_dir, 'linear-spectrograms') os.makedirs(linear_dir, exist_ok=True) log('Checkpoint path: {}'.format(checkpoint_path)) log('Loading training data from: {}'.format(input_path)) log('Using model: {}'.format(args.model)) log(hparams_debug_string()) #Start by setting a seed for repeatability tf.set_random_seed(hparams.tacotron_random_seed) #Set up data feeder coord = tf.train.Coordinator() with tf.variable_scope('datafeeder') as scope: feeder = Feeder(coord, input_path, hparams) #Set up model: global_step = tf.Variable(0, name='global_step', trainable=False) model, stats = model_train_mode(args, feeder, hparams, global_step) eval_model = model_test_mode(args, feeder, hparams, global_step) #Book keeping step = 0 time_window = ValueWindow(100) loss_window = ValueWindow(100) saver = tf.train.Saver(max_to_keep=5) log('Tacotron training set to a maximum of {} steps'.format(args.tacotron_train_steps)) #Memory allocation on the GPU as needed config = tf.ConfigProto() config.gpu_options.allow_growth = True #Train with tf.Session(config=config) as sess: try: summary_writer = tf.summary.FileWriter(log_dir, sess.graph) sess.run(tf.global_variables_initializer()) #saved model restoring if args.restore: #Restore saved model if the user requested it, Default = True. 
try: checkpoint_state = tf.train.get_checkpoint_state(save_dir) except tf.errors.OutOfRangeError as e: log('Cannot restore checkpoint: {}'.format(e)) if (checkpoint_state and checkpoint_state.model_checkpoint_path): log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path)) saver.restore(sess, checkpoint_state.model_checkpoint_path) else: if not args.restore: log('Starting new training!') else: log('No model to load at {}'.format(save_dir)) #initializing feeder feeder.start_threads(sess) #Training loop while not coord.should_stop() and step < args.tacotron_train_steps: start_time = time.time() step, loss, opt = sess.run([global_step, model.loss, model.optimize]) time_window.append(time.time() - start_time) loss_window.append(loss) message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( step, time_window.average, loss, loss_window.average) log(message, end='\r') if loss > 1000 or np.isnan(loss): log('Loss exploded to {:.5f} at step {}'.format(loss, step)) raise Exception('Loss exploded') if step % args.summary_interval == 0: log('\nWriting summary at step {}'.format(step)) summary_writer.add_summary(sess.run(stats), step) if step % args.eval_interval == 0: #Run eval and save eval stats log('\nRunning evaluation at step {}'.format(step)) eval_losses = [] before_losses = [] after_losses = [] stop_token_losses = [] linear_losses = [] linear_loss = None if hparams.predict_linear: for i in tqdm(range(feeder.test_steps)): eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p = sess.run( [eval_model.loss, eval_model.before_loss, eval_model.after_loss, eval_model.stop_token_loss, eval_model.linear_loss, eval_model.mel_outputs[0], eval_model.mel_targets[0], eval_model.targets_lengths[0], eval_model.alignments[0], eval_model.linear_outputs[0]]) eval_losses.append(eloss) before_losses.append(before_loss) after_losses.append(after_loss) stop_token_losses.append(stop_token_loss) linear_losses.append(linear_loss) linear_loss = sum(linear_losses) / len(linear_losses) wav = audio.inv_linear_spectrogram(lin_p.T, hparams) audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-waveform-linear.wav'.format(step)), sr=hparams.sample_rate) else: for i in tqdm(range(feeder.test_steps)): eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, align = sess.run( [eval_model.loss, eval_model.before_loss, eval_model.after_loss, eval_model.stop_token_loss, eval_model.mel_outputs[0], eval_model.mel_targets[0], eval_model.targets_lengths[0], eval_model.alignments[0]]) eval_losses.append(eloss) before_losses.append(before_loss) after_losses.append(after_loss) stop_token_losses.append(stop_token_loss) eval_loss = sum(eval_losses) / len(eval_losses) before_loss = sum(before_losses) / len(before_losses) after_loss = sum(after_losses) / len(after_losses) stop_token_loss = sum(stop_token_losses) / len(stop_token_losses) log('Saving eval log to {}..'.format(eval_dir)) #Save some log to monitor model improvement on same unseen sequence wav = audio.inv_mel_spectrogram(mel_p.T, hparams) audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-waveform-mel.wav'.format(step)), sr=hparams.sample_rate) plot.plot_alignment(align, os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)), info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eloss), max_len=t_len // hparams.outputs_per_step) plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)), 
info='{}, {}, step={}, loss={:.5}'.format(args.model, time_string(), step, eloss), target_spectrogram=mel_t, max_len=t_len) log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss)) log('Writing eval summary!') add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, eval_loss) if step % args.checkpoint_interval == 0: #Save model and current global step saver.save(sess, checkpoint_path, global_step=global_step) log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..') if hparams.predict_linear: input_seq, mel_prediction, linear_prediction, alignment, target, target_length = sess.run([ model.inputs[0], model.mel_outputs[0], model.linear_outputs[0], model.alignments[0], model.mel_targets[0], model.targets_lengths[0], ]) #save predicted linear spectrogram to disk (debug) linear_filename = 'linear-prediction-step-{}.npy'.format(step) np.save(os.path.join(linear_dir, linear_filename), linear_prediction.T, allow_pickle=False) #save griffin lim inverted wav for debug (linear -> wav) wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams) audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate) else: input_seq, mel_prediction, alignment, target, target_length = sess.run([model.inputs[0], model.mel_outputs[0], model.alignments[0], model.mel_targets[0], model.targets_lengths[0], ]) #save predicted mel spectrogram to disk (debug) mel_filename = 'mel-prediction-step-{}.npy'.format(step) np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False) #save griffin lim inverted wav for debug (mel -> wav) wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams) audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate) #save alignment plot to disk (control purposes) plot.plot_alignment(alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)), info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss), max_len=target_length // hparams.outputs_per_step) #save real and predicted mel-spectrogram plot to disk (control purposes) plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir, 'step-{}-mel-spectrogram.png'.format(step)), info='{}, {}, step={}, loss={:.5}'.format(args.model, time_string(), step, loss), target_spectrogram=target, max_len=target_length) log('Input at step {}: {}'.format(step, sequence_to_text(input_seq))) log('Tacotron training complete after {} global steps!'.format(args.tacotron_train_steps)) return save_dir except Exception as e: log('Exiting due to exception: {}'.format(e)) traceback.print_exc() coord.request_stop(e)
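# NOTE: ValueWindow(100) is used by every training loop in this file to smooth the
# sec/step and loss numbers, but its definition is not shown in this section. The class
# below is a minimal sketch of the assumed interface (append / average), not necessarily
# the project's actual implementation.
class ValueWindow:
    """Rolling window over the most recent `window_size` values (assumed helper)."""

    def __init__(self, window_size=100):
        self._window_size = window_size
        self._values = []

    def append(self, x):
        # Drop the oldest entries so that at most `window_size` values are kept.
        self._values = self._values[-(self._window_size - 1):] + [x]

    @property
    def average(self):
        return sum(self._values) / max(len(self._values), 1)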
def train(log_dir, args, hparams): save_dir = os.path.join(log_dir, 'taco_pretrained') plot_dir = os.path.join(log_dir, 'plots') wav_dir = os.path.join(log_dir, 'wavs') mel_dir = os.path.join(log_dir, 'mel-spectrograms') eval_dir = os.path.join(log_dir, 'eval-dir') eval_plot_dir = os.path.join(eval_dir, 'plots') eval_wav_dir = os.path.join(eval_dir, 'wavs') tensorboard_dir = os.path.join(log_dir, 'tacotron_events') meta_folder = os.path.join(log_dir, 'metas') os.makedirs(save_dir, exist_ok=True) os.makedirs(plot_dir, exist_ok=True) os.makedirs(wav_dir, exist_ok=True) os.makedirs(mel_dir, exist_ok=True) os.makedirs(eval_dir, exist_ok=True) os.makedirs(eval_plot_dir, exist_ok=True) os.makedirs(eval_wav_dir, exist_ok=True) os.makedirs(tensorboard_dir, exist_ok=True) os.makedirs(meta_folder, exist_ok=True) checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt') input_path = os.path.join(args.base_dir, args.tacotron_input) if hparams.predict_linear: linear_dir = os.path.join(log_dir, 'linear-spectrograms') os.makedirs(linear_dir, exist_ok=True) log('Checkpoint path: {}'.format(checkpoint_path)) log('Loading training data from: {}'.format(input_path)) log('Using model: {}'.format(args.model)) log(hparams_debug_string()) #Start by setting a seed for repeatability tf.set_random_seed(hparams.tacotron_random_seed) #Set up data feeder coord = tf.train.Coordinator() with tf.variable_scope('datafeeder') as scope: feeder = Feeder(coord, input_path, hparams) #Set up model: global_step = tf.Variable(0, name='global_step', trainable=False) model, stats = model_train_mode(args, feeder, hparams, global_step) eval_model = model_test_mode(args, feeder, hparams, global_step) #Embeddings metadata char_embedding_meta = os.path.join(meta_folder, 'CharacterEmbeddings.tsv') if not os.path.isfile(char_embedding_meta): with open(char_embedding_meta, 'w', encoding='utf-8') as f: for symbol in symbols: if symbol == ' ': symbol = '\\s' #For visual purposes, swap space with \s f.write('{}\n'.format(symbol)) char_embedding_meta = char_embedding_meta.replace(log_dir, '..') #Potential Griffin-Lim GPU setup if hparams.GL_on_GPU: GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels), name='GLGPU_mel_inputs') GLGPU_lin_inputs = tf.placeholder(tf.float32, (None, hparams.num_freq), name='GLGPU_lin_inputs') GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow( GLGPU_mel_inputs, hparams) GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow( GLGPU_lin_inputs, hparams) #Book keeping step = 0 time_window = ValueWindow(100) loss_window = ValueWindow(100) saver = tf.train.Saver(max_to_keep=20) log('Tacotron training set to a maximum of {} steps'.format( args.tacotron_train_steps)) #Memory allocation on the GPU as needed config = tf.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True #Train with tf.Session(config=config) as sess: try: summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph) sess.run(tf.global_variables_initializer()) #saved model restoring if args.restore: # Restore saved model if the user requested it, default = True try: checkpoint_state = tf.train.get_checkpoint_state(save_dir) if (checkpoint_state and checkpoint_state.model_checkpoint_path): log('Loading checkpoint {}'.format( checkpoint_state.model_checkpoint_path), slack=True) ckpt = tf.train.load_checkpoint( checkpoint_state.model_checkpoint_path) variables = list( ckpt.get_variable_to_shape_map().keys()) #print('=====================PRINTING VARS===============================') 
#print(variables) #drop_source_layers = ['Tacotron_model/inference/inputs_embedding','Tacotron_model/Tacotron_model/inference/inputs_embedding/Adam_1','Tacotron_model/Tacotron_model/inference/inputs_embedding/Adam'] #for v in tf.global_variables(): # if not any(layer in v.op.name for layer in drop_source_layers): # print('Loading', v.op.name) # v.load(ckpt.get_tensor(v.op.name), session=sess) # Initialize all variables needed for DS, but not loaded from ckpt #init_op = tf.variables_initializer([v for v in tf.global_variables() if any(layer in v.op.name for layer in drop_source_layers)]) #sess.run(init_op) saver.restore(sess, checkpoint_state.model_checkpoint_path) else: log('No model to load at {}'.format(save_dir), slack=True) saver.save(sess, checkpoint_path, global_step=global_step) except tf.errors.OutOfRangeError as e: log('Cannot restore checkpoint: {}'.format(e), slack=True) else: log('Starting new training!', slack=True) saver.save(sess, checkpoint_path, global_step=global_step) #initializing feeder feeder.start_threads(sess) #Training loop while not coord.should_stop() and step < args.tacotron_train_steps: start_time = time.time() step, loss, opt = sess.run( [global_step, model.loss, model.optimize]) time_window.append(time.time() - start_time) loss_window.append(loss) message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( step, time_window.average, loss, loss_window.average) log(message, end='\r', slack=(step % args.checkpoint_interval == 0)) if np.isnan(loss): log('Loss exploded to {:.5f} at step {}'.format( loss, step)) raise Exception('Loss exploded') if step % args.summary_interval == 0: log('\nWriting summary at step {}'.format(step)) summary_writer.add_summary(sess.run(stats), step) if step % args.eval_interval == 0: #Run eval and save eval stats log('\nRunning evaluation at step {}'.format(step)) eval_losses = [] before_losses = [] after_losses = [] stop_token_losses = [] linear_losses = [] linear_loss = None if hparams.predict_linear: for i in tqdm(range(feeder.test_steps)): eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p, lin_t = sess.run( [ eval_model.tower_loss[0], eval_model.tower_before_loss[0], eval_model.tower_after_loss[0], eval_model.tower_stop_token_loss[0], eval_model.tower_linear_loss[0], eval_model.tower_mel_outputs[0][0], eval_model.tower_mel_targets[0][0], eval_model.tower_targets_lengths[0][0], eval_model.tower_alignments[0][0], eval_model.tower_linear_outputs[0][0], eval_model.tower_linear_targets[0][0], ]) eval_losses.append(eloss) before_losses.append(before_loss) after_losses.append(after_loss) stop_token_losses.append(stop_token_loss) linear_losses.append(linear_loss) linear_loss = sum(linear_losses) / len(linear_losses) if hparams.GL_on_GPU: wav = sess.run(GLGPU_lin_outputs, feed_dict={GLGPU_lin_inputs: lin_p}) wav = audio.inv_preemphasis( wav, hparams.preemphasis, hparams.preemphasize) else: wav = audio.inv_linear_spectrogram( lin_p.T, hparams) audio.save_wav( wav, os.path.join( eval_wav_dir, 'step-{}-eval-wave-from-linear.wav'.format( step)), sr=hparams.sample_rate) else: for i in tqdm(range(feeder.test_steps)): eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, align = sess.run( [ eval_model.tower_loss[0], eval_model.tower_before_loss[0], eval_model.tower_after_loss[0], eval_model.tower_stop_token_loss[0], eval_model.tower_mel_outputs[0][0], eval_model.tower_mel_targets[0][0], eval_model.tower_targets_lengths[0][0], eval_model.tower_alignments[0][0] ]) 
eval_losses.append(eloss) before_losses.append(before_loss) after_losses.append(after_loss) stop_token_losses.append(stop_token_loss) eval_loss = sum(eval_losses) / len(eval_losses) before_loss = sum(before_losses) / len(before_losses) after_loss = sum(after_losses) / len(after_losses) stop_token_loss = sum(stop_token_losses) / len( stop_token_losses) log('Saving eval log to {}..'.format(eval_dir)) #Save some log to monitor model improvement on same unseen sequence if hparams.GL_on_GPU: wav = sess.run(GLGPU_mel_outputs, feed_dict={GLGPU_mel_inputs: mel_p}) wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) else: wav = audio.inv_mel_spectrogram(mel_p.T, hparams) audio.save_wav( wav, os.path.join( eval_wav_dir, 'step-{}-eval-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate) plot.plot_alignment( align, os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)), title='{}, {}, step={}, loss={:.5f}'.format( args.model, time_string(), step, eval_loss), max_len=t_len // hparams.outputs_per_step) plot.plot_spectrogram( mel_p, os.path.join( eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)), title='{}, {}, step={}, loss={:.5f}'.format( args.model, time_string(), step, eval_loss), target_spectrogram=mel_t, max_len=t_len) if hparams.predict_linear: plot.plot_spectrogram( lin_p, os.path.join( eval_plot_dir, 'step-{}-eval-linear-spectrogram.png'.format( step)), title='{}, {}, step={}, loss={:.5f}'.format( args.model, time_string(), step, eval_loss), target_spectrogram=lin_t, max_len=t_len, auto_aspect=True) log('Eval loss for global step {}: {:.3f}'.format( step, eval_loss)) log('Writing eval summary!') add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, eval_loss) if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300: #Save model and current global step saver.save(sess, checkpoint_path, global_step=global_step) log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..' 
) if hparams.predict_linear: input_seq, mel_prediction, linear_prediction, alignment, target, target_length, linear_target = sess.run( [ model.tower_inputs[0][0], model.tower_mel_outputs[0][0], model.tower_linear_outputs[0][0], model.tower_alignments[0][0], model.tower_mel_targets[0][0], model.tower_targets_lengths[0][0], model.tower_linear_targets[0][0], ]) #save predicted linear spectrogram to disk (debug) linear_filename = 'linear-prediction-step-{}.npy'.format( step) np.save(os.path.join(linear_dir, linear_filename), linear_prediction.T, allow_pickle=False) #save griffin lim inverted wav for debug (linear -> wav) if hparams.GL_on_GPU: wav = sess.run(GLGPU_lin_outputs, feed_dict={ GLGPU_lin_inputs: linear_prediction }) wav = audio.inv_preemphasis( wav, hparams.preemphasis, hparams.preemphasize) else: wav = audio.inv_linear_spectrogram( linear_prediction.T, hparams) audio.save_wav( wav, os.path.join( wav_dir, 'step-{}-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate) #Save real and predicted linear-spectrogram plot to disk (control purposes) plot.plot_spectrogram( linear_prediction, os.path.join( plot_dir, 'step-{}-linear-spectrogram.png'.format(step)), title='{}, {}, step={}, loss={:.5f}'.format( args.model, time_string(), step, loss), target_spectrogram=linear_target, max_len=target_length, auto_aspect=True) else: input_seq, mel_prediction, alignment, target, target_length = sess.run( [ model.tower_inputs[0][0], model.tower_mel_outputs[0][0], model.tower_alignments[0][0], model.tower_mel_targets[0][0], model.tower_targets_lengths[0][0], ]) #save predicted mel spectrogram to disk (debug) mel_filename = 'mel-prediction-step-{}.npy'.format(step) np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False) #save griffin lim inverted wav for debug (mel -> wav) if hparams.GL_on_GPU: wav = sess.run( GLGPU_mel_outputs, feed_dict={GLGPU_mel_inputs: mel_prediction}) wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) else: wav = audio.inv_mel_spectrogram( mel_prediction.T, hparams) audio.save_wav( wav, os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate) #save alignment plot to disk (control purposes) plot.plot_alignment( alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)), title='{}, {}, step={}, loss={:.5f}'.format( args.model, time_string(), step, loss), max_len=target_length // hparams.outputs_per_step) #save real and predicted mel-spectrogram plot to disk (control purposes) plot.plot_spectrogram( mel_prediction, os.path.join( plot_dir, 'step-{}-mel-spectrogram.png'.format(step)), title='{}, {}, step={}, loss={:.5f}'.format( args.model, time_string(), step, loss), target_spectrogram=target, max_len=target_length) log('Input at step {}: {}'.format( step, sequence_to_text(input_seq))) if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1: #Get current checkpoint state checkpoint_state = tf.train.get_checkpoint_state(save_dir) #Update Projector log('\nSaving Model Character Embeddings visualization..') add_embedding_stats(summary_writer, [model.embedding_table.name], [char_embedding_meta], checkpoint_state.model_checkpoint_path) log('Tacotron Character embeddings have been updated on tensorboard!' ) log('Tacotron training complete after {} global steps!'.format( args.tacotron_train_steps), slack=True) return save_dir except Exception as e: log('Exiting due to exception: {}'.format(e), slack=True) traceback.print_exc() coord.request_stop(e)
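# NOTE: the Tacotron variants above obtain their graphs from model_train_mode() /
# model_test_mode(), which are not defined in this section. The sketch below mirrors the
# single-GPU construction shown further down (create_model, initialize, add_loss,
# add_optimizer, _add_train_stats); the feeder's eval_* attributes and the is_evaluating
# flag are assumptions, not taken from this file.
def model_train_mode(args, feeder, hparams, global_step):
    # Build the training graph: model under a reusable scope, loss, optimizer, summaries.
    with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE):
        model = create_model(args.model, hparams)
        model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets,
                         feeder.token_targets,
                         targets_lengths=feeder.targets_lengths,
                         global_step=global_step, is_training=True,
                         split_infos=feeder.split_infos)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = _add_train_stats(model, hparams)
    return model, stats


def model_test_mode(args, feeder, hparams, global_step):
    # Reuse the same variables on the held-out data; no optimizer is attached.
    with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE):
        model = create_model(args.model, hparams)
        model.initialize(feeder.eval_inputs, feeder.eval_input_lengths,
                         feeder.eval_mel_targets, feeder.eval_token_targets,
                         targets_lengths=feeder.eval_targets_lengths,
                         global_step=global_step, is_training=False,
                         is_evaluating=True, split_infos=feeder.eval_split_infos)
        model.add_loss()
    return model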
def train(log_dir, args, hparams): save_dir = os.path.join(log_dir, 'taco_pretrained') plot_dir = os.path.join(log_dir, 'plots') wav_dir = os.path.join(log_dir, 'wavs') mel_dir = os.path.join(log_dir, 'mel-spectrograms') tensorboard_dir = os.path.join(log_dir, 'tacotron_events') os.makedirs(save_dir, exist_ok=True) os.makedirs(plot_dir, exist_ok=True) os.makedirs(wav_dir, exist_ok=True) os.makedirs(mel_dir, exist_ok=True) os.makedirs(tensorboard_dir, exist_ok=True) checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt') input_path = args.input_dir log('Checkpoint path: {}'.format(checkpoint_path)) log('Loading training data from: {}'.format(input_path)) log(hparams_debug_string()) #Start by setting a seed for repeatability tf.set_random_seed(hparams.tacotron_random_seed) #Set up data feeder coord = tf.train.Coordinator() with tf.variable_scope('datafeeder') as scope: feeder = Feeder(coord, input_path, hparams) #Set up model: global_step = tf.Variable(0, name='global_step', trainable=False) with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope: model = Tacotron(hparams) model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.token_targets, targets_lengths=feeder.targets_lengths, global_step=global_step, is_training=True, split_infos=feeder.split_infos) model.add_loss() model.add_optimizer(global_step) stats = _add_train_stats(model, hparams) GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels), name='GLGPU_mel_inputs') GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(GLGPU_mel_inputs, hparams) #Book keeping step = 0 time_window = ValueWindow(100) loss_window = ValueWindow(100) saver = tf.train.Saver(max_to_keep=20) log('Tacotron training set to a maximum of {} steps'.format(args.tacotron_train_steps)) #Memory allocation on the GPU as needed config = tf.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True #Train with tf.Session(config=config) as sess: try: summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph) sess.run(tf.global_variables_initializer()) #saved model restoring if args.restore: # Restore saved model if the user requested it, default = True try: checkpoint_state = tf.train.get_checkpoint_state(save_dir) if (checkpoint_state and checkpoint_state.model_checkpoint_path): log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True) saver.restore(sess, checkpoint_state.model_checkpoint_path) else: log('No model to load at {}'.format(save_dir), slack=True) saver.save(sess, checkpoint_path, global_step=global_step) except tf.errors.OutOfRangeError as e: log('Cannot restore checkpoint: {}'.format(e), slack=True) else: log('Starting new training!', slack=True) saver.save(sess, checkpoint_path, global_step=global_step) #initializing feeder feeder.start_threads(sess) #Training loop while not coord.should_stop() and step < args.tacotron_train_steps: start_time = time.time() step, loss, opt = sess.run([global_step, model.loss, model.optimize]) time_window.append(time.time() - start_time) loss_window.append(loss) message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( step, time_window.average, loss, loss_window.average) log(message, end='\r', slack=(step % args.checkpoint_interval == 0)) if np.isnan(loss) or loss > 100.: log('Loss exploded to {:.5f} at step {}'.format(loss, step)) raise Exception('Loss exploded') if step % args.summary_interval == 0: log('\nWriting summary at step: {}'.format(step)) 
summary_writer.add_summary(sess.run(stats), step) if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300: #Save model and current global step saver.save(sess, checkpoint_path, global_step=global_step) log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..') input_seq, mel_prediction = sess.run([ model.tower_inputs[0][0], model.tower_mel_outputs[0][0], ]) #save predicted mel spectrogram to disk (debug) mel_filename = 'mel-prediction-step-{}.npy'.format(step) np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False) #save griffin lim inverted wav for debug (mel -> wav) wav = sess.run(GLGPU_mel_outputs, feed_dict={GLGPU_mel_inputs: mel_prediction}) wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate) log('Input at step {}: {}'.format(step, sequence_to_text(input_seq))) log('Tacotron training complete after {} global steps!'.format(args.tacotron_train_steps), slack=True) return save_dir except Exception as e: log('Exiting due to exception: {}'.format(e), slack=True) traceback.print_exc() coord.request_stop(e)
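# NOTE: the restore/bootstrap block above is repeated almost verbatim in several of the
# variants in this file. It is not factored out in the original scripts; the helper below
# is a sketch of the same logic (restore the newest checkpoint when available, otherwise
# write an initial checkpoint so later tooling always finds one).
def restore_or_init(sess, saver, save_dir, checkpoint_path, global_step, restore=True):
    if restore:
        try:
            checkpoint_state = tf.train.get_checkpoint_state(save_dir)
            if checkpoint_state and checkpoint_state.model_checkpoint_path:
                log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True)
                saver.restore(sess, checkpoint_state.model_checkpoint_path)
                return
            log('No model to load at {}'.format(save_dir), slack=True)
        except tf.errors.OutOfRangeError as e:
            log('Cannot restore checkpoint: {}'.format(e), slack=True)
    else:
        log('Starting new training!', slack=True)
    # No usable checkpoint: save the freshly initialized weights once.
    saver.save(sess, checkpoint_path, global_step=global_step)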
def train(log_dir, args, hparams, input_path): save_dir = os.path.join(log_dir, 'wave_pretrained') plot_dir = os.path.join(log_dir, 'plots') wav_dir = os.path.join(log_dir, 'wavs') eval_dir = os.path.join(log_dir, 'eval-dir') eval_plot_dir = os.path.join(eval_dir, 'plots') eval_wav_dir = os.path.join(eval_dir, 'wavs') tensorboard_dir = os.path.join(log_dir, 'wavenet_events') meta_folder = os.path.join(log_dir, 'metas') os.makedirs(save_dir, exist_ok=True) os.makedirs(plot_dir, exist_ok=True) os.makedirs(wav_dir, exist_ok=True) os.makedirs(eval_dir, exist_ok=True) os.makedirs(eval_plot_dir, exist_ok=True) os.makedirs(eval_wav_dir, exist_ok=True) os.makedirs(tensorboard_dir, exist_ok=True) os.makedirs(meta_folder, exist_ok=True) checkpoint_path = os.path.join(save_dir, 'wavenet_model.ckpt') input_path = os.path.join(args.base_dir, input_path) log('Checkpoint_path: {}'.format(checkpoint_path)) log('Loading training data from: {}'.format(input_path)) log('Using model: {}'.format(args.model)) log(hparams_debug_string()) # Start by setting a seed for repeatability tf.set_random_seed(hparams.wavenet_random_seed) # Set up data feeder coord = tf.train.Coordinator() with tf.variable_scope('datafeeder') as scope: feeder = Feeder(coord, input_path, args.base_dir, hparams) # Set up model global_step = tf.Variable(0, name='global_step', trainable=False) model, stats = model_train_mode(args, feeder, hparams, global_step) eval_model = model_test_mode(args, feeder, hparams, global_step) # Speaker Embeddings metadata if hparams.speakers_path is not None: speaker_embedding_meta = hparams.speakers_path else: speaker_embedding_meta = os.path.join(meta_folder, 'SpeakerEmbeddings.tsv') if not os.path.isfile(speaker_embedding_meta): with open(speaker_embedding_meta, 'w', encoding='utf-8') as f: for speaker in hparams.speakers: f.write('{}\n'.format(speaker)) speaker_embedding_meta = speaker_embedding_meta.replace(log_dir, '..') # book keeping step = 0 time_window = ValueWindow(100) loss_window = ValueWindow(100) sh_saver = create_shadow_saver(model, global_step) log('Wavenet training set to a maximum of {} steps'.format(args.wavenet_train_steps)) # Memory allocation on the memory config = tf.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True run_init = False # Train with tf.Session(config=config) as sess: try: summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph) sess.run(tf.global_variables_initializer()) # saved model restoring if args.restore: # Restore saved model if the user requested it, default = True try: checkpoint_state = tf.train.get_checkpoint_state(save_dir) if (checkpoint_state and checkpoint_state.model_checkpoint_path): log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True) load_averaged_model(sess, sh_saver, checkpoint_state.model_checkpoint_path) else: log('No model to load at {}'.format(save_dir), slack=True) if hparams.wavenet_weight_normalization: run_init = True except tf.errors.OutOfRangeError as e: log('Cannot restore checkpoint: {}'.format(e), slack=True) else: log('Starting new training!', slack=True) if hparams.wavenet_weight_normalization: run_init = True if run_init: log( '\nApplying Weight normalization in fresh training. 
Applying data dependent initialization forward pass..') # Create init_model init_model, _ = model_train_mode(args, feeder, hparams, global_step, init=True) # initializing feeder feeder.start_threads(sess) if run_init: # Run one forward pass for model parameters initialization (make prediction on init_batch) _ = sess.run(init_model.tower_y_hat) log('Data dependent initialization done. Starting training!') # Training loop while not coord.should_stop() and step < args.wavenet_train_steps: start_time = time.time() step, loss, opt = sess.run([global_step, model.loss, model.optimize]) time_window.append(time.time() - start_time) loss_window.append(loss) message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( step, time_window.average, loss, loss_window.average) log(message, end='\r', slack=(step % args.checkpoint_interval == 0)) if np.isnan(loss) or loss > 100: log('Loss exploded to {:.5f} at step {}'.format(loss, step)) raise Exception('Loss exploded') if step % args.summary_interval == 0: log('\nWriting summary at step {}'.format(step)) summary_writer.add_summary(sess.run(stats), step) if step % args.checkpoint_interval == 0 or step == args.wavenet_train_steps: save_log(sess, step, model, plot_dir, wav_dir, hparams=hparams, model_name=args.model) save_checkpoint(sess, sh_saver, checkpoint_path, global_step) if step % args.eval_interval == 0: log('\nEvaluating at step {}'.format(step)) eval_step(sess, step, eval_model, eval_plot_dir, eval_wav_dir, summary_writer=summary_writer, hparams=model._hparams, model_name=args.model) if hparams.gin_channels > 0 and ( step % args.embedding_interval == 0 or step == args.wavenet_train_steps or step == 1): # Get current checkpoint state checkpoint_state = tf.train.get_checkpoint_state(save_dir) # Update Projector log('\nSaving Model Speaker Embeddings visualization..') add_embedding_stats(summary_writer, [model.embedding_table.name], [speaker_embedding_meta], checkpoint_state.model_checkpoint_path) log('WaveNet Speaker embeddings have been updated on tensorboard!') log('Wavenet training complete after {} global steps'.format(args.wavenet_train_steps), slack=True) return save_dir except Exception as e: log('Exiting due to exception: {}'.format(e), slack=True) traceback.print_exc() coord.request_stop(e)
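# NOTE: the WaveNet variant above saves and restores exponential-moving-average ("shadow")
# weights through create_shadow_saver / load_averaged_model / save_checkpoint, none of
# which appear in this section. The sketch below shows one plausible implementation;
# `model.ema` and `model.variables` are assumed attributes, not confirmed by this file.
def create_shadow_saver(model, global_step=None):
    # Map each live variable to its EMA ("shadow") name so checkpoints are written and
    # read in terms of the averaged parameters.
    shadow_dict = {model.ema.average_name(v): v for v in model.variables}
    if global_step is not None:
        shadow_dict['global_step'] = global_step
    return tf.train.Saver(shadow_dict, max_to_keep=20)


def load_averaged_model(sess, sh_saver, checkpoint_path):
    sh_saver.restore(sess, checkpoint_path)


def save_checkpoint(sess, sh_saver, checkpoint_path, global_step):
    sh_saver.save(sess, checkpoint_path, global_step=global_step)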
def train(log_dir, args): save_dir = os.path.join(log_dir, 'pretrained/') checkpoint_path = os.path.join(save_dir, 'model.ckpt') input_path = os.path.join(args.base_dir, args.input) plot_dir = os.path.join(log_dir, 'plots') wav_dir = os.path.join(log_dir, 'wavs') mel_dir = os.path.join(log_dir, 'mel-spectrograms') os.makedirs(plot_dir, exist_ok=True) os.makedirs(wav_dir, exist_ok=True) os.makedirs(mel_dir, exist_ok=True) if hparams.predict_linear: linear_dir = os.path.join(log_dir, 'linear-spectrograms') os.makedirs(linear_dir, exist_ok=True) log('Checkpoint path: {}'.format(checkpoint_path)) log('Loading training data from: {}'.format(input_path)) log('Using model: {}'.format(args.model)) log(hparams_debug_string()) #Set up data feeder coord = tf.train.Coordinator() with tf.variable_scope('datafeeder') as scope: feeder = Feeder(coord, input_path, hparams) #Set up model: step_count = 0 try: #simple text file to keep count of global step with open(os.path.join(log_dir, 'step_counter.txt'), 'r') as file: step_count = int(file.read()) except: print( 'no step_counter file found, assuming there is no saved checkpoint' ) global_step = tf.Variable(step_count, name='global_step', trainable=False) with tf.variable_scope('model') as scope: model = create_model(args.model, hparams) if hparams.predict_linear: model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.token_targets, feeder.linear_targets) else: model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.token_targets) model.add_loss() model.add_optimizer(global_step) stats = add_stats(model) #Book keeping step = 0 save_step = 0 time_window = ValueWindow(100) loss_window = ValueWindow(100) saver = tf.train.Saver(max_to_keep=5) #Memory allocation on the GPU as needed config = tf.ConfigProto() config.gpu_options.allow_growth = True #Train with tf.Session(config=config) as sess: try: summary_writer = tf.summary.FileWriter(log_dir, sess.graph) sess.run(tf.global_variables_initializer()) #saved model restoring if args.restore: #Restore saved model if the user requested it, Default = True. 
try: checkpoint_state = tf.train.get_checkpoint_state(save_dir) except tf.errors.OutOfRangeError as e: log('Cannot restore checkpoint: {}'.format(e)) if (checkpoint_state and checkpoint_state.model_checkpoint_path): log('Loading checkpoint {}'.format( checkpoint_state.model_checkpoint_path)) saver.restore(sess, checkpoint_state.model_checkpoint_path) else: if not args.restore: log('Starting new training!') else: log('No model to load at {}'.format(save_dir)) #initiating feeder feeder.start_in_session(sess) #Training loop while not coord.should_stop(): start_time = time.time() step, loss, opt = sess.run( [global_step, model.loss, model.optimize]) time_window.append(time.time() - start_time) loss_window.append(loss) message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( step, time_window.average, loss, loss_window.average) log(message, end='\r') if loss > 100 or np.isnan(loss): log('Loss exploded to {:.5f} at step {}'.format( loss, step)) raise Exception('Loss exploded') if step % args.summary_interval == 0: log('\nWriting summary at step: {}'.format(step)) summary_writer.add_summary(sess.run(stats), step) if step % args.checkpoint_interval == 0: with open(os.path.join(log_dir, 'step_counter.txt'), 'w') as file: file.write(str(step)) log('Saving checkpoint to: {}-{}'.format( checkpoint_path, step)) saver.save(sess, checkpoint_path, global_step=step) save_step = step log('Saving alignment, Mel-Spectrograms and griffin-lim inverted waveform..' ) if hparams.predict_linear: input_seq, mel_prediction, linear_prediction, alignment, target = sess.run( [ model.inputs[0], model.mel_outputs[0], model.linear_outputs[0], model.alignments[0], model.mel_targets[0], ]) #save predicted linear spectrogram to disk (debug) linear_filename = 'linear-prediction-step-{}.npy'.format( step) np.save(os.path.join(linear_dir, linear_filename), linear_prediction.T, allow_pickle=False) #save griffin lim inverted wav for debug (linear -> wav) wav = audio.inv_linear_spectrogram(linear_prediction.T) audio.save_wav( wav, os.path.join( wav_dir, 'step-{}-waveform-linear.wav'.format(step))) else: input_seq, mel_prediction, alignment, target = sess.run( [ model.inputs[0], model.mel_outputs[0], model.alignments[0], model.mel_targets[0], ]) #save predicted mel spectrogram to disk (debug) mel_filename = 'mel-prediction-step-{}.npy'.format(step) np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False) #save griffin lim inverted wav for debug (mel -> wav) wav = audio.inv_mel_spectrogram(mel_prediction.T) audio.save_wav( wav, os.path.join(wav_dir, 'step-{}-waveform-mel.wav'.format(step))) #save alignment plot to disk (control purposes) plot.plot_alignment( alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)), info='{}, {}, step={}, loss={:.5f}'.format( args.model, time_string(), step, loss)) #save real mel-spectrogram plot to disk (control purposes) plot.plot_spectrogram( target, os.path.join( plot_dir, 'step-{}-real-mel-spectrogram.png'.format(step)), info='{}, {}, step={}, Real'.format( args.model, time_string(), step, loss)) #save predicted mel-spectrogram plot to disk (control purposes) plot.plot_spectrogram( mel_prediction, os.path.join( plot_dir, 'step-{}-pred-mel-spectrogram.png'.format(step)), info='{}, {}, step={}, loss={:.5}'.format( args.model, time_string(), step, loss)) log('Input at step {}: {}'.format( step, sequence_to_text(input_seq))) except Exception as e: log('Exiting due to exception: {}'.format(e), slack=True) traceback.print_exc() 
coord.request_stop(e)
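# NOTE: the older variant above persists the global step in a plain step_counter.txt next
# to the checkpoints and reads it back on startup. The pair of helpers below is a sketch
# of that pattern factored out of the inline code; it is not part of the original script.
def read_step_counter(log_dir):
    # Return the last saved global step, or 0 when no counter file (no checkpoint) exists.
    counter_path = os.path.join(log_dir, 'step_counter.txt')
    try:
        with open(counter_path, 'r') as f:
            return int(f.read())
    except (IOError, ValueError):
        return 0


def write_step_counter(log_dir, step):
    with open(os.path.join(log_dir, 'step_counter.txt'), 'w') as f:
        f.write(str(step))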
def train(log_dir, args, hparams, input_path): save_dir = os.path.join(log_dir, 'wave_pretrained/') eval_dir = os.path.join(log_dir, 'eval-dir') audio_dir = os.path.join(log_dir, 'wavs') plot_dir = os.path.join(log_dir, 'plots') wav_dir = os.path.join(log_dir, 'wavs') eval_audio_dir = os.path.join(eval_dir, 'wavs') eval_plot_dir = os.path.join(eval_dir, 'plots') checkpoint_path = os.path.join(save_dir, 'wavenet_model.ckpt') input_path = os.path.join(args.base_dir, input_path) os.makedirs(save_dir, exist_ok=True) os.makedirs(wav_dir, exist_ok=True) os.makedirs(audio_dir, exist_ok=True) os.makedirs(plot_dir, exist_ok=True) os.makedirs(eval_audio_dir, exist_ok=True) os.makedirs(eval_plot_dir, exist_ok=True) log('Checkpoint_path: {}'.format(checkpoint_path)) log('Loading training data from: {}'.format(input_path)) log('Using model: {}'.format(args.model)) log(hparams_debug_string()) #Start by setting a seed for repeatability tf.set_random_seed(hparams.wavenet_random_seed) #Set up data feeder coord = tf.train.Coordinator() with tf.variable_scope('datafeeder') as scope: feeder = Feeder(coord, input_path, args.base_dir, hparams) #Set up model global_step = tf.Variable(0, name='global_step', trainable=False) model, stats = model_train_mode(args, feeder, hparams, global_step) eval_model = model_test_mode(args, feeder, hparams, global_step) #book keeping step = 0 time_window = ValueWindow(100) loss_window = ValueWindow(100) sh_saver = create_shadow_saver(model, global_step) log('Wavenet training set to a maximum of {} steps'.format( args.wavenet_train_steps)) #Memory allocation on the memory config = tf.ConfigProto() config.gpu_options.allow_growth = True #Train with tf.Session(config=config) as sess: try: summary_writer = tf.summary.FileWriter(log_dir, sess.graph) sess.run(tf.global_variables_initializer()) #saved model restoring if args.restore: #Restore saved model if the user requested it, default = True try: checkpoint_state = tf.train.get_checkpoint_state(save_dir) except tf.errors.OutOfRangeError as e: log('Cannot restore checkpoint: {}'.format(e)) if (checkpoint_state and checkpoint_state.model_checkpoint_path): log('Loading checkpoint {}'.format( checkpoint_state.model_checkpoint_path)) load_averaged_model(sess, sh_saver, checkpoint_state.model_checkpoint_path) else: if not args.restore: log('Starting new training!') else: log('No model to load at {}'.format(save_dir)) #initializing feeder feeder.start_threads(sess) #Training loop while not coord.should_stop() and step < args.wavenet_train_steps: start_time = time.time() step, y_hat, loss, opt = sess.run( [global_step, model.y_hat, model.loss, model.optimize]) time_window.append(time.time() - start_time) loss_window.append(loss) message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( step, time_window.average, loss, loss_window.average) log(message, end='\r') if loss > 100 or np.isnan(loss): log('Loss exploded to {:.5f} at step {}'.format( loss, step)) raise Exception('Loss exploded') if step % args.summary_interval == 0: log('\nWriting summary at step {}'.format(step)) summary_writer.add_summary(sess.run(stats), step) if step % args.checkpoint_interval == 0 or step == args.wavenet_train_steps: save_log(sess, step, model, plot_dir, audio_dir, hparams=hparams) save_checkpoint(sess, sh_saver, checkpoint_path, global_step) if step % args.eval_interval == 0: log('\nEvaluating at step {}'.format(step)) eval_step(sess, step, eval_model, eval_plot_dir, eval_audio_dir, summary_writer=summary_writer, hparams=model._hparams) 
log('Wavenet training complete after {} global steps'.format( args.wavenet_train_steps)) return save_dir except Exception as e: log('Exiting due to Exception: {}'.format(e)) traceback.print_exc() coord.request_stop(e)
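# NOTE: every training loop in this file aborts when the loss becomes NaN or clearly
# diverges (the inline threshold varies between 100 and 1000 across variants). The helper
# below is a sketch of that guard factored out; it is not part of the original scripts.
def check_loss(loss, step, max_loss=100.):
    if np.isnan(loss) or loss > max_loss:
        log('Loss exploded to {:.5f} at step {}'.format(loss, step))
        raise Exception('Loss exploded')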
def train(log_dir, args, hparams): save_dir = os.path.join(log_dir, 'taco_pretrained') plot_dir = os.path.join(log_dir, 'plots') wav_dir = os.path.join(log_dir, 'wavs') mel_dir = os.path.join(log_dir, 'mel-spectrograms') eval_dir = os.path.join(log_dir, 'eval-dir') eval_plot_dir = os.path.join(eval_dir, 'plots') eval_wav_dir = os.path.join(eval_dir, 'wavs') tensorboard_dir = os.path.join(log_dir, 'tacotron_events') meta_folder = os.path.join(log_dir, 'metas') os.makedirs(save_dir, exist_ok=True) os.makedirs(plot_dir, exist_ok=True) os.makedirs(wav_dir, exist_ok=True) os.makedirs(mel_dir, exist_ok=True) os.makedirs(eval_dir, exist_ok=True) os.makedirs(eval_plot_dir, exist_ok=True) os.makedirs(eval_wav_dir, exist_ok=True) os.makedirs(tensorboard_dir, exist_ok=True) os.makedirs(meta_folder, exist_ok=True) checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt') input_path = os.path.join(args.base_dir, args.tacotron_input) if hparams.predict_linear: linear_dir = os.path.join(log_dir, 'linear-spectrograms') os.makedirs(linear_dir, exist_ok=True) log('Checkpoint path: {}'.format(checkpoint_path)) log('Loading training data from: {}'.format(input_path)) log('Using model: {}'.format(args.model)) log(hparams_debug_string()) # Start by setting a seed for repeatability tf.set_random_seed(hparams.tacotron_random_seed) # Set up data feeder coord = tf.train.Coordinator() with tf.variable_scope('datafeeder') as scope: feeder = Feeder(coord, input_path, hparams) # with open("split_train.txt", "w") as file: # for line in feeder._train_meta: # for k in range(len(line)-1): # file.write(line[k]+"|") # file.write(line[-1]+"\n") # with open("split_validation.txt", "w") as file: # for line in feeder._test_meta: # for k in range(len(line)-1): # file.write(line[k]+"|") # file.write(line[-1]+"\n") # print("Feeder init done !") # assert False # Set up model: global_step = tf.Variable(0, name='global_step', trainable=False) model, stats = model_train_mode(args, feeder, hparams, global_step) eval_model = model_test_mode(args, feeder, hparams, global_step) # TODO Visualize embeddings # Embeddings inputs metadata char_embedding_meta = os.path.join(meta_folder, 'CharacterEmbeddings.tsv') if not os.path.isfile(char_embedding_meta): with open(char_embedding_meta, 'w', encoding='utf-8') as f: for symbol in symbols: if symbol == ' ': symbol = '\\s' # For visual purposes, swap space with \s f.write('{}\n'.format(symbol)) char_embedding_meta = char_embedding_meta.replace(log_dir, '..') # # Embeddings speaker metadata # speaker_embedding_meta = os.path.join(meta_folder, 'SpeakerEmbeddings.tsv') # if not os.path.isfile(speaker_embedding_meta): # with open(speaker_embedding_meta, 'w', encoding='utf-8') as f: # f.write("Filename\tSpeaker\n") # for description in feeder._metadata: # f.write('{}\t{}\n'.format(description[1], description[-1])) # speaker_embedding_meta = speaker_embedding_meta.replace(log_dir, '..') # Book keeping step = 0 time_window = ValueWindow(100) loss_window = ValueWindow(100) saver = tf.train.Saver(max_to_keep=5) log('Tacotron training set to a maximum of {} steps'.format( args.tacotron_train_steps)) # Memory allocation on the GPU as needed config = tf.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True # Train with tf.Session(config=config) as sess: try: summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph) sess.run(tf.global_variables_initializer()) # saved model restoring if args.restore: # Restore saved model if the user requested it, default = True 
try: checkpoint_state = tf.train.get_checkpoint_state(save_dir) if (checkpoint_state and checkpoint_state.model_checkpoint_path): log('Loading checkpoint {}'.format( checkpoint_state.model_checkpoint_path), slack=True) saver.restore(sess, checkpoint_state.model_checkpoint_path) else: log('No model to load at {}'.format(save_dir), slack=True) saver.save(sess, checkpoint_path, global_step=global_step) except tf.errors.OutOfRangeError as e: log('Cannot restore checkpoint: {}'.format(e), slack=True) else: log('Starting new training!', slack=True) saver.save(sess, checkpoint_path, global_step=global_step) # initializing feeder feeder.start_threads(sess) # Training loop while not coord.should_stop() and step < args.tacotron_train_steps: start_time = time.time() step, loss, opt = sess.run( [global_step, model.loss, model.optimize]) time_window.append(time.time() - start_time) loss_window.append(loss) message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( step, time_window.average, loss, loss_window.average) log(message, end='\r', slack=(step % args.checkpoint_interval == 0)) if loss > 100 or np.isnan(loss): log('Loss exploded to {:.5f} at step {}'.format( loss, step)) raise Exception('Loss exploded') if step % args.summary_interval == 0: log('\nWriting summary at step {}'.format(step)) summary_writer.add_summary(sess.run(stats), step) if step % args.eval_interval == 0: # Run eval and save eval stats log('\nRunning evaluation at step {}'.format(step)) eval_losses = [] before_losses = [] after_losses = [] stop_token_losses = [] linear_losses = [] linear_loss = None speaker_losses = [] speaker_loss = None eval_run = [ eval_model.tower_loss[0], eval_model.tower_before_loss[0], eval_model.tower_after_loss[0], eval_model.tower_stop_token_loss[0], eval_model.tower_mel_outputs[0][0], eval_model.tower_mel_targets[0][0], eval_model.tower_targets_lengths[0][0], eval_model.tower_alignments[0][0] ] if hparams.predict_linear: eval_run.append(eval_model.tower_linear_loss[0]) eval_run.append(eval_model.tower_linear_outputs[0][0]) eval_run.append(eval_model.tower_linear_targets[0][0]) if hparams.tacotron_multi_speaker: eval_run.append(eval_model.tower_speaker_loss[0]) for i in tqdm(range(feeder.test_steps)): blob = sess.run(eval_run) eloss = blob[0] before_loss = blob[1] after_loss = blob[2] stop_token_loss = blob[3] mel_p = blob[4] mel_t = blob[5] t_len = blob[6] align = blob[7] if hparams.predict_linear: linear_loss = blob[8] lin_p = blob[9] lin_t = blob[10] if hparams.tacotron_multi_speaker: speaker_p = blob[11] eval_losses.append(eloss) before_losses.append(before_loss) after_losses.append(after_loss) stop_token_losses.append(stop_token_loss) if hparams.predict_linear: linear_losses.append(linear_loss) if hparams.tacotron_multi_speaker: speaker_losses.append(speaker_p) if hparams.predict_linear: linear_loss = sum(linear_losses) / len(linear_losses) wav = audio.inv_linear_spectrogram(lin_p.T, hparams) audio.save_wav( wav, os.path.join( eval_wav_dir, 'step-{}-eval-wave-from-linear.wav'.format( step)), sr=hparams.sample_rate) if hparams.tacotron_multi_speaker: speaker_loss = sum(speaker_losses) / len( speaker_losses) # if hparams.predict_linear: # for i in tqdm(range(feeder.test_steps)): # eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p, lin_t = sess.run( # [ # eval_model.tower_loss[0], eval_model.tower_before_loss[0], # eval_model.tower_after_loss[0], # eval_model.tower_stop_token_loss[0], eval_model.tower_linear_loss[0], # 
eval_model.tower_mel_outputs[0][0], # eval_model.tower_mel_targets[0][0], eval_model.tower_targets_lengths[0][0], # eval_model.tower_alignments[0][0], eval_model.tower_linear_outputs[0][0], # eval_model.tower_linear_targets[0][0], # ]) # eval_losses.append(eloss) # before_losses.append(before_loss) # after_losses.append(after_loss) # stop_token_losses.append(stop_token_loss) # linear_losses.append(linear_loss) # # print("len(eval_loss) : {}".format(len(eval_loss))) # # print("len(before_losses) : {}".format(len(before_losses))) # # print("len(after_losses) : {}".format(len(after_losses))) # # print("len(stop_token_losses) : {}".format(len(stop_token_losses))) # # print("len(linear_losses) : {}".format(len(linear_losses))) # # print("division par : {}, dans hparams.predict_linear".format(len(linear_losses))) # linear_loss = sum(linear_losses) / len(linear_losses) # # wav = audio.inv_linear_spectrogram(lin_p.T, hparams) # audio.save_wav(wav, # os.path.join(eval_wav_dir, 'step-{}-eval-wave-from-linear.wav'.format(step)), # sr=hparams.sample_rate) # # else: # for i in tqdm(range(feeder.test_steps)): # eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, align = sess.run([ # eval_model.tower_loss[0], eval_model.tower_before_loss[0], # eval_model.tower_after_loss[0], # eval_model.tower_stop_token_loss[0], eval_model.tower_mel_outputs[0][0], # eval_model.tower_mel_targets[0][0], # eval_model.tower_targets_lengths[0][0], eval_model.tower_alignments[0][0] # ]) # eval_losses.append(eloss) # before_losses.append(before_loss) # after_losses.append(after_loss) # stop_token_losses.append(stop_token_loss) # print("len(eval_loss) : {}".format(len(eval_loss))) # print("len(before_losses) : {}".format(len(before_losses))) # print("len(after_losses) : {}".format(len(after_losses))) # print("len(stop_token_losses) : {}".format(len(stop_token_losses))) eval_loss = sum(eval_losses) / len(eval_losses) before_loss = sum(before_losses) / len(before_losses) after_loss = sum(after_losses) / len(after_losses) stop_token_loss = sum(stop_token_losses) / len( stop_token_losses) log('Saving eval log to {}..'.format(eval_dir)) # Save some log to monitor model improvement on same unseen sequence wav = audio.inv_mel_spectrogram(mel_p.T, hparams) audio.save_wav( wav, os.path.join( eval_wav_dir, 'step-{}-eval-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate) plot.plot_alignment( align, os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)), title='{}, {}, step={}, loss={:.5f}'.format( args.model, time_string(), step, eval_loss), max_len=t_len // hparams.outputs_per_step) plot.plot_spectrogram( mel_p, os.path.join( eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)), title='{}, {}, step={}, loss={:.5f}'.format( args.model, time_string(), step, eval_loss), target_spectrogram=mel_t, max_len=t_len) if hparams.predict_linear: plot.plot_spectrogram( lin_p, os.path.join( eval_plot_dir, 'step-{}-eval-linear-spectrogram.png'.format( step)), title='{}, {}, step={}, loss={:.5f}'.format( args.model, time_string(), step, eval_loss), target_spectrogram=lin_t, max_len=t_len, auto_aspect=True) log('Eval loss for global step {}: {:.3f}'.format( step, eval_loss)) log('Writing eval summary!') add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, eval_loss, speaker_loss) if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300: # Save model and current global step saver.save(sess, checkpoint_path, global_step=global_step) 
log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..' ) if hparams.predict_linear: input_seq, mel_prediction, linear_prediction, alignment, target, target_length, linear_target = sess.run( [ model.tower_inputs[0][0], model.tower_mel_outputs[0][0], model.tower_linear_outputs[0][0], model.tower_alignments[0][0], model.tower_mel_targets[0][0], model.tower_targets_lengths[0][0], model.tower_linear_targets[0][0], ]) # save predicted linear spectrogram to disk (debug) linear_filename = 'linear-prediction-step-{}.npy'.format( step) np.save(os.path.join(linear_dir, linear_filename), linear_prediction.T, allow_pickle=False) # save griffin lim inverted wav for debug (linear -> wav) wav = audio.inv_linear_spectrogram( linear_prediction.T, hparams) audio.save_wav( wav, os.path.join( wav_dir, 'step-{}-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate) # Save real and predicted linear-spectrogram plot to disk (control purposes) plot.plot_spectrogram( linear_prediction, os.path.join( plot_dir, 'step-{}-linear-spectrogram.png'.format(step)), title='{}, {}, step={}, loss={:.5f}'.format( args.model, time_string(), step, loss), target_spectrogram=linear_target, max_len=target_length, auto_aspect=True) else: input_seq, mel_prediction, alignment, target, target_length = sess.run( [ model.tower_inputs[0][0], model.tower_mel_outputs[0][0], model.tower_alignments[0][0], model.tower_mel_targets[0][0], model.tower_targets_lengths[0][0], ]) # save predicted mel spectrogram to disk (debug) mel_filename = 'mel-prediction-step-{}.npy'.format(step) np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False) # save griffin lim inverted wav for debug (mel -> wav) wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams) audio.save_wav( wav, os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate) # save alignment plot to disk (control purposes) plot.plot_alignment( alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)), title='{}, {}, step={}, loss={:.5f}'.format( args.model, time_string(), step, loss), max_len=target_length // hparams.outputs_per_step) # save real and predicted mel-spectrogram plot to disk (control purposes) plot.plot_spectrogram( mel_prediction, os.path.join( plot_dir, 'step-{}-mel-spectrogram.png'.format(step)), title='{}, {}, step={}, loss={:.5f}'.format( args.model, time_string(), step, loss), target_spectrogram=target, max_len=target_length) # TODO Find a way to revert encoded IPA to original IPA or original text # log('Input at step {}: {}'.format(step, sequence_to_text(input_seq))) if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1: #Get current checkpoint_backup state # checkpoint_state = tf.train.get_checkpoint_state(save_dir) checkpoint_state = tf.train.get_checkpoint_state(save_dir) # TODO Visualize embeddings #Update Projector log('\nSaving Model Character Embeddings visualization..') # add_embedding_stats(summary_writer, [model.embedding_table.name], [char_embedding_meta], checkpoint_state.model_checkpoint_path) # add_embedding_stats(summary_writer, [model.embedding_speaker.name], [char_embedding_meta], checkpoint_state.model_checkpoint_path) log('Tacotron Character embeddings have been updated on tensorboard!' ) log('Tacotron training complete after {} global steps!'.format( args.tacotron_train_steps), slack=True) return save_dir except Exception as e: log('Exiting due to exception: {}'.format(e), slack=True) traceback.print_exc() coord.request_stop(e)
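# NOTE: the evaluation branches above report their averaged losses through
# add_eval_stats(), which is not defined in this section. The sketch below matches the
# call signatures used here (speaker_loss is optional for the multi-speaker variant) and
# writes the scalars directly as a tf.Summary proto; the tag names are assumptions.
def add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss,
                   stop_token_loss, eval_loss, speaker_loss=None):
    values = [
        tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/eval_before_loss', simple_value=before_loss),
        tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/eval_after_loss', simple_value=after_loss),
        tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/stop_token_loss', simple_value=stop_token_loss),
        tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/eval_loss', simple_value=eval_loss),
    ]
    if linear_loss is not None:
        values.append(tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/eval_linear_loss', simple_value=linear_loss))
    if speaker_loss is not None:
        values.append(tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/eval_speaker_loss', simple_value=speaker_loss))
    summary_writer.add_summary(tf.Summary(value=values), step)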
def train(log_dir, config): config.data_paths = config.data_paths data_dirs = [os.path.join(data_path, "data") \ for data_path in config.data_paths] num_speakers = len(data_dirs) config.num_test = config.num_test_per_speaker * num_speakers if num_speakers > 1 and hparams.model_type not in ["deepvoice", "simple"]: raise Exception("[!] Unknown model_type for multi-speaker: {}".format( hparams.model_type)) commit = get_git_commit() if config.git else 'None' checkpoint_path = os.path.join(log_dir, 'model.ckpt') log(' [*] git rev-parse HEAD:\n%s' % get_git_revision_hash()) log('=' * 50) #log(' [*] git diff:\n%s' % get_git_diff()) log('=' * 50) log(' [*] Checkpoint path: %s' % checkpoint_path) log(' [*] Loading training data from: %s' % data_dirs) log(' [*] Using model: %s' % config.model_dir) log(hparams_debug_string()) # Set up DataFeeder: coord = tf.train.Coordinator() with tf.variable_scope('datafeeder') as scope: train_feeder = DataFeeder(coord, data_dirs, hparams, config, 32, data_type='train', batch_size=hparams.batch_size) test_feeder = DataFeeder(coord, data_dirs, hparams, config, 8, data_type='test', batch_size=config.num_test) # Set up model: is_randomly_initialized = config.initialize_path is None global_step = tf.Variable(0, name='global_step', trainable=False) with tf.variable_scope('model') as scope: model = create_model(hparams) model.initialize(train_feeder.inputs, train_feeder.input_lengths, num_speakers, train_feeder.speaker_id, train_feeder.mel_targets, train_feeder.linear_targets, train_feeder.loss_coeff, is_randomly_initialized=is_randomly_initialized) model.add_loss() model.add_optimizer(global_step) train_stats = add_stats(model, scope_name='stats') # legacy with tf.variable_scope('model', reuse=True) as scope: test_model = create_model(hparams) test_model.initialize(test_feeder.inputs, test_feeder.input_lengths, num_speakers, test_feeder.speaker_id, test_feeder.mel_targets, test_feeder.linear_targets, test_feeder.loss_coeff, rnn_decoder_test_mode=True, is_randomly_initialized=is_randomly_initialized) test_model.add_loss() test_stats = add_stats(test_model, model, scope_name='test') test_stats = tf.summary.merge([test_stats, train_stats]) # Bookkeeping: step = 0 time_window = ValueWindow(100) loss_window = ValueWindow(100) saver = tf.train.Saver(max_to_keep=None, keep_checkpoint_every_n_hours=2) sess_config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True) sess_config.gpu_options.allow_growth = True # Train! #with tf.Session(config=sess_config) as sess: with tf.Session() as sess: try: summary_writer = tf.summary.FileWriter(log_dir, sess.graph) sess.run(tf.global_variables_initializer()) if config.load_path: # Restore from a checkpoint if the user requested it. restore_path = get_most_recent_checkpoint(config.model_dir) saver.restore(sess, restore_path) log('Resuming from checkpoint: %s at commit: %s' % (restore_path, commit), slack=True) elif config.initialize_path: restore_path = get_most_recent_checkpoint( config.initialize_path) saver.restore(sess, restore_path) log('Initialized from checkpoint: %s at commit: %s' % (restore_path, commit), slack=True) zero_step_assign = tf.assign(global_step, 0) sess.run(zero_step_assign) start_step = sess.run(global_step) log('=' * 50) log(' [*] Global step is reset to {}'. 
\ format(start_step)) log('=' * 50) else: log('Starting new training run at commit: %s' % commit, slack=True) start_step = sess.run(global_step) train_feeder.start_in_session(sess, start_step) test_feeder.start_in_session(sess, start_step) while not coord.should_stop(): start_time = time.time() step, loss, opt = sess.run( [global_step, model.loss_without_coeff, model.optimize], feed_dict=model.get_dummy_feed_dict()) time_window.append(time.time() - start_time) loss_window.append(loss) message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % ( step, time_window.average, loss, loss_window.average) log(message, slack=(step % config.checkpoint_interval == 0)) if loss > 100 or math.isnan(loss): log('Loss exploded to %.05f at step %d!' % (loss, step), slack=True) raise Exception('Loss Exploded') if step % config.summary_interval == 0: log('Writing summary at step: %d' % step) feed_dict = { **model.get_dummy_feed_dict(), **test_model.get_dummy_feed_dict() } summary_writer.add_summary( sess.run(test_stats, feed_dict=feed_dict), step) if step % config.checkpoint_interval == 0: log('Saving checkpoint to: %s-%d' % (checkpoint_path, step)) saver.save(sess, checkpoint_path, global_step=step) if step % config.test_interval == 0: log('Saving audio and alignment...') num_test = config.num_test fetches = [ model.inputs[:num_test], model.linear_outputs[:num_test], model.alignments[:num_test], test_model.inputs[:num_test], test_model.linear_outputs[:num_test], test_model.alignments[:num_test], ] feed_dict = { **model.get_dummy_feed_dict(), **test_model.get_dummy_feed_dict() } sequences, spectrograms, alignments, \ test_sequences, test_spectrograms, test_alignments = \ sess.run(fetches, feed_dict=feed_dict) save_and_plot(sequences[:1], spectrograms[:1], alignments[:1], log_dir, step, loss, "train") save_and_plot(test_sequences, test_spectrograms, test_alignments, log_dir, step, loss, "test") except Exception as e: log('Exiting due to exception: %s' % e, slack=True) traceback.print_exc() coord.request_stop(e)
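# --- Hedged sketch (assumption, not from this file): the multi-speaker variant above restores
# weights through get_most_recent_checkpoint(), defined elsewhere in its repo. A plausible
# stand-in simply delegates to TF1's checkpoint index; the real helper may parse filenames instead.
import tensorflow as tf

def get_most_recent_checkpoint(checkpoint_dir):
    # tf.train.latest_checkpoint reads the 'checkpoint' index file in the directory
    # and returns the newest save path (or None if nothing has been saved yet).
    return tf.train.latest_checkpoint(checkpoint_dir)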
def train(log_dir, args, hparams): save_dir = os.path.join(log_dir, 'taco_pretrained/') checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt') input_path = os.path.join(args.base_dir, args.tacotron_input) plot_dir = os.path.join(log_dir, 'plots') wav_dir = os.path.join(log_dir, 'wavs') mel_dir = os.path.join(log_dir, 'mel-spectrograms') eval_dir = os.path.join(log_dir, 'eval-dir') eval_plot_dir = os.path.join(eval_dir, 'plots') eval_wav_dir = os.path.join(eval_dir, 'wavs') os.makedirs(eval_dir, exist_ok=True) os.makedirs(plot_dir, exist_ok=True) os.makedirs(wav_dir, exist_ok=True) os.makedirs(mel_dir, exist_ok=True) os.makedirs(eval_plot_dir, exist_ok=True) os.makedirs(eval_wav_dir, exist_ok=True) if hparams.predict_linear: linear_dir = os.path.join(log_dir, 'linear-spectrograms') os.makedirs(linear_dir, exist_ok=True) log('Checkpoint path: {}'.format(checkpoint_path)) log('Loading training data from: {}'.format(input_path)) log('Using model: {}'.format(args.model)) log(hparams_debug_string()) #Start by setting a seed for repeatability tf.set_random_seed(hparams.tacotron_random_seed) #Set up data feeder coord = tf.train.Coordinator() with tf.variable_scope('datafeeder') as scope: feeder = Feeder(coord, input_path, hparams) #Set up model: global_step = tf.Variable(0, name='global_step', trainable=False) model, stats = model_train_mode(args, feeder, hparams, global_step) eval_model = model_test_mode(args, feeder, hparams, global_step) #Book keeping step = 0 time_window = ValueWindow(100) loss_window = ValueWindow(100) saver = tf.train.Saver(max_to_keep=5) log('Tacotron training set to a maximum of {} steps'.format(args.tacotron_train_steps)) #Memory allocation on the GPU as needed config = tf.ConfigProto() config.gpu_options.allow_growth = True #Train with tf.Session(config=config) as sess: try: summary_writer = tf.summary.FileWriter(log_dir, sess.graph) sess.run(tf.global_variables_initializer()) #saved model restoring if args.restore: #Restore saved model if the user requested it, Default = True. 
try: checkpoint_state = tf.train.get_checkpoint_state(save_dir) except tf.errors.OutOfRangeError as e: log('Cannot restore checkpoint: {}'.format(e)) if (checkpoint_state and checkpoint_state.model_checkpoint_path): log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path)) saver.restore(sess, checkpoint_state.model_checkpoint_path) else: if not args.restore: log('Starting new training!') else: log('No model to load at {}'.format(save_dir)) #initializing feeder feeder.start_threads(sess) #Training loop while not coord.should_stop() and step < args.tacotron_train_steps: start_time = time.time() step, loss, opt = sess.run([global_step, model.loss, model.optimize]) time_window.append(time.time() - start_time) loss_window.append(loss) message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( step, time_window.average, loss, loss_window.average) log(message, end='\r') if np.isnan(loss): log('Loss exploded to {:.5f} at step {}'.format(loss, step)) raise Exception('Loss exploded') if step % args.summary_interval == 0: log('\nWriting summary at step {}'.format(step)) summary_writer.add_summary(sess.run(stats), step) if step % args.eval_interval == 0: #Run eval and save eval stats log('\nRunning evaluation at step {}'.format(step)) eval_losses = [] before_losses = [] after_losses = [] stop_token_losses = [] linear_losses = [] linear_loss = None if hparams.predict_linear: for i in tqdm(range(feeder.test_steps)): eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p = sess.run( [eval_model.loss, eval_model.before_loss, eval_model.after_loss, eval_model.stop_token_loss, eval_model.linear_loss, eval_model.mel_outputs[0], eval_model.mel_targets[0], eval_model.targets_lengths[0], eval_model.alignments[0], eval_model.linear_outputs[0]]) eval_losses.append(eloss) before_losses.append(before_loss) after_losses.append(after_loss) stop_token_losses.append(stop_token_loss) linear_losses.append(linear_loss) linear_loss = sum(linear_losses) / len(linear_losses) wav = audio.inv_linear_spectrogram(lin_p.T, hparams) audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-waveform-linear.wav'.format(step)), sr=hparams.sample_rate) else: for i in tqdm(range(feeder.test_steps)): eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, align = sess.run( [eval_model.loss, eval_model.before_loss, eval_model.after_loss, eval_model.stop_token_loss, eval_model.mel_outputs[0], eval_model.mel_targets[0], eval_model.targets_lengths[0], eval_model.alignments[0]]) eval_losses.append(eloss) before_losses.append(before_loss) after_losses.append(after_loss) stop_token_losses.append(stop_token_loss) eval_loss = sum(eval_losses) / len(eval_losses) before_loss = sum(before_losses) / len(before_losses) after_loss = sum(after_losses) / len(after_losses) stop_token_loss = sum(stop_token_losses) / len(stop_token_losses) log('Saving eval log to {}..'.format(eval_dir)) #Save some log to monitor model improvement on same unseen sequence wav = audio.inv_mel_spectrogram(mel_p.T, hparams) audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-waveform-mel.wav'.format(step)), sr=hparams.sample_rate) plot.plot_alignment(align, os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)), info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eloss), max_len=t_len // hparams.outputs_per_step) plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)), info='{}, {}, 
step={}, loss={:.5}'.format(args.model, time_string(), step, eloss), target_spectrogram=mel_t, max_len=t_len) log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss)) log('Writing eval summary!') add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, eval_loss) if step % args.checkpoint_interval == 0: #Save model and current global step saver.save(sess, checkpoint_path, global_step=global_step) log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..') if hparams.predict_linear: input_seq, mel_prediction, linear_prediction, alignment, target, target_length = sess.run([ model.inputs[0], model.mel_outputs[0], model.linear_outputs[0], model.alignments[0], model.mel_targets[0], model.targets_lengths[0], ]) #save predicted linear spectrogram to disk (debug) linear_filename = 'linear-prediction-step-{}.npy'.format(step) np.save(os.path.join(linear_dir, linear_filename), linear_prediction.T, allow_pickle=False) #save griffin lim inverted wav for debug (linear -> wav) wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams) audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate) else: input_seq, mel_prediction, alignment, target, target_length = sess.run([model.inputs[0], model.mel_outputs[0], model.alignments[0], model.mel_targets[0], model.targets_lengths[0], ]) #save predicted mel spectrogram to disk (debug) mel_filename = 'mel-prediction-step-{}.npy'.format(step) np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False) #save griffin lim inverted wav for debug (mel -> wav) wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams) audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate) #save alignment plot to disk (control purposes) plot.plot_alignment(alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)), info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss), max_len=target_length // hparams.outputs_per_step) #save real and predicted mel-spectrogram plot to disk (control purposes) plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir, 'step-{}-mel-spectrogram.png'.format(step)), info='{}, {}, step={}, loss={:.5}'.format(args.model, time_string(), step, loss), target_spectrogram=target, max_len=target_length) log('Input at step {}: {}'.format(step, sequence_to_text(input_seq))) log('Tacotron training complete after {} global steps!'.format(args.tacotron_train_steps)) return save_dir except Exception as e: log('Exiting due to exception: {}'.format(e)) traceback.print_exc() coord.request_stop(e)
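# --- Hedged sketch (assumption, not from this file): the eval branch above ends with
# add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss,
# eval_loss). A helper like this typically pushes manual scalar summaries so the eval losses
# show up in TensorBoard; the real implementation and tag names may differ.
import tensorflow as tf

def add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss,
                   stop_token_loss, eval_loss):
    values = [
        tf.Summary.Value(tag='eval_stats/eval_before_loss', simple_value=before_loss),
        tf.Summary.Value(tag='eval_stats/eval_after_loss', simple_value=after_loss),
        tf.Summary.Value(tag='eval_stats/eval_stop_token_loss', simple_value=stop_token_loss),
        tf.Summary.Value(tag='eval_stats/eval_loss', simple_value=eval_loss),
    ]
    if linear_loss is not None:
        # only present when the model also predicts linear spectrograms
        values.append(tf.Summary.Value(tag='eval_stats/eval_linear_loss', simple_value=linear_loss))
    summary_writer.add_summary(tf.Summary(value=values), step)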
def train(log_dir, args, hparams): save_dir = os.path.join(log_dir, 'taco_pretrained') wav_plot = os.path.join(log_dir, 'wav_plot') tensorboard_dir = os.path.join(log_dir, 'tacotron_events') meta_folder = os.path.join(log_dir, 'metas') os.makedirs(save_dir, exist_ok=True) os.makedirs(tensorboard_dir, exist_ok=True) os.makedirs(meta_folder, exist_ok=True) os.makedirs(wav_plot, exist_ok=True) checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt') input_path = os.path.join(args.data_dir, args.tacotron_input) log('Checkpoint path: {}'.format(checkpoint_path)) log('Loading training data from: {}'.format(input_path)) log('Using model: {}'.format(args.model)) log(hparams_debug_string()) #Start by setting a seed for repeatability tf.set_random_seed(hparams.tacotron_random_seed) #Set up data feeder coord = tf.train.Coordinator() with tf.variable_scope('datafeeder') as scope: feeder = Feeder(coord, input_path, hparams) #Set up model: global_step = tf.Variable(0, name='global_step', trainable=False) model, stats = model_train_mode(args, feeder, hparams, global_step) #Embeddings metadata char_embedding_meta = os.path.join(meta_folder, 'CharacterEmbeddings.tsv') if not os.path.isfile(char_embedding_meta): with open(char_embedding_meta, 'w', encoding='utf-8') as f: for symbol in symbols: if symbol == ' ': symbol = '\\s' #For visual purposes, swap space with \s f.write('{}\n'.format(symbol)) char_embedding_meta = char_embedding_meta.replace(log_dir, '..') #Book keeping step = 0 time_window = ValueWindow(100) loss_window = ValueWindow(100) saver = tf.train.Saver(max_to_keep=20) log('Tacotron training set to a maximum of {} steps'.format( args.tacotron_train_steps)) #Memory allocation on the GPU as needed ''' config = tf.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True ''' #Train with tf.Session() as sess: #config=config) as sess: try: summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph) sess.run(tf.global_variables_initializer()) #saved model restoring if args.restore: # Restore saved model if the user requested it, default = True try: checkpoint_state = tf.train.get_checkpoint_state(save_dir) if (checkpoint_state and checkpoint_state.model_checkpoint_path): saver.restore(sess, checkpoint_state.model_checkpoint_path) #initial_global_step = global_step.assign(0) #sess.run(initial_global_step) else: log('No model to load at {}'.format(save_dir), slack=True) saver.save(sess, checkpoint_path, global_step=global_step) except tf.errors.OutOfRangeError as e: log('Cannot restore checkpoint: {}'.format(e), slack=True) else: log('Starting new training!', slack=True) saver.save(sess, checkpoint_path, global_step=global_step) #initializing feeder feeder.start_threads(sess) #Training loop while not coord.should_stop() and step < args.tacotron_train_steps: start_time = time.time() step, loss, opt, before_loss, after_loss, token_loss, reg_loss = sess.run( [ global_step, model.loss, model.optimize, model.before_loss, model.after_loss, model.stop_token_loss, model.regularization_loss ]) time_window.append(time.time() - start_time) loss_window.append(loss) message = 'Step{:6d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}, mel_before={:.5f}, mel_after={:.5f}, token_loss={:.5f}, reg_loss={:.5f}]'.format( step, time_window.average, loss, loss_window.average, before_loss, after_loss, token_loss, reg_loss) log(message, end='\r', slack=(step % args.checkpoint_interval == 0)) if np.isnan(loss) or loss > 100.: log('Loss exploded to {:.5f} at step {}'.format( loss, 
step)) raise Exception('Loss exploded') if step % args.summary_interval == 0: log('\nWriting summary at step {}'.format(step)) summary_writer.add_summary(sess.run(stats), step) if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300: #Save model and current global step saver.save(sess, checkpoint_path, global_step=global_step) log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..' ) input_seq, mel_prediction, alignment, target, target_length = sess.run( [ model.inputs[0], model.mel_outputs[0], model.alignments[0], model.mel_targets[0], model.targets_lengths[0], ]) wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams) audio.save_wav( wav, os.path.join(wav_plot, 'step-{}-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate) #save alignment plot to disk (control purposes) plot.plot_alignment( alignment, os.path.join(wav_plot, 'step-{}-align.png'.format(step)), title='{}, {}, step={}, loss={:.5f}'.format( args.model, time_string(), step, loss), max_len=target_length // hparams.outputs_per_step) #save real and predicted mel-spectrogram plot to disk (control purposes) plot.plot_spectrogram( mel_prediction, os.path.join( wav_plot, 'step-{}-mel-spectrogram.png'.format(step)), title='{}, {}, step={}, loss={:.5f}'.format( args.model, time_string(), step, loss), target_spectrogram=target, max_len=target_length) print(', '.join(map(str, input_seq.tolist()))) log('Input at step {}: {}'.format( step, sequence_to_text(input_seq))) if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1: #Get current checkpoint state checkpoint_state = tf.train.get_checkpoint_state(save_dir) #Update Projector log('\nSaving Model Character Embeddings visualization..') add_embedding_stats(summary_writer, [model.embedding_table.name], [char_embedding_meta], checkpoint_state.model_checkpoint_path) log('Tacotron Character embeddings have been updated on tensorboard!' ) log('Tacotron training complete after {} global steps!'.format( args.tacotron_train_steps), slack=True) return save_dir except Exception as e: log('Exiting due to exception: {}'.format(e), slack=True) traceback.print_exc() coord.request_stop(e)
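# --- Hedged sketch (assumption, not from this file): the embedding branch above calls
# add_embedding_stats(summary_writer, [model.embedding_table.name], [char_embedding_meta],
# checkpoint_path). The usual TF1 pattern wires the embedding variable and its .tsv metadata
# into the TensorBoard projector; the repo's own helper may differ in details.
from tensorflow.contrib.tensorboard.plugins import projector

def add_embedding_stats(summary_writer, embedding_names, paths_to_meta, checkpoint_path):
    projector_config = projector.ProjectorConfig()
    for embedding_name, path_to_meta in zip(embedding_names, paths_to_meta):
        embedding = projector_config.embeddings.add()
        embedding.tensor_name = embedding_name   # e.g. the character embedding_table variable name
        embedding.metadata_path = path_to_meta   # the CharacterEmbeddings.tsv written earlier
    # write the projector config next to the event files so TensorBoard can render it
    projector.visualize_embeddings(summary_writer, projector_config)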
def train(log_dir, args, hparams): save_dir = os.path.join(log_dir, 'taco_pretrained') plot_dir = os.path.join(log_dir, 'plots') wav_dir = os.path.join(log_dir, 'wavs') feat_dir = os.path.join(log_dir, 'features') eval_dir = os.path.join(log_dir, 'eval-dir') eval_plot_dir = os.path.join(eval_dir, 'plots') eval_wav_dir = os.path.join(eval_dir, 'wavs') tensorboard_dir = os.path.join(log_dir, 'tacotron_events') os.makedirs(save_dir, exist_ok=True) os.makedirs(plot_dir, exist_ok=True) os.makedirs(wav_dir, exist_ok=True) os.makedirs(feat_dir, exist_ok=True) os.makedirs(eval_dir, exist_ok=True) os.makedirs(eval_plot_dir, exist_ok=True) os.makedirs(eval_wav_dir, exist_ok=True) os.makedirs(tensorboard_dir, exist_ok=True) checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt') input_path = os.path.join(args.base_dir, args.tacotron_input) log('Checkpoint path: {}'.format(checkpoint_path)) log('Loading training data from: {}'.format(input_path)) log('Using model: {}'.format(args.model)) log(hparams_debug_string()) #Start by setting a seed for repeatability tf.set_random_seed(hparams.tacotron_random_seed) #Set up data feeder coord = tf.train.Coordinator() with tf.variable_scope('datafeeder') as scope: feeder = Feeder(coord, input_path, hparams) #Set up model: global_step = tf.Variable(0, name='global_step', trainable=False) model, stats = model_train_mode(args, feeder, hparams, global_step) eval_model = model_test_mode(args, feeder, hparams, global_step) #Book keeping step = 0 time_window = ValueWindow(100) loss_window = ValueWindow(100) saver = tf.train.Saver(max_to_keep=1) log('Tacotron training set to a maximum of {} steps'.format( args.tacotron_train_steps)) #Memory allocation on the GPU as needed config = tf.ConfigProto() config.gpu_options.allow_growth = True #Train with tf.Session(config=config) as sess: try: summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph) sess.run(tf.global_variables_initializer()) #saved model restoring if args.restore: # Restore saved model if the user requested it, default = True try: checkpoint_state = tf.train.get_checkpoint_state(save_dir) if (checkpoint_state and checkpoint_state.model_checkpoint_path): log('Loading checkpoint {}'.format( checkpoint_state.model_checkpoint_path), slack=True) saver.restore(sess, checkpoint_state.model_checkpoint_path) else: log('No model to load at {}'.format(save_dir), slack=True) except tf.errors.OutOfRangeError as e: log('Cannot restore checkpoint: {}'.format(e), slack=True) else: log('Starting new training!', slack=True) #initializing feeder feeder.start_threads(sess) #Training loop while not coord.should_stop() and step < args.tacotron_train_steps: start_time = time.time() step, loss, opt = sess.run( [global_step, model.loss, model.optimize]) time_window.append(time.time() - start_time) loss_window.append(loss) message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( step, time_window.average, loss, loss_window.average) log(message, end='\r', slack=(step % args.checkpoint_interval == 0)) if loss > 100 or np.isnan(loss): log('Loss exploded to {:.5f} at step {}'.format( loss, step)) raise Exception('Loss exploded') if step % args.summary_interval == 0: log('\nWriting summary at step {}'.format(step)) summary_writer.add_summary(sess.run(stats), step) if step % args.eval_interval == 0: #Run eval and save eval stats log('\nRunning evaluation at step {}'.format(step)) eval_losses = [] before_losses = [] after_losses = [] stop_token_losses = [] attention_losses = [] for i in 
tqdm(range(feeder.test_steps)): eloss, before_loss, after_loss, stop_token_loss, attention_loss, feature_prediction, target_len, align = sess.run( [ eval_model.loss, eval_model.before_loss, eval_model.after_loss, eval_model.stop_token_loss, eval_model.attention_loss, eval_model.final_outputs[0], eval_model.targets_lengths[0], eval_model.alignments[0] ]) eval_losses.append(eloss) before_losses.append(before_loss) after_losses.append(after_loss) stop_token_losses.append(stop_token_loss) attention_losses.append(attention_loss) eval_loss = sum(eval_losses) / len(eval_losses) before_loss = sum(before_losses) / len(before_losses) after_loss = sum(after_losses) / len(after_losses) stop_token_loss = sum(stop_token_losses) / len( stop_token_losses) attention_loss = sum(attention_losses) / len( attention_losses) log('Saving eval log to {}..'.format(eval_dir)) #Save some log to monitor model improvement on same unseen sequence wav = audio.synthesize(feature_prediction, hparams) audio.save_wav( wav, os.path.join(eval_wav_dir, 'step-{}-eval-waveform.wav'.format(step)), hparams) plot.plot_alignment( align, os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)), info='{}, {}, step={}, loss={:.5f}'.format( args.model, time_string(), step, eval_loss), max_len=target_len // hparams.outputs_per_step) log('Eval loss for global step {}: {:.3f}'.format( step, eval_loss)) log('Writing eval summary!') add_eval_stats(summary_writer, step, before_loss, after_loss, stop_token_loss, attention_loss, eval_loss) if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps: #Save model and current global step saver.save(sess, checkpoint_path, global_step=global_step) graph_def = tf.graph_util.convert_variables_to_constants( sess, sess.graph_def, ['model/inference/add']) tf.train.write_graph(sess.graph_def, save_dir, 'graph.pb', as_text=False) log('\nSaving alignment and World vocoder synthesized waveform..' ) input_seq, feature_prediction, alignment, target_length = sess.run( [ model.inputs[0], model.final_outputs[0], model.alignments[0], model.targets_lengths[0] ]) #save World vocoder waveform for debug wav = audio.synthesize(feature_prediction, hparams) audio.save_wav( wav, os.path.join(wav_dir, 'step-{}.wav'.format(step)), hparams) #save alignment plot to disk (control purposes) plot.plot_alignment( alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)), info='{}, {}, step={}, loss={:.5f}'.format( args.model, time_string(), step, loss), max_len=target_length // hparams.outputs_per_step) log('Input at step {}: {}'.format( step, sequence_to_text(input_seq))) log('Tacotron training complete after {} global steps!'.format( args.tacotron_train_steps), slack=True) return save_dir except Exception as e: log('Exiting due to exception: {}'.format(e), slack=True) traceback.print_exc() coord.request_stop(e)
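# --- Note on the checkpoint branch above: it computes a frozen graph_def with
# tf.graph_util.convert_variables_to_constants but then passes sess.graph_def (the unfrozen
# graph) to tf.train.write_graph. A hedged sketch of the usual freezing pattern, which is
# presumably what was intended (output node name taken from the code above):
import tensorflow as tf

def export_frozen_graph(sess, output_node_names, out_dir, filename='graph.pb'):
    # bake variable values into constants so the .pb is self-contained for inference
    frozen_graph_def = tf.graph_util.convert_variables_to_constants(
        sess, sess.graph_def, output_node_names)
    tf.train.write_graph(frozen_graph_def, out_dir, filename, as_text=False)

# e.g. export_frozen_graph(sess, ['model/inference/add'], save_dir)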
saver.save(sess, checkpoint_path, global_step=global_step) except tf.errors.OutOfRangeError as e: log('Cannot restore checkpoint: {}'.format(e), slack=True) else: log('Starting new training!', slack=True) saver.save(sess, checkpoint_path, global_step=global_step) #initializing feeder feeder.start_threads(sess) #Training loop while not coord.should_stop() and step < args.tacotron_train_steps: start_time = time.time() step, loss, opt = sess.run([global_step, model.loss, model.optimize]) time_window.append(time.time() - start_time) loss_window.append(loss) message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( step, time_window.average, loss, loss_window.average) log(message, end='\r', slack=(step % args.checkpoint_interval == 0)) if loss > 100 or np.isnan(loss): log('Loss exploded to {:.5f} at step {}'.format(loss, step)) raise Exception('Loss exploded') if step % args.summary_interval == 0: log('\nWriting summary at step {}'.format(step)) summary_writer.add_summary(sess.run(stats), step) if step % args.eval_interval == 0: #Run eval and save eval stats
def train(log_dir, args, input): commit = get_git_commit() if args.git else 'None' checkpoint_path = os.path.join(log_dir, 'model.ckpt') input_path = os.path.join(args.base_dir, input) log('Checkpoint path: %s' % checkpoint_path) log('Loading training data from: %s' % input_path) log('Using model: %s' % args.variant) log(hparams_debug_string()) # Set up DataFeeder: coord = tf.train.Coordinator() with tf.variable_scope('datafeeder') as scope: if args.eal_dir: from tacotron.datafeeder import DataFeeder_EAL feeder = DataFeeder_EAL(coord, input_path, hparams, args.eal_dir) else: from tacotron.datafeeder import DataFeeder feeder = DataFeeder(coord, input_path, hparams) # Set up model: global_step = tf.Variable(0, name='global_step', trainable=False) with tf.variable_scope('model') as scope: model = create_model(args.variant, hparams) if args.eal_dir: model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.linear_targets, feeder.pml_targets, is_training=True, eal=True, locked_alignments=feeder.locked_alignments, flag_trainAlign=args.eal_trainAlign, flag_trainJoint=args.eal_trainJoint, alignScale=args.eal_alignScale) else: model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.linear_targets, feeder.pml_targets, is_training=True, gta=True) model.add_loss() model.add_optimizer(global_step) stats = add_stats(model, eal_dir=args.eal_dir) # Bookkeeping: step = 0 time_window = ValueWindow(100) loss_window = ValueWindow(100) saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2) # Set up fixed alignment synthesizer alignment_synth = AlignmentSynthesizer() # Set up text for synthesis fixed_sentence = 'Scientists at the CERN laboratory say they have discovered a new particle.' # Set up denormalisation parameters for synthesis mean_path = os.path.abspath(os.path.join(args.base_dir, input, '..', 'pml_data/mean.dat')) std_path = os.path.abspath(os.path.join(args.base_dir, input, '..', 'pml_data/std.dat')) log('Loading normalisation mean from: {}'.format(mean_path)) log('Loading normalisation standard deviation from: {}'.format(std_path)) mean_norm = None std_norm = None if os.path.isfile(mean_path) and os.path.isfile(std_path): mean_norm = np.fromfile(mean_path, 'float32') std_norm = np.fromfile(std_path, 'float32') # Train! # import pdb # flag_pdb = False # pdb.set_trace() # args.checkpoint_interval = 2 # args.num_steps = 5 with tf.Session() as sess: try: summary_writer = tf.summary.FileWriter(log_dir, sess.graph) sess.run(tf.global_variables_initializer()) # pdb.set_trace() if args.restore_step: # Restore from a checkpoint if the user requested it. 
restore_path = '%s-%d' % (checkpoint_path, args.restore_step) saver.restore(sess, restore_path) log('Resuming from checkpoint: %s at commit: %s' % (restore_path, commit), slack=True) elif args.eal_dir and args.eal_ckpt: if args.eal_trainAlign or args.eal_trainJoint: list_var = tf.trainable_variables() + [v for v in tf.global_variables() if 'moving' in v.name] saver_eal = tf.train.Saver(list_var) saver_eal.restore(sess, args.eal_ckpt) log('Loaded weights and batchNorm cache of checkpoint: %s at commit: %s' % (args.eal_ckpt, commit), slack=True) elif args.eal_ft: saver.restore(sess, args.eal_ckpt) log('Refining the model from checkpoint: %s at commit: %s' % (args.eal_ckpt, commit), slack=True) else: list_var = [var for var in tf.global_variables() if 'optimizer' not in var.name] saver_eal = tf.train.Saver(list_var) saver_eal.restore(sess, args.eal_ckpt) log('Initializing the weights from checkpoint: %s at commit: %s' % (args.eal_ckpt, commit), slack=True) # args.num_steps *= 2 # sess.run(global_step.assign(0)) else: log('Starting new training run at commit: %s' % commit, slack=True) feeder.start_in_session(sess) step = 0 # initialise step variable so can use in while condition while not coord.should_stop() and step <= args.num_steps: # pdb.set_trace() start_time = time.time() if args.eal_trainAlign: step, loss, loss_align, opt = sess.run([global_step, model.loss, model.loss_align, model.optimize]) # try: # step, loss, loss_align, opt, tmp_a, tmp_ar = sess.run([global_step, model.loss, model.loss_align, model.optimize, # model.alignments, model.alignments_ref]) # except: # print("Oops!",sys.exc_info()[0],"occured.") # flag_pdb = True # if flag_pdb or np.isnan(loss_align): # pdb.set_trace() # flag_pdb = False time_window.append(time.time() - start_time) loss_window.append(loss_align) message = 'Step %-7d [%.03f sec/step, loss=%.05f, loss_align=%.05f, avg_loss_align=%.05f]' % ( step, time_window.average, loss, loss_align, loss_window.average) elif args.eal_trainJoint: step, loss, loss_align, loss_joint, opt = sess.run([global_step, model.loss, model.loss_align, model.loss_joint, model.optimize]) time_window.append(time.time() - start_time) loss_window.append(loss_joint) message = 'Step %-7d [%.03f sec/step, loss=%.05f, loss_align=%.05f, avg_loss_joint=%.05f]' % ( step, time_window.average, loss, loss_align, loss_window.average) else: step, loss, opt = sess.run([global_step, model.loss, model.optimize]) time_window.append(time.time() - start_time) loss_window.append(loss) message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % ( step, time_window.average, loss, loss_window.average) log(message, slack=(step % args.checkpoint_interval == 0)) if loss > 100 or math.isnan(loss): log('Loss exploded to %.05f at step %d!' 
% (loss, step), slack=True) raise Exception('Loss Exploded') if step % args.summary_interval == 0: log('Writing summary at step: %d' % step) summary_writer.add_summary(sess.run(stats), step) if step % args.checkpoint_interval == 0: log('Saving checkpoint to: %s-%d' % (checkpoint_path, step)) saver.save(sess, checkpoint_path, global_step=step) log('Saving audio and alignment...') summary_elements = [] # if the model has linear spectrogram features, use them to synthesize audio if hasattr(model, 'linear_targets'): input_seq, alignment, target_spectrogram, spectrogram = sess.run([ model.inputs[0], model.alignments[0], model.linear_targets[0], model.linear_outputs[0]]) output_waveform = audio.inv_spectrogram(spectrogram.T) target_waveform = audio.inv_spectrogram(target_spectrogram.T) audio.save_wav(output_waveform, os.path.join(log_dir, 'step-%d-audio.wav' % step)) audio.save_wav(target_waveform, os.path.join(log_dir, 'step-%d-target-audio.wav' % step)) # otherwise, synthesize audio from PML vocoder features elif hasattr(model, 'pml_targets'): input_seq, alignment, target_pml_features, pml_features = sess.run([ model.inputs[0], model.alignments[0], model.pml_targets[0], model.pml_outputs[0]]) cfg = Configuration(hparams.sample_rate, hparams.pml_dimension) synth = PMLSynthesizer(cfg) output_waveform = synth.pml_to_wav(pml_features, mean_norm=mean_norm, std_norm=std_norm, spec_type=hparams.spec_type) target_waveform = synth.pml_to_wav(target_pml_features, mean_norm=mean_norm, std_norm=std_norm, spec_type=hparams.spec_type) sp.wavwrite(os.path.join(log_dir, 'step-%d-target-audio.wav' % step), target_waveform, hparams.sample_rate, norm_max_ifneeded=True) sp.wavwrite(os.path.join(log_dir, 'step-%d-audio.wav' % step), output_waveform, hparams.sample_rate, norm_max_ifneeded=True) # we need to adjust the output and target waveforms so the values lie in the interval [-1.0, 1.0] output_waveform /= 1.05 * np.max(np.abs(output_waveform)) target_waveform /= 1.05 * np.max(np.abs(target_waveform)) summary_elements.append( tf.summary.audio('ideal-%d' % step, np.expand_dims(target_waveform, 0), hparams.sample_rate), ) summary_elements.append( tf.summary.audio('sample-%d' % step, np.expand_dims(output_waveform, 0), hparams.sample_rate), ) # get the alignment for the top sentence in the batch random_attention_plot = plot.plot_alignment(alignment, os.path.join(log_dir, 'step-%d-random-align.png' % step), info='%s, %s, %s, step=%d, loss=%.5f' % ( args.variant, commit, time_string(), step, loss)) summary_elements.append( tf.summary.image('attention-%d' % step, random_attention_plot), ) # also process the alignment for a fixed sentence for comparison alignment_synth.load('%s-%d' % (checkpoint_path, step), hparams, model_name=args.variant) fixed_alignment = alignment_synth.synthesize(fixed_sentence) fixed_attention_plot = plot.plot_alignment(fixed_alignment, os.path.join(log_dir, 'step-%d-fixed-align.png' % step), info='%s, %s, %s, step=%d, loss=%.5f' % ( args.variant, commit, time_string(), step, loss)) summary_elements.append( tf.summary.image('fixed-attention-%d' % step, fixed_attention_plot), ) # save the audio and alignment to tensorboard (audio sample rate is hyperparameter) merged = sess.run(tf.summary.merge(summary_elements)) summary_writer.add_summary(merged, step) log('Input: %s' % sequence_to_text(input_seq)) except Exception as e: log('Exiting due to exception: %s' % e, slack=True) traceback.print_exc() coord.request_stop(e)
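# --- Hedged sketch (assumption, not from this file): the variants above log
# get_git_commit() / get_git_revision_hash() at startup. Such helpers usually shell out to git;
# the repos' own versions may additionally refuse to run with uncommitted changes.
import subprocess

def get_git_revision_hash():
    return subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()

def get_git_commit():
    # a short hash is enough to identify the commit in training logs
    return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()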
def train(log_dir, args, hparams): save_dir = os.path.join(log_dir, 'taco_pretrained') plot_dir = os.path.join(log_dir, 'plots') wav_dir = os.path.join(log_dir, 'wavs') mel_dir = os.path.join(log_dir, 'mel-spectrograms') eval_dir = os.path.join(log_dir, 'eval-dir') eval_plot_dir = os.path.join(eval_dir, 'plots') eval_wav_dir = os.path.join(eval_dir, 'wavs') tensorboard_dir = os.path.join(log_dir, 'tacotron_events') meta_folder = os.path.join(log_dir, 'metas') os.makedirs(save_dir, exist_ok=True) os.makedirs(plot_dir, exist_ok=True) os.makedirs(wav_dir, exist_ok=True) os.makedirs(mel_dir, exist_ok=True) os.makedirs(eval_dir, exist_ok=True) os.makedirs(eval_plot_dir, exist_ok=True) os.makedirs(eval_wav_dir, exist_ok=True) os.makedirs(tensorboard_dir, exist_ok=True) os.makedirs(meta_folder, exist_ok=True) checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt') input_path = os.path.join(args.base_dir, args.tacotron_input) if hparams.predict_linear: linear_dir = os.path.join(log_dir, 'linear-spectrograms') os.makedirs(linear_dir, exist_ok=True) log('Checkpoint path: {}'.format(checkpoint_path)) log('Loading training data from: {}'.format(input_path)) log('Using model: {}'.format(args.model)) log(hparams_debug_string()) #Start by setting a seed for repeatability tf.set_random_seed(hparams.tacotron_random_seed) #Set up data feeder coord = tf.train.Coordinator() with tf.variable_scope('datafeeder') as scope: feeder = Feeder(coord, input_path, hparams, args) #Set up model: global_step = tf.Variable(0, name='global_step', trainable=False) model, stats = model_train_mode(args, feeder, hparams, global_step) eval_model = model_test_mode(args, hparams, model) # if args.TEST: # for v in tf.global_variables(): # print(v) #Embeddings metadata char_embedding_meta = os.path.join(meta_folder, 'CharacterEmbeddings.tsv') if not os.path.isfile(char_embedding_meta): with open(char_embedding_meta, 'w', encoding='utf-8') as f: for symbol in symbols: if symbol == ' ': symbol = '\\s' #For visual purposes, swap space with \s f.write('{}\n'.format(symbol)) char_embedding_meta = char_embedding_meta.replace(log_dir, '..') #Potential Griffin-Lim GPU setup if hparams.GL_on_GPU: GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels), name='GLGPU_mel_inputs') GLGPU_lin_inputs = tf.placeholder(tf.float32, (None, hparams.num_freq), name='GLGPU_lin_inputs') GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow( GLGPU_mel_inputs, hparams) GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow( GLGPU_lin_inputs, hparams) #Book keeping step = 0 time_window = ValueWindow(100) loss_window = ValueWindow(100) loss_bef_window = ValueWindow(100) loss_aft_window = ValueWindow(100) loss_stop_window = ValueWindow(100) loss_reg_window = ValueWindow(100) loss_emt_window = ValueWindow(100) loss_spk_window = ValueWindow(100) loss_orthog_window = ValueWindow(100) loss_up_emt_window = ValueWindow(100) loss_up_spk_window = ValueWindow(100) loss_mo_up_emt_window = ValueWindow(100) loss_mo_up_spk_window = ValueWindow(100) if args.nat_gan: d_loss_t_window = ValueWindow(100) d_loss_p_window = ValueWindow(100) d_loss_up_window = ValueWindow(100) g_loss_p_window = ValueWindow(100) g_loss_up_window = ValueWindow(100) saver = tf.train.Saver(max_to_keep=args.max_to_keep) if args.opt_ref_no_mo and not (args.restart_optimizer_r): print( "WILL ATTEMPT TO RESTORE OPTIMIZER R - SET ARGS.RESTART_OPTIMIZER_R IF RETRAINING A MODEL THAT DIDN'T HAVE THE OPTIMIZER R" ) assert (not (args.restart_nat_gan_d and 
args.restore_nat_gan_d_sep)) var_list = tf.global_variables() var_list = [v for v in var_list if not ('pretrained' in v.name)] var_list = [ v for v in var_list if not ('nat_gan' in v.name or 'optimizer_n' in v.name) ] if (args.restart_nat_gan_d or args.restore_nat_gan_d_sep) else var_list var_list = [ v for v in var_list if not ('optimizer_r' in v.name or 'optimizer_3' in v.name) ] if args.restart_optimizer_r else var_list saver_restore = tf.train.Saver(var_list=var_list) if args.unpaired and args.pretrained_emb_disc: saver_restore_emt_disc = tf.train.Saver(var_list=[ v for v in tf.global_variables() if ('pretrained_ref_enc_emt' in v.name) ]) saver_restore_spk_disc = tf.train.Saver(var_list=[ v for v in tf.global_variables() if ('pretrained_ref_enc_spk' in v.name) ]) elif args.unpaired and args.pretrained_emb_disc_all: saver_restore_emt_disc = tf.train.Saver(var_list=[ v for v in tf.global_variables() if ('refnet_emt' in v.name) ]) saver_restore_spk_disc = tf.train.Saver(var_list=[ v for v in tf.global_variables() if ('refnet_spk' in v.name) ]) if args.nat_gan: saver_nat_gan = tf.train.Saver(var_list=[ v for v in tf.global_variables() if ('nat_gan' in v.name or 'optimizer_n' in v.name) ]) save_dir_nat_gan = r'nat_gan/pretrained_model' log('Tacotron training set to a maximum of {} steps'.format( args.tacotron_train_steps)) if hparams.tacotron_fine_tuning: print('FINE TUNING SET TO TRUE - MAKE SURE THIS IS WHAT YOU WANT!') #Memory allocation on the GPU as needed config = tf.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True eval_feed_dict, emt_labels, spk_labels, \ basenames, basenames_refs = get_eval_feed_dict(hparams, args.synth_metadata_filename, eval_model, args.input_dir, args.flip_spk_emt) #Train with tf.Session(config=config) as sess: try: summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph) # for x in tf.global_variables(): # print(x) sess.run(tf.global_variables_initializer()) #saved model restoring if args.restore: # Restore saved model if the user requested it, default = True try: checkpoint_state = tf.train.get_checkpoint_state(save_dir) if (checkpoint_state and checkpoint_state.model_checkpoint_path): log('Loading checkpoint {}'.format( checkpoint_state.model_checkpoint_path), slack=True) saver_restore.restore( sess, checkpoint_state.model_checkpoint_path) else: raise ValueError( 'No model to load at {}'.format(save_dir)) except tf.errors.OutOfRangeError as e: log('Cannot restore checkpoint: {}'.format(e), slack=True) else: log('Starting new training!', slack=True) saver.save(sess, checkpoint_path, global_step=global_step) if args.unpaired and (args.pretrained_emb_disc or args.pretrained_emb_disc_all): save_dir_emt = r'spk_disc/pretrained_model_emt_disc' checkpoint_state_emt = tf.train.get_checkpoint_state( save_dir_emt) saver_restore_emt_disc.restore( sess, checkpoint_state_emt.model_checkpoint_path) log('Loaded Emotion Discriminator from checkpoint {}'.format( checkpoint_state_emt.model_checkpoint_path), slack=True) save_dir_spk = r'spk_disc/pretrained_model_spk_disc' checkpoint_state_spk = tf.train.get_checkpoint_state( save_dir_spk) saver_restore_spk_disc.restore( sess, checkpoint_state_spk.model_checkpoint_path) log('Loaded Speaker Discriminator from checkpoint {}'.format( checkpoint_state_spk.model_checkpoint_path), slack=True) if args.nat_gan and args.restore_nat_gan_d_sep: checkpoint_state_nat_gan = tf.train.get_checkpoint_state( save_dir_nat_gan) saver_nat_gan.restore( sess, 
checkpoint_state_nat_gan.model_checkpoint_path) log('Loaded Nat Gan Discriminator from checkpoint {}'.format( checkpoint_state_nat_gan.model_checkpoint_path), slack=True) #initializing feeder feeder.start_threads(sess) #Training loop while not coord.should_stop() and step < args.tacotron_train_steps: start_time = time.time() # vars = [global_step, model.loss, model.optimize,model.before_loss, model.after_loss,model.stop_token_loss, # model.regularization_loss,model.style_emb_loss_emt, model.style_emb_loss_spk, model.style_emb_orthog_loss] # out = [step, loss, opt, bef, aft, stop, reg, loss_emt, loss_spk, loss_orthog] # message = 'Step {:7d} {:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}, bef={:.5f}, aft={:.5f}, stop={:.5f},' \ # 'reg={:.5f}, emt={:.5f}, spk={:.5f}, orthog={:.5f}'.format(step, time_window.average, loss, loss_window.average, # loss_bef_window.average, loss_aft_window.average, # loss_stop_window.average, loss_reg_window.average, # loss_emt_window.average, loss_spk_window.average, # loss_orthog_window.average) # if args.unpaired: # vars += [model.style_emb_loss_up_emt, model.style_emb_loss_up_spk,model.style_emb_loss_mel_out_up_emt, model.style_emb_loss_mel_out_up_spk] # out += [loss_up_emt, loss_up_spk, loss_mo_up_emt, loss_mo_up_spk] # message += ' up_emt={:.5f}, up_spk={:.5f}, mo_up_emt={:.5f}, mo_up_spk={:.5f}]'.format(loss_up_emt_window.average, # loss_up_spk_window.average, # loss_mo_up_emt_window.average, # loss_mo_up_spk_window.average) # if False: # vars += [model.tower_style_emb_logit_emt[0], model.tower_emt_labels[0],model.tower_style_emb_logit_up_emt[0], # model.tower_emt_up_labels[0],model.tower_spk_labels[0]] # out += [emt_logit, emt_labels, emt_up_logit, emt_up_labels, spk_labels] # # out = sess.run([vars]) if args.nat_gan and (args.restart_nat_gan_d or not (args.restore)) and step == 0: log("Will start with Training Nat GAN Discriminator", end='\r') disc_epochs = 300 if args.unpaired else 200 disc_epochs = 0 if args.TEST else disc_epochs for i in range(disc_epochs + 1): d_loss_t, d_loss_p, d_loss_up,\ d_loss_t_emt, d_loss_p_emt, d_loss_up_emt, \ d_loss_t_spk, d_loss_p_spk, d_loss_up_spk, \ opt_n = sess.run([model.d_loss_targ, model.d_loss_p, model.d_loss_up, model.d_loss_targ_emt, model.d_loss_p_emt, model.d_loss_up_emt, model.d_loss_targ_spk, model.d_loss_p_spk, model.d_loss_up_spk, model.optimize_n]) message = 'step: {}, d_loss_t={:.5f}, d_loss_p ={:.5f}, d_loss_up ={:.5f},' \ ' d_loss_t_emt={:.5f}, d_loss_p_emt ={:.5f}, d_loss_up_emt ={:.5f},' \ ' d_loss_t_spk={:.5f}, d_loss_p_spk ={:.5f}, d_loss_up_spk ={:.5f}'.format(i, d_loss_t, d_loss_p, d_loss_up, d_loss_t_emt, d_loss_p_emt, d_loss_up_emt, d_loss_t_spk, d_loss_p_spk, d_loss_up_spk) log(message, end='\r') os.makedirs(r'nat_gan', exist_ok=True) os.makedirs(r'nat_gan/pretrained_model', exist_ok=True) checkpoint_path_nat_gan = os.path.join( save_dir_nat_gan, 'nat_gan_model.ckpt') saver_nat_gan.save(sess, checkpoint_path_nat_gan, global_step=i) if args.nat_gan: d_loss_t, d_loss_p, d_loss_up, opt_n = sess.run([ model.d_loss_targ, model.d_loss_p, model.d_loss_up, model.optimize_n ]) if args.unpaired: step, tfr, loss, opt, bef, aft, stop, reg, loss_emt, loss_spk, loss_orthog, \ loss_up_emt, loss_up_spk, loss_mo_up_emt, loss_mo_up_spk, g_loss_p, g_loss_up, mels, opt_r\ = sess.run([global_step, model.ratio, model.loss, model.optimize,model.before_loss, model.after_loss,model.stop_token_loss, model.regularization_loss, model.style_emb_loss_emt, model.style_emb_loss_spk, model.style_emb_orthog_loss, 
model.style_emb_loss_up_emt, model.style_emb_loss_up_spk,model.style_emb_loss_mel_out_up_emt, model.style_emb_loss_mel_out_up_spk,model.g_loss_p, model.g_loss_up, model.tower_mel_outputs[0], model.optimize_r]) else: step, tfr, loss, opt, bef, aft, stop, reg, loss_emt, loss_spk, loss_orthog, \ loss_up_emt, loss_up_spk, loss_mo_up_emt, loss_mo_up_spk, g_loss_p, g_loss_up, mels,dec_out,opt_r = sess.run([global_step, model.helper._ratio, model.loss, model.optimize, model.before_loss, model.after_loss, model.stop_token_loss, model.regularization_loss, model.style_emb_loss_emt, model.style_emb_loss_spk, model.style_emb_orthog_loss, model.style_emb_loss_up_emt, model.style_emb_loss_up_spk,model.style_emb_loss_mel_out_up_emt, model.style_emb_loss_mel_out_up_spk, model.g_loss_p, model.g_loss_up, model.tower_mel_outputs[0],model.tower_decoder_output[0],model.optimize_r]) # step, loss, opt, bef, aft, stop, reg, loss_emt, loss_spk, loss_orthog, \ # loss_up_emt, loss_up_spk, loss_mo_up_emt, loss_mo_up_spk, g_loss_p, g_loss_up, mels,ref_emt,ref_spk,ref_up_emt,ref_up_spk,emb,enc_out,enc_out_up,\ # stop_pred, targ, inp, inp_len,targ_len,stop_targ,mels_up,dec_out,dec_out_up,opt_r\ # = sess.run([global_step, model.loss, model.optimize,model.before_loss, model.after_loss,model.stop_token_loss, # model.regularization_loss, model.style_emb_loss_emt, model.style_emb_loss_spk, model.style_emb_orthog_loss, # model.style_emb_loss_up_emt, model.style_emb_loss_up_spk,model.style_emb_loss_mel_out_up_emt, # model.style_emb_loss_mel_out_up_spk,model.g_loss_p, model.g_loss_up, model.tower_mel_outputs[0], # model.tower_refnet_out_emt[0],model.tower_refnet_out_spk[0],model.tower_refnet_out_up_emt[0],model.tower_refnet_out_up_spk[0], # model.tower_embedded_inputs[0], model.tower_encoder_outputs[0],model.tower_encoder_outputs_up[0],model.tower_stop_token_prediction[0], # model.tower_mel_targets[0],model.tower_inputs[0],model.tower_input_lengths[0],model.tower_targets_lengths[0], # model.tower_stop_token_targets[0],model.tower_mel_outputs_up[0],model.tower_decoder_output[0],model.tower_decoder_output_up[0],model.optimize_r]) # # if args.save_output_vars: # import pandas as pd # pd.DataFrame(emb[:, 0, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\emb.csv') # pd.DataFrame(enc_out[:, 0, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\enc_out.csv') # pd.DataFrame(enc_out_up[:, 0, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\enc_out_up.csv') # pd.DataFrame(stop_pred[:, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\stop.csv') # pd.DataFrame(targ[:, 0, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\targ.csv') # pd.DataFrame(inp[:, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\inp.csv') # pd.DataFrame(inp_len[:]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\inp_len.csv') # pd.DataFrame(targ_len[:]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\targ_len.csv') # pd.DataFrame(stop_targ[:, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\stop_targ.csv') # pd.DataFrame(mels_up[:, 0, 0:5]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\mels_up.csv') # pd.DataFrame(dec_out_up[:, 0, 0:5]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\dec_out_up.csv') if args.save_output_vars: import pandas as pd pd.DataFrame(mels[:, 0, 0:5]).to_csv( 
r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\mels.csv' ) pd.DataFrame(dec_out[:, 0, 0:5]).to_csv( r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\dec_out.csv' ) # import pandas as pd # print(emt_logit.shape, emt_labels.shape) # if len(emt_logit.shape)>2: # emt_logit = emt_logit.squeeze(1) # emt_up_logit = emt_up_logit.squeeze(1) # emt_labels = emt_labels.reshape(-1,1) # emt_up_labels = emt_up_labels.reshape(-1, 1) # spk_labels = spk_labels.reshape(-1, 1) # df = np.concatenate((emt_logit,emt_labels,spk_labels,emt_up_logit,emt_up_labels),axis=1) # print(emt_labels) # print(emt_logit) # print(emt_up_labels) # print(emt_up_logit) # # pd.DataFrame(df).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\emt_logit_.001_up_10k.csv') # raise time_window.append(time.time() - start_time) loss_window.append(loss) loss_bef_window.append(bef) loss_aft_window.append(aft) loss_stop_window.append(stop) loss_reg_window.append(reg) loss_emt_window.append(loss_emt) loss_spk_window.append(loss_spk) loss_orthog_window.append(loss_orthog) loss_up_emt_window.append(loss_up_emt) loss_up_spk_window.append(loss_up_spk) loss_mo_up_emt_window.append(loss_mo_up_emt) loss_mo_up_spk_window.append(loss_mo_up_spk) if args.nat_gan: d_loss_t_window.append(d_loss_t) d_loss_p_window.append(d_loss_p) d_loss_up_window.append(d_loss_up) g_loss_p_window.append(g_loss_p) g_loss_up_window.append(g_loss_up) message = 'Step {:7d} {:.3f} sec/step, tfr={:.3f}, loss={:.5f}, avg_loss={:.5f}, bef={:.5f}, aft={:.5f}, stop={:.5f}, reg={:.5f}'.format( step, time_window.average, tfr, loss, loss_window.average, loss_bef_window.average, loss_aft_window.average, loss_stop_window.average, loss_reg_window.average) if args.emt_attn: message += ' emt={:.5f}, spk={:.5f}, spk_l2={:.5f}'.format( loss_emt_window.average, loss_spk_window.average, loss_orthog_window.average) else: message += ' emt={:.5f}, spk={:.5f}, orthog={:.5f},'.format( loss_emt_window.average, loss_spk_window.average, loss_orthog_window.average) if args.unpaired: message += ' up_emt={:.5f}, up_spk={:.5f}, mo_up_emt={:.5f}, mo_up_spk={:.5f}'.format( loss_up_emt_window.average, loss_up_spk_window.average, loss_mo_up_emt_window.average, loss_mo_up_spk_window.average) if args.nat_gan: message += ' d_loss_t={:.5f}, d_loss_p ={:.5f}, d_loss_up ={:.5f}, g_loss_p ={:.5f}, g_loss_up ={:.5f}'.format( d_loss_t_window.average, d_loss_p_window.average, d_loss_up_window.average, g_loss_p_window.average, g_loss_up_window.average) log(message, end='\r', slack=(step % args.checkpoint_interval == 0)) if np.isnan(loss) or loss > 100.: log('Loss exploded to {:.5f} at step {}'.format( loss, step)) raise Exception('Loss exploded') if step % args.summary_interval == 0: log('\nWriting summary at step {}'.format(step)) summary_writer.add_summary(sess.run(stats), step) # if step % args.eval_interval == 0: # #Run eval and save eval stats # log('\nRunning evaluation and saving model at step {}'.format(step)) # saver.save(sess, checkpoint_path, global_step=global_step) # # eval_losses = [] # before_losses = [] # after_losses = [] # stop_token_losses = [] # linear_losses = [] # linear_loss = None # # if hparams.predict_linear: # for i in tqdm(range(feeder.test_steps)): # eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p, lin_t = sess.run([ # eval_model.tower_loss[0], eval_model.tower_before_loss[0], eval_model.tower_after_loss[0], # eval_model.tower_stop_token_loss[0], eval_model.tower_linear_loss[0], 
                                # eval_model.tower_mel_outputs[0][0],
                                # eval_model.tower_mel_targets[0][0], eval_model.tower_targets_lengths[0][0],
                                # eval_model.tower_alignments[0][0], eval_model.tower_linear_outputs[0][0],
                                # eval_model.tower_linear_targets[0][0],
                            # ])
                            # eval_losses.append(eloss)
                            # before_losses.append(before_loss)
                            # after_losses.append(after_loss)
                            # stop_token_losses.append(stop_token_loss)
                            # linear_losses.append(linear_loss)
                        # linear_loss = sum(linear_losses) / len(linear_losses)
                        #
                        # if hparams.GL_on_GPU:
                            # wav = sess.run(GLGPU_lin_outputs, feed_dict={GLGPU_lin_inputs: lin_p})
                            # wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
                        # else:
                            # wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
                        # audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate)
                    #
                    # else:
                        # for i in tqdm(range(feeder.test_steps)):
                            # eloss, before_loss, after_loss, stop_token_loss, input_seq, mel_p, mel_t, t_len, align = sess.run([
                                # eval_model.tower_loss[0], eval_model.tower_before_loss[0], eval_model.tower_after_loss[0],
                                # eval_model.tower_stop_token_loss[0], eval_model.tower_inputs[0][0], eval_model.tower_mel_outputs[0][0],
                                # eval_model.tower_mel_targets[0][0],
                                # eval_model.tower_targets_lengths[0][0], eval_model.tower_alignments[0][0]
                            # ])
                            # eval_losses.append(eloss)
                            # before_losses.append(before_loss)
                            # after_losses.append(after_loss)
                            # stop_token_losses.append(stop_token_loss)
                    #
                    # eval_loss = sum(eval_losses) / len(eval_losses)
                    # before_loss = sum(before_losses) / len(before_losses)
                    # after_loss = sum(after_losses) / len(after_losses)
                    # stop_token_loss = sum(stop_token_losses) / len(stop_token_losses)
                    #
                    # log('Saving eval log to {}..'.format(eval_dir))
                    # #Save some log to monitor model improvement on same unseen sequence
                    # if hparams.GL_on_GPU:
                        # wav = sess.run(GLGPU_mel_outputs, feed_dict={GLGPU_mel_inputs: mel_p})
                        # wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
                    # else:
                        # wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                    # audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate)
                    #
                    # input_seq = sequence_to_text(input_seq)
                    # plot.plot_alignment(align, os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)),
                        # title='{}, {}, step={}, loss={:.5f}\n{}'.format(args.model, time_string(), step, eval_loss, input_seq),
                        # max_len=t_len // hparams.outputs_per_step)
                    # plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)),
                        # title='{}, {}, step={}, loss={:.5f}\n{}'.format(args.model, time_string(), step, eval_loss, input_seq),
                        # target_spectrogram=mel_t, max_len=t_len)
                    #
                    # if hparams.predict_linear:
                        # plot.plot_spectrogram(lin_p, os.path.join(eval_plot_dir, 'step-{}-eval-linear-spectrogram.png'.format(step)),
                            # title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss),
                            # target_spectrogram=lin_t, max_len=t_len, auto_aspect=True)
                    #
                    # log('Step {:7d} [eval loss: {:.3f}, before loss: {:.3f}, after loss: {:.3f}, stop loss: {:.3f}]'.format(
                    #     step, eval_loss, before_loss, after_loss, stop_token_loss))
                    #
                    # log('Writing eval summary!')
                    # add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, eval_loss)

                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300:
                    #Save model and current global step
                    saver.save(sess, checkpoint_path, global_step=global_step)
                    log('\nSaved model at step {}'.format(step))

                if step % args.eval_interval == 0:
                    if hparams.predict_linear:
                        raise ValueError('predict linear not implemented')
                        # input_seq, mel_prediction, linear_prediction, alignment, target, target_length, linear_target = sess.run([
                        #     model.tower_inputs[0][0],
                        #     model.tower_mel_outputs[0][0],
                        #     model.tower_linear_outputs[0][0],
                        #     model.tower_alignments[0][0],
                        #     model.tower_mel_targets[0][0],
                        #     model.tower_targets_lengths[0][0],
                        #     model.tower_linear_targets[0][0],
                        # ])
                        #
                        # #save predicted linear spectrogram to disk (debug)
                        # linear_filename = 'linear-prediction-step-{}.npy'.format(step)
                        # np.save(os.path.join(linear_dir, linear_filename), linear_prediction.T, allow_pickle=False)
                        #
                        # #save griffin lim inverted wav for debug (linear -> wav)
                        # if hparams.GL_on_GPU:
                        #     wav = sess.run(GLGPU_lin_outputs, feed_dict={GLGPU_lin_inputs: linear_prediction})
                        #     wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
                        # else:
                        #     wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams)
                        # audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate)
                        #
                        # #Save real and predicted linear-spectrogram plot to disk (control purposes)
                        # plot.plot_spectrogram(linear_prediction, os.path.join(plot_dir, 'step-{}-linear-spectrogram.png'.format(step)),
                        #     title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss),
                        #     target_spectrogram=linear_target, max_len=target_length, auto_aspect=True)
                    else:
                        input_seqs, mels, alignments, \
                            stop_tokens = sess.run([eval_model.tower_inputs, eval_model.tower_mel_outputs,
                                                    eval_model.tower_alignments, eval_model.tower_stop_token_prediction],
                                                   feed_dict=eval_feed_dict)

                        # num_evals = len(input_seqs) if False else 1
                        # for i in range(num_evals):
                        #     input_seq = input_seqs[i]
                        #     mel_prediction = mel_predictions[i]
                        #     alignment = alignments[i]
                        #     target = targets[i]
                        #     target_length = target_lengths[i]
                        #     emt = emts[i]
                        #     spk = spks[i]
                        #     if args.emt_attn and args.attn == 'simple':
                        #         alignment_emt = alignments_emt[0][i]

                        # Linearize outputs (n_gpus -> 1D)
                        inp = [inp for gpu_inp in input_seqs for inp in gpu_inp]
                        mels = [mel for gpu_mels in mels for mel in gpu_mels]
                        # targets = [target for gpu_targets in targets for target in gpu_targets]
                        alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
                        stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]

                        try:
                            target_lengths = get_output_lengths(stop_tokens)

                            # Take off the batch wise padding
                            mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
                            T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (0, hparams.max_abs_value)
                            mels = [np.clip(m, T2_output_range[0], T2_output_range[1]) for m in mels]

                            folder_bucket = 'step_{}'.format(step // 500)
                            folder_wavs_save = os.path.join(wav_dir, folder_bucket)
                            folder_plot_save = os.path.join(plot_dir, folder_bucket)
                            os.makedirs(folder_wavs_save, exist_ok=True)
                            os.makedirs(folder_plot_save, exist_ok=True)

                            for i, (mel, align, basename, basename_ref) in enumerate(zip(mels, alignments, basenames, basenames_refs)):
                                #save griffin lim inverted wav for debug (mel -> wav)
                                if hparams.GL_on_GPU:
                                    wav = sess.run(GLGPU_mel_outputs, feed_dict={GLGPU_mel_inputs: mel})
                                    wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
                                else:
                                    wav = audio.inv_mel_spectrogram(mel.T, hparams)
                                audio.save_wav(wav, os.path.join(folder_wavs_save, 'step_{}_wav_{}_{}_{}.wav'.format(step, i, basename, basename_ref)),
                                    sr=hparams.sample_rate)

                                input_seq = sequence_to_text(inp[i])

                                #save alignment plot to disk (control purposes)
                                try:
                                    plot.plot_alignment(align, os.path.join(folder_plot_save, 'step_{}_wav_{}_{}_{}_align.png'.format(step, i, basename, basename_ref)),
                                        title='{}, {}, step={}\n{}'.format(args.model, time_string(), step, input_seq),
                                        max_len=target_lengths[i] // hparams.outputs_per_step)
                                except Exception:
                                    print("failed to plot alignment")

                                try:
                                    #save real and predicted mel-spectrogram plot to disk (control purposes)
                                    plot.plot_spectrogram(mel, os.path.join(folder_plot_save, 'step-{}-{}-mel-spectrogram.png'.format(step, i)),
                                        title='{}, {}, step={}\n{}'.format(args.model, time_string(), step, input_seq))
                                        # target_spectrogram=targets[i],
                                        # max_len=target_lengths[i])
                                except Exception:
                                    print("failed to plot spectrogram")

                            log('Saved synthesized samples for step {}'.format(step), end='\r')
                        except Exception:
                            print("Couldn't synthesize samples")

                    # log('Input at step {}: {}'.format(step, input_seq), end='\r')

                # if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
                #     #Get current checkpoint state
                #     checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                #
                #     #Update Projector
                #     log('\nSaving Model Character Embeddings visualization..')
                #     add_embedding_stats(summary_writer, [model.embedding_table.name], [char_embedding_meta], checkpoint_state.model_checkpoint_path)
                #     log('Tacotron Character embeddings have been updated on tensorboard!')

            log('Tacotron training complete after {} global steps!'.format(args.tacotron_train_steps), slack=True)
            return save_dir

        except Exception as e:
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
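
# The synthesis block above calls a helper, get_output_lengths(), that is referenced but not
# defined in this file. The sketch below is a minimal, hypothetical version of such a helper,
# assuming stop_tokens holds one per-frame stop-probability vector per utterance and that
# decoding is considered finished at the first frame whose stop probability crosses 0.5.
# It is illustrative only, not the repo's actual implementation.
def get_output_lengths_sketch(stop_tokens, threshold=0.5):
    """Return, per utterance, the index of the first frame whose stop-token probability
    exceeds `threshold`, falling back to the full length when no frame does."""
    lengths = []
    for tokens in stop_tokens:
        tokens = np.asarray(tokens).reshape(-1)
        above = np.where(tokens > threshold)[0]
        lengths.append(int(above[0]) + 1 if len(above) else len(tokens))
    return lengths
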
def train(path, args):
    tf.reset_default_graph()    # reset graph
    timestamp = time_string() if args.time_string is None else args.time_string

    # draw graph
    feeder = Feeder(args.train_filename, args, hparams)

    output_classes = (max([int(f) for f in feeder.total_emt]) + 1 if args.model_type in ['emt', 'accent']
                      else max([int(f) for f in feeder.total_spk]) + 1)

    batch = tf.placeholder(shape=[args.N * args.M, None, config.n_mels], dtype=tf.float32)  # input batch (time x batch x n_mel)
    labels = tf.placeholder(shape=[args.N * args.M], dtype=tf.int32)
    lr = tf.placeholder(dtype=tf.float32)  # learning rate
    global_step = tf.Variable(0, name='global_step', trainable=False)
    w = tf.get_variable("w", initializer=np.array([10], dtype=np.float32))
    b = tf.get_variable("b", initializer=np.array([-5], dtype=np.float32))

    # embedded = triple_lstm(batch)
    print("Training {} Discriminator Model".format(args.model_type))

    encoder = ReferenceEncoder(
        filters=hparams.reference_filters,
        kernel_size=(3, 3),
        strides=(2, 2),
        is_training=True,
        scope='Tacotron_model/inference/pretrained_ref_enc_{}'.format(args.model_type),
        depth=hparams.reference_depth)  # [N, 128]
    embedded = encoder(batch)
    embedded = normalize(embedded)

    if args.discriminator:
        logit = tf.layers.dense(embedded, output_classes,
            name='Tacotron_model/inference/pretrained_ref_enc_{}_dense'.format(args.model_type))
        labels_one_hot = tf.one_hot(tf.to_int32(labels), output_classes)
        # loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logit, labels=labels_one_hot))
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logit, labels=labels_one_hot))
        acc, acc_op = tf.metrics.accuracy(labels=tf.argmax(labels_one_hot, 1), predictions=tf.argmax(logit, 1))
        val_acc, val_acc_op = tf.metrics.accuracy(labels=tf.argmax(labels_one_hot, 1), predictions=tf.argmax(logit, 1))
    else:
        # loss
        sim_matrix = similarity(embedded, w, b, args.N, args.M, P=hparams.reference_depth)
        print("similarity matrix size: ", sim_matrix.shape)
        loss = loss_cal(sim_matrix, args.N, args.M, type=config.loss)
        val_acc_op = tf.constant(1.)

    # optimizer operation
    trainable_vars = tf.trainable_variables()               # get variable list
    optimizer = optim(lr)                                   # get optimizer (type is determined by configuration)
    grads, vars = zip(*optimizer.compute_gradients(loss))   # compute gradients of variables with respect to loss

    if args.discriminator:
        grads_rescale = grads
    else:
        grads_clip, _ = tf.clip_by_global_norm(grads, 3.0)  # l2 norm clipping by 3
        grads_rescale = [0.01 * grad for grad in grads_clip[:2]] + grads_clip[2:]  # smaller gradient scale for w, b

    train_op = optimizer.apply_gradients(zip(grads_rescale, vars), global_step=global_step)  # gradient update operation

    # check variables memory
    variable_count = np.sum(np.array([np.prod(np.array(v.get_shape().as_list())) for v in trainable_vars]))
    print("total variables :", variable_count)

    # record loss
    loss_summary = tf.summary.scalar("loss", loss)
    merged = tf.summary.merge_all()
    saver = tf.train.Saver(max_to_keep=20)

    loss_window = ValueWindow(100)
    acc_window = ValueWindow(100)
    val_loss_window = ValueWindow(5)
    val_acc_window = ValueWindow(5)

    # training session
    with tf.Session() as sess:
        tf.local_variables_initializer().run()
        tf.global_variables_initializer().run()

        checkpoint_folder = os.path.join(path, "checkpoints", timestamp)
        logs_folder = os.path.join(path, "logs", timestamp)
        os.makedirs(checkpoint_folder, exist_ok=True)  # make folder to save model
        os.makedirs(logs_folder, exist_ok=True)        # make folder to save log
        model_name = '{}_disc_model.ckpt'.format(args.model_type)
        checkpoint_path = os.path.join(checkpoint_folder, model_name)

        if args.restore:
            checkpoint_state = tf.train.get_checkpoint_state(checkpoint_folder)
            if (checkpoint_state and checkpoint_state.model_checkpoint_path):
                print('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path))
                saver.restore(sess, checkpoint_state.model_checkpoint_path)
            else:
                print('No model to load at {}'.format(checkpoint_folder))
                saver.save(sess, checkpoint_path, global_step=global_step)
        else:
            print('Starting new training!')
            saver.save(sess, checkpoint_path, global_step=global_step)

        writer = tf.summary.FileWriter(logs_folder, sess.graph)
        lr_factor = 1  # lr decay factor

        iterations = 30000 if args.model_type == 'emt' else config.iteration
        for iter in range(iterations):
            if args.discriminator:
                batch_iter, _, labels_iter = feeder.random_batch_disc()
            else:
                batch_iter, _, labels_iter = feeder.random_batch()

            # run forward and backward propagation and update parameters
            step, _, loss_cur, summary, acc_cur = sess.run([global_step, train_op, loss, merged, acc_op],
                feed_dict={batch: batch_iter, labels: labels_iter, lr: config.lr * lr_factor})
            loss_window.append(loss_cur)
            acc_window.append(acc_cur)

            if step % 10 == 0:
                writer.add_summary(summary, step)  # write at tensorboard

            if (step + 1) % 20 == 0:
                val_loss_cur_batch = 0
                val_acc_cur_batch = 0
                for val_iter in range(VAL_ITERS):
                    if args.discriminator:
                        batch_iter, _, labels_iter = feeder.random_batch_disc(TEST=True)
                    else:
                        batch_iter, _, labels_iter = feeder.random_batch(TEST=True)

                    # validation forward pass only (no parameter update)
                    val_loss_cur, val_acc_cur = sess.run([loss, val_acc_op],
                        feed_dict={batch: batch_iter, labels: labels_iter})
                    val_loss_cur_batch += val_loss_cur
                    val_acc_cur_batch += val_acc_cur
                val_loss_cur_batch /= VAL_ITERS
                val_acc_cur_batch /= VAL_ITERS
                val_loss_window.append(val_loss_cur_batch)
                val_acc_window.append(val_acc_cur_batch)

                message = "(iter : %d) loss: %.4f" % ((step + 1), loss_window.average)
                if args.discriminator:
                    message += ', acc: {:.2f}%'.format(acc_window.average)
                message += ", val_loss: %.4f" % (val_loss_window.average)
                if args.discriminator:
                    message += ', val_acc: {:.2f}%'.format(val_acc_window.average)
                print(message)

            lr_changed = False
            if args.model_type == 'emt':
                if step > 6000:
                    lr_changed = True if lr_factor != .01 else False
                    lr_factor = .01
                elif step > 4000:
                    lr_changed = True if lr_factor != .1 else False
                    lr_factor = .1
                if lr_changed:
                    print("learning rate is decayed! current lr : ", config.lr * lr_factor)
            elif args.model_type == 'spk':
                if step > 300:  #4000:
                    lr_changed = True if lr_factor != .01 else False
                    lr_factor = .01
                elif step > 180:  #2500:
                    lr_changed = True if lr_factor != .1 else False
                    lr_factor = .1
                if lr_changed:
                    print("learning rate is decayed! current lr : ", config.lr * lr_factor)

            if step % config.save_checkpoint_iters == 0:
                saver.save(sess, checkpoint_path, global_step=global_step)
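
# The `similarity` and `loss_cal` helpers used in the non-discriminator branch above are
# defined elsewhere in the repo and appear to follow the GE2E (generalized end-to-end)
# speaker-verification loss: embeddings for N speakers x M utterances are compared against
# per-speaker centroids, scaled by a learned w and shifted by a learned b. The NumPy sketch
# below only illustrates that style of computation under those assumptions; it is not the
# repo's implementation (in particular, full GE2E also excludes each utterance from its own
# speaker centroid, which is omitted here for brevity).
def ge2e_similarity_sketch(embedded, w, b, N, M):
    """embedded: [N*M, P] L2-normalized embeddings -> [N*M, N] scaled cosine similarities."""
    emb = embedded.reshape(N, M, -1)                       # [N, M, P]
    centroids = emb.mean(axis=1)                           # [N, P] per-speaker centroids
    centroids /= np.linalg.norm(centroids, axis=1, keepdims=True)
    sim = embedded @ centroids.T                           # cosine similarity (inputs already normalized)
    return w * sim + b                                     # learned scale and bias
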
def train(log_dir, args, hparams, input_path):
    save_dir = os.path.join(log_dir, 'wave_pretrained/')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    audio_dir = os.path.join(log_dir, 'wavs')
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    eval_audio_dir = os.path.join(eval_dir, 'wavs')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    checkpoint_path = os.path.join(save_dir, 'wavenet_model.ckpt')
    input_path = os.path.join(args.base_dir, input_path)
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(audio_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(eval_audio_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)

    log('Checkpoint_path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    #Start by setting a seed for repeatability
    tf.set_random_seed(hparams.wavenet_random_seed)

    #Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, input_path, args.base_dir, hparams)

    #Set up model
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    #book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    sh_saver = create_shadow_saver(model, global_step)

    log('Wavenet training set to a maximum of {} steps'.format(args.wavenet_train_steps))

    #Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    #Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            #saved model restoring
            checkpoint_state = None  # avoid a NameError below when no restore is requested
            if args.restore:
                #Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e))

            if (checkpoint_state and checkpoint_state.model_checkpoint_path):
                log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path))
                load_averaged_model(sess, sh_saver, checkpoint_state.model_checkpoint_path)
            else:
                if not args.restore:
                    log('Starting new training!')
                else:
                    log('No model to load at {}'.format(save_dir))

            #initializing feeder
            feeder.start_threads(sess)

            #Training loop
            while not coord.should_stop() and step < args.wavenet_train_steps:
                start_time = time.time()
                step, y_hat, loss, opt = sess.run([global_step, model.y_hat, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)

                message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end='\r')

                if loss > 100 or np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.checkpoint_interval == 0:
                    save_log(sess, step, model, plot_dir, audio_dir, hparams=hparams)
                    save_checkpoint(sess, sh_saver, checkpoint_path, global_step)

                if step % args.eval_interval == 0:
                    log('\nEvaluating at step {}'.format(step))
                    eval_step(sess, step, eval_model, eval_plot_dir, eval_audio_dir,
                              summary_writer=summary_writer, hparams=model._hparams)

            log('Wavenet training complete after {} global steps'.format(args.wavenet_train_steps))

        except Exception as e:
            log('Exiting due to Exception: {}'.format(e))
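
# create_shadow_saver() and load_averaged_model() above come from the WaveNet package; they
# typically wrap an exponential moving average (EMA) of the model weights so that checkpoints
# store and restore the smoothed "shadow" values rather than the raw training weights. The
# sketch below is a minimal TF 1.x illustration of that pattern, under the assumption that the
# model exposes an `ema` attribute built with tf.train.ExponentialMovingAverage; it is not the
# repo's actual implementation.
def create_shadow_saver_sketch(model, global_step=None):
    # map shadow (averaged) variable names to the live variables so that restoring a
    # checkpoint loads the EMA values into the graph
    shadow_dict = model.ema.variables_to_restore()
    if global_step is not None:
        shadow_dict['global_step'] = global_step
    return tf.train.Saver(shadow_dict, max_to_keep=5)

def load_averaged_model_sketch(sess, saver, checkpoint_path):
    # restore the EMA (shadow) weights from the checkpoint into the live session
    saver.restore(sess, checkpoint_path)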