def run_eval(args, checkpoint, hparams, sentences):
    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        # get_checkpoint_state returns None when no checkpoint is found.
        raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
        raise ValueError(
            'Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'
            .format(hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
        raise ValueError(
            'Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'
            .format(hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    output_dir = get_synthesis_output_dir(args.caching_dir)
    eval_dir = get_evals_dir(args.caching_dir)
    log_dir = os.path.join(output_dir, 'logs-eval')

    # Create output paths if they don't exist.
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer(args.caching_dir)
    synth.load(checkpoint_path, hparams)

    # Group inputs into batches.
    sentences = [sentences[i:i + hparams.tacotron_synthesis_batch_size]
                 for i in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)]

    log('Starting Synthesis')
    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, texts in enumerate(tqdm(sentences)):
            basenames = ['batch_{}_sentence_{}'.format(i, j) for j in range(len(texts))]
            mel_filenames, speaker_ids = synth.synthesize(texts, basenames, eval_dir, log_dir, None)

            # Each map line: text|mel_filename|speaker_id
            for elems in zip(texts, mel_filenames, speaker_ids):
                file.write('|'.join([str(x) for x in elems]) + '\n')

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
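
# Illustrative helper (not part of the original file): the slicing idiom used
# above to split `sentences` into fixed-size batches, written out once for
# clarity. It assumes only that `items` supports len() and slicing.
def _batch(items, batch_size):
    """Split a sequence into consecutive chunks of at most batch_size items."""
    return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]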
def run_synthesis(args, checkpoint, hparams):
    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
        raise ValueError(
            'Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'
            .format(hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
        raise ValueError(
            'Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'
            .format(hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    gta = args.GTA == 'True'
    synth_dir = get_synth_dir(args.caching_dir, gta)
    gta_map_file = get_gta_map_file(synth_dir)

    # Create output path if it doesn't exist.
    os.makedirs(synth_dir, exist_ok=True)

    metadata_path = get_train_txt(args.caching_dir)
    metadata = load_meta(metadata_path)
    log(hparams_debug_string())
    synth = Synthesizer(args.caching_dir)
    synth.load(checkpoint_path, hparams, gta=gta)

    frame_shift = hparams.hop_size / hparams.sample_rate  # seconds per mel frame
    hours = sum([int(x[2]) for x in metadata]) * frame_shift / 3600
    log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours))

    # Group inputs into batches.
    metadata = [metadata[i:i + hparams.tacotron_synthesis_batch_size]
                for i in range(0, len(metadata), hparams.tacotron_synthesis_batch_size)]

    log('Starting Synthesis')
    txt_dir = get_txt_dir(args.caching_dir)
    mel_dir = get_mel_dir(args.caching_dir)
    wav_dir = get_wav_dir(args.caching_dir)
    symbol_file = get_symbols_file(args.caching_dir)
    conv = get_from_file(symbol_file)

    with open(gta_map_file, 'w') as file:
        for i, meta in enumerate(tqdm(metadata)):
            # NOTE: as written, only every 10th batch is synthesized and mapped
            # (likely a subsampling/debug choice).
            if i % 10 == 0:
                text_paths = [os.path.join(txt_dir, '{}.npy'.format(m[0])) for m in meta]
                text_symbols = [np.load(pth) for pth in text_paths]
                # Recover plain text from symbol ids (trims the trailing '~').
                texts = [conv.sequence_to_original_text(x) for x in text_symbols]
                mel_filenames = [os.path.join(mel_dir, '{}.npy'.format(m[0])) for m in meta]
                wav_filenames = [os.path.join(wav_dir, '{}.npy'.format(m[0])) for m in meta]
                basenames = [m[0] for m in meta]
                mel_output_filenames, speaker_ids = synth.synthesize(
                    texts, basenames, synth_dir, None, mel_filenames)

                # Each map line: wav|mel|mel_output|speaker_id|text
                for elems in zip(wav_filenames, mel_filenames, mel_output_filenames, speaker_ids, texts):
                    file.write('|'.join([str(x) for x in elems]) + '\n')

    log('synthesized mel spectrograms at {}'.format(synth_dir))
    return gta_map_file
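
# Worked example of the duration math above (values illustrative): with
# hop_size=275 and sample_rate=22050, each mel frame covers 275/22050 ≈ 12.5 ms,
# so 1,000,000 frames ≈ 1e6 * 0.01247 / 3600 ≈ 3.47 hours. This assumes
# metadata column 2 holds the mel frame count per example, as the code implies.
def _corpus_hours(metadata, hop_size, sample_rate):
    frame_shift = hop_size / sample_rate  # seconds per frame
    return sum(int(x[2]) for x in metadata) * frame_shift / 3600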
def run_live(args, checkpoint, hparams):
    # Used when args.mode is neither 'eval' nor 'synthesis'.
    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
        raise ValueError(
            'Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'
            .format(hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
        raise ValueError(
            'Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'
            .format(hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

    # Log to terminal without keeping any records in files.
    log(hparams_debug_string())
    synth = Synthesizer(args.caching_dir)
    synth.load(checkpoint_path, hparams)

    # Generate a quick greeting message.
    greetings = 'Hello, Welcome to the Live testing tool. Please type a message and I will try to read it!'
    log(greetings)
    generate_fast(synth, greetings)

    # Interaction loop.
    while True:
        try:
            text = input()
            generate_fast(synth, text)
        except KeyboardInterrupt:
            leave = 'Thank you for testing our features. See you soon.'
            log(leave)
            generate_fast(synth, leave)
            sleep(2)
            break
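
# generate_fast() is called above but not defined in this section. A minimal
# sketch, assuming Synthesizer.synthesize has the (texts, basenames, out_dir,
# log_dir, mel_filenames) signature seen in run_eval; passing None for the
# output paths keeps live synthesis from writing wavs/plots to disk.
def generate_fast(model, text):
    model.synthesize([text], None, None, None, None)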
def train(log_dir, args, hparams):
    symbol_file = get_symbols_file(args.caching_dir)
    symbol_converter = get_from_file(symbol_file)
    symbols_count = symbol_converter.get_symbols_count()

    save_dir = get_save_dir(log_dir)
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    mel_dir = os.path.join(log_dir, 'mel-spectrograms')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    tensorboard_dir = os.path.join(log_dir, 'tacotron_events')
    meta_folder = os.path.join(log_dir, 'metas')
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
    caching_dir = args.caching_dir

    if hparams.predict_linear:
        linear_dir = os.path.join(log_dir, 'linear-spectrograms')
        os.makedirs(linear_dir, exist_ok=True)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(caching_dir))
    log('Using model: {}'.format('tacotron'))
    log(hparams_debug_string())

    # Start by setting a seed for repeatability.
    tf.set_random_seed(hparams.tacotron_random_seed)

    # Set up data feeder.
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, caching_dir, hparams)

    # Set up model.
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step, symbols_count)
    eval_model = model_test_mode(args, feeder, hparams, global_step, symbols_count)

    # Potential Griffin-Lim GPU setup.
    if hparams.GL_on_GPU:
        GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels), name='GLGPU_mel_inputs')
        GLGPU_lin_inputs = tf.placeholder(tf.float32, (None, hparams.num_freq), name='GLGPU_lin_inputs')
        GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(GLGPU_mel_inputs, hparams)
        GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(GLGPU_lin_inputs, hparams)

    # Bookkeeping.
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=20)

    log('Tacotron training set to a maximum of {} steps'.format(args.tacotron_train_steps))

    # Allocate GPU memory as needed.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Train.
    sess = tf.Session(config=config)
    summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
    sess.run(tf.global_variables_initializer())

    # Restore a saved model if the user requested it (default = True).
    if args.restore:
        try:
            checkpoint_state = tf.train.get_checkpoint_state(save_dir)
            if checkpoint_state and checkpoint_state.model_checkpoint_path:
                log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True)
                saver.restore(sess, checkpoint_state.model_checkpoint_path)
            else:
                log('No model to load at {}'.format(save_dir), slack=True)
                saver.save(sess, checkpoint_path, global_step=global_step)
        except tf.errors.OutOfRangeError as e:
            log('Cannot restore checkpoint: {}'.format(e), slack=True)
    else:
        log('Starting new training!', slack=True)
        saver.save(sess, checkpoint_path, global_step=global_step)

    # Initialize feeder threads.
    feeder.start_threads(sess)

    # Training loop.
    while not coord.should_stop() and step < args.tacotron_train_steps:
        start_time = time.time()
        step, loss, opt = sess.run([global_step, model.loss, model.optimize])
        time_window.append(time.time() - start_time)
        loss_window.append(loss)
        message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
            step, time_window.average, loss, loss_window.average)
        log(message, end='\r', slack=(step % args.checkpoint_interval == 0))

        if np.isnan(loss) or loss > 100.:
            log('Loss exploded to {:.5f} at step {}'.format(loss, step))
            raise Exception('Loss exploded')

        if step % args.summary_interval == 0:
            log('\nWriting summary at step {}'.format(step))
            summary_writer.add_summary(sess.run(stats), step)

        if step % args.eval_interval == 0:
            # Run eval and save eval stats.
            log('\nRunning evaluation at step {}'.format(step))
            if feeder.test_steps == 0:
                log('zero test steps, skipping...')
            else:
                eval_losses = []
                before_losses = []
                after_losses = []
                stop_token_losses = []
                linear_losses = []
                linear_loss = None

                if hparams.predict_linear:
                    for i in tqdm(range(feeder.test_steps)):
                        (eloss, before_loss, after_loss, stop_token_loss, linear_loss,
                         mel_p, mel_t, t_len, align, lin_p, lin_t) = sess.run([
                            eval_model.tower_loss[0],
                            eval_model.tower_before_loss[0],
                            eval_model.tower_after_loss[0],
                            eval_model.tower_stop_token_loss[0],
                            eval_model.tower_linear_loss[0],
                            eval_model.tower_mel_outputs[0][0],
                            eval_model.tower_mel_targets[0][0],
                            eval_model.tower_targets_lengths[0][0],
                            eval_model.tower_alignments[0][0],
                            eval_model.tower_linear_outputs[0][0],
                            eval_model.tower_linear_targets[0][0],
                        ])
                        eval_losses.append(eloss)
                        before_losses.append(before_loss)
                        after_losses.append(after_loss)
                        stop_token_losses.append(stop_token_loss)
                        linear_losses.append(linear_loss)

                    if len(linear_losses) != 0:
                        linear_loss = sum(linear_losses) / len(linear_losses)
                    else:
                        linear_loss = 0
                        log('len(linear_losses) was 0')

                    if hparams.GL_on_GPU:
                        wav = sess.run(GLGPU_lin_outputs, feed_dict={GLGPU_lin_inputs: lin_p})
                        wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
                    else:
                        wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(eval_wav_dir, 'step-{}-eval-wave-from-linear.wav'.format(step)),
                        sr=hparams.sample_rate)
                else:
                    for i in tqdm(range(feeder.test_steps)):
                        eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, align = sess.run([
                            eval_model.tower_loss[0],
                            eval_model.tower_before_loss[0],
                            eval_model.tower_after_loss[0],
                            eval_model.tower_stop_token_loss[0],
                            eval_model.tower_mel_outputs[0][0],
                            eval_model.tower_mel_targets[0][0],
                            eval_model.tower_targets_lengths[0][0],
                            eval_model.tower_alignments[0][0],
                        ])
                        eval_losses.append(eloss)
                        before_losses.append(before_loss)
                        after_losses.append(after_loss)
                        stop_token_losses.append(stop_token_loss)

                # Average the collected eval losses (guard against empty lists).
                if len(eval_losses) != 0:
                    eval_loss = sum(eval_losses) / len(eval_losses)
                else:
                    eval_loss = 0
                    log('len(eval_losses) was 0')
                if len(before_losses) != 0:
                    before_loss = sum(before_losses) / len(before_losses)
                else:
                    before_loss = 0
                    log('len(before_losses) was 0')
                if len(after_losses) != 0:
                    after_loss = sum(after_losses) / len(after_losses)
                else:
                    after_loss = 0
                    log('len(after_losses) was 0')
                if len(stop_token_losses) != 0:
                    stop_token_loss = sum(stop_token_losses) / len(stop_token_losses)
                else:
                    stop_token_loss = 0
                    log('len(stop_token_losses) was 0')

                log('Saving eval log to {}..'.format(eval_dir))
                # Save logs to monitor model improvement on the same unseen sequence.
                if hparams.GL_on_GPU:
                    wav = sess.run(GLGPU_mel_outputs, feed_dict={GLGPU_mel_inputs: mel_p})
                    wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
                else:
                    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                audio.save_wav(
                    wav,
                    os.path.join(eval_wav_dir, 'step-{}-eval-wave-from-mel.wav'.format(step)),
                    sr=hparams.sample_rate)

                plot.plot_alignment(
                    align,
                    os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)),
                    title='{}, {}, step={}, loss={:.5f}'.format('tacotron', time_string(), step, eval_loss),
                    max_len=t_len // hparams.outputs_per_step)
                plot.plot_spectrogram(
                    mel_p,
                    os.path.join(eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)),
                    title='{}, {}, step={}, loss={:.5f}'.format('tacotron', time_string(), step, eval_loss),
                    target_spectrogram=mel_t,
                    max_len=t_len)
                if hparams.predict_linear:
                    plot.plot_spectrogram(
                        lin_p,
                        os.path.join(eval_plot_dir, 'step-{}-eval-linear-spectrogram.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format('tacotron', time_string(), step, eval_loss),
                        target_spectrogram=lin_t,
                        max_len=t_len,
                        auto_aspect=True)

                log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss))
                log('Writing eval summary!')
                add_eval_stats(summary_writer, step, linear_loss, before_loss,
                               after_loss, stop_token_loss, eval_loss)

        if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300:
            # Save model and current global step (step 300 adds an early checkpoint).
            saver.save(sess, checkpoint_path, global_step=global_step)

            log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..')
            if hparams.predict_linear:
                (input_seq, mel_prediction, linear_prediction, alignment, target,
                 target_length, linear_target) = sess.run([
                    model.tower_inputs[0][0],
                    model.tower_mel_outputs[0][0],
                    model.tower_linear_outputs[0][0],
                    model.tower_alignments[0][0],
                    model.tower_mel_targets[0][0],
                    model.tower_targets_lengths[0][0],
                    model.tower_linear_targets[0][0],
                ])

                # Save predicted linear spectrogram to disk (debug).
                linear_filename = 'linear-prediction-step-{}.npy'.format(step)
                np.save(os.path.join(linear_dir, linear_filename), linear_prediction.T, allow_pickle=False)

                # Save griffin-lim inverted wav for debug (linear -> wav).
                if hparams.GL_on_GPU:
                    wav = sess.run(GLGPU_lin_outputs, feed_dict={GLGPU_lin_inputs: linear_prediction})
                    wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
                else:
                    wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams)
                audio.save_wav(
                    wav,
                    os.path.join(wav_dir, 'step-{}-wave-from-linear.wav'.format(step)),
                    sr=hparams.sample_rate)

                # Save real and predicted linear-spectrogram plot to disk (control purposes).
                plot.plot_spectrogram(
                    linear_prediction,
                    os.path.join(plot_dir, 'step-{}-linear-spectrogram.png'.format(step)),
                    title='{}, {}, step={}, loss={:.5f}'.format('tacotron', time_string(), step, loss),
                    target_spectrogram=linear_target,
                    max_len=target_length,
                    auto_aspect=True)
            else:
                input_seq, mel_prediction, alignment, target, target_length = sess.run([
                    model.tower_inputs[0][0],
                    model.tower_mel_outputs[0][0],
                    model.tower_alignments[0][0],
                    model.tower_mel_targets[0][0],
                    model.tower_targets_lengths[0][0],
                ])

            # Save predicted mel spectrogram to disk (debug).
            mel_filename = 'mel-prediction-step-{}.npy'.format(step)
            np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False)

            # Save griffin-lim inverted wav for debug (mel -> wav).
            if hparams.GL_on_GPU:
                wav = sess.run(GLGPU_mel_outputs, feed_dict={GLGPU_mel_inputs: mel_prediction})
                wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
            else:
                wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
            audio.save_wav(
                wav,
                os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)),
                sr=hparams.sample_rate)

            # Save alignment plot to disk (control purposes).
            plot.plot_alignment(
                alignment,
                os.path.join(plot_dir, 'step-{}-align.png'.format(step)),
                title='{}, {}, step={}, loss={:.5f}'.format('tacotron', time_string(), step, loss),
                max_len=target_length // hparams.outputs_per_step)

            # Save real and predicted mel-spectrogram plot to disk (control purposes).
            plot.plot_spectrogram(
                mel_prediction,
                os.path.join(plot_dir, 'step-{}-mel-spectrogram.png'.format(step)),
                title='{}, {}, step={}, loss={:.5f}'.format('tacotron', time_string(), step, loss),
                target_spectrogram=target,
                max_len=target_length)

            original_text = symbol_converter.sequence_to_text(input_seq)
            log('Input at step {}: {}'.format(step, original_text))

        if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
            # Get current checkpoint state.
            checkpoint_state = tf.train.get_checkpoint_state(save_dir)

            # Update projector.
            log('\nSaving Model Character Embeddings visualization..')
            add_embedding_stats(summary_writer, [model.embedding_table.name],
                                checkpoint_state.model_checkpoint_path)
            log('Tacotron Character embeddings have been updated on tensorboard!')

    # Shut down cleanly.
    log('Training loop finished; requesting stop.')
    coord.request_stop()
    coord.wait_for_stop()
    # feeder.close_queue() and coord.join(threads) are intentionally skipped:
    # in this setup they can block and never return.
    try:
        log('Closing session and resetting graph.')
        sess.close()
        tf.reset_default_graph()
    except Exception as e:
        log('Session cleanup failed: {}'.format(e))
    sleep(0.5)

    log('Tacotron training complete after {} global steps!'.format(args.tacotron_train_steps), slack=True)
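
# ValueWindow (imported elsewhere in this repo) backs the rolling sec/step and
# loss averages in the training loops. A minimal sketch of the assumed
# behavior: keep the last `window_size` values and expose their mean.
class _ValueWindowSketch:
    def __init__(self, window_size=100):
        self._window_size = window_size
        self._values = []

    def append(self, x):
        # Drop the oldest value once the window is full.
        self._values = self._values[-(self._window_size - 1):] + [x]

    @property
    def average(self):
        return sum(self._values) / max(len(self._values), 1)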
def train(log_dir, args, hparams):
    # WaveNet counterpart of the Tacotron train() above (lives in the vocoder
    # module, hence the duplicated function name).
    save_dir = get_save_dir(log_dir)
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    tensorboard_dir = os.path.join(log_dir, 'wavenet_events')
    meta_folder = os.path.join(log_dir, 'metas')
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'wavenet_model.ckpt')

    gta = args.GTA == 'True'
    synth_dir = get_synth_dir(args.caching_dir, gta)
    gta_map_file = get_gta_map_file(synth_dir)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(gta_map_file))
    log('Using model: {}'.format('WaveNet'))
    log(hparams_debug_string())

    # Start by setting a seed for repeatability.
    tf.set_random_seed(hparams.wavenet_random_seed)

    # Set up data feeder.
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, gta_map_file, hparams)

    # Set up model.
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    # Speaker embeddings metadata.
    if hparams.speakers_path is not None:
        speaker_embedding_meta = hparams.speakers_path
    else:
        speaker_embedding_meta = os.path.join(meta_folder, 'SpeakerEmbeddings.tsv')
        if not os.path.isfile(speaker_embedding_meta):
            with open(speaker_embedding_meta, 'w', encoding='utf-8') as f:
                for speaker in hparams.speakers:
                    f.write('{}\n'.format(speaker))
        speaker_embedding_meta = speaker_embedding_meta.replace(log_dir, '..')

    # Bookkeeping.
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    sh_saver = create_shadow_saver(model, global_step)

    log('Wavenet training set to a maximum of {} steps'.format(args.wavenet_train_steps))

    # Allocate GPU memory as needed.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    run_init = False

    # Train.
    sess = tf.Session(config=config)
    summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
    sess.run(tf.global_variables_initializer())

    # Restore a saved model if the user requested it (default = True).
    if args.restore:
        try:
            checkpoint_state = tf.train.get_checkpoint_state(save_dir)
            if checkpoint_state and checkpoint_state.model_checkpoint_path:
                log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True)
                load_averaged_model(sess, sh_saver, checkpoint_state.model_checkpoint_path)
            else:
                log('No model to load at {}'.format(save_dir), slack=True)
                if hparams.wavenet_weight_normalization:
                    run_init = True
        except tf.errors.OutOfRangeError as e:
            log('Cannot restore checkpoint: {}'.format(e), slack=True)
    else:
        log('Starting new training!', slack=True)
        if hparams.wavenet_weight_normalization:
            run_init = True

    if run_init:
        log('\nApplying Weight normalization in fresh training. Applying data dependent initialization forward pass..')
        # Create init_model.
        init_model, _ = model_train_mode(args, feeder, hparams, global_step, init=True)

    # Initialize feeder threads.
    feeder.start_threads(sess)

    if run_init:
        # Run one forward pass for model parameter initialization
        # (make a prediction on the init batch).
        _ = sess.run(init_model.tower_y_hat)
        log('Data dependent initialization done. Starting training!')

    # Training loop.
    while not coord.should_stop() and step < args.wavenet_train_steps:
        start_time = time.time()
        step, loss, opt = sess.run([global_step, model.loss, model.optimize])
        time_window.append(time.time() - start_time)
        loss_window.append(loss)

        message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
            step, time_window.average, loss, loss_window.average)
        log(message, end='\r', slack=(step % args.checkpoint_interval == 0))

        if np.isnan(loss) or loss > 100:
            log('Loss exploded to {:.5f} at step {}'.format(loss, step))
            raise Exception('Loss exploded')

        if step % args.summary_interval == 0:
            log('\nWriting summary at step {}'.format(step))
            summary_writer.add_summary(sess.run(stats), step)

        if step % args.checkpoint_interval == 0 or step == args.wavenet_train_steps:
            save_log(sess, step, model, plot_dir, wav_dir, hparams=hparams, model_name='WaveNet')
            save_checkpoint(sess, sh_saver, checkpoint_path, global_step)

        if step % args.eval_interval == 0:
            log('\nEvaluating at step {}'.format(step))
            eval_step(sess, step, eval_model, eval_plot_dir, eval_wav_dir,
                      summary_writer=summary_writer, hparams=model._hparams, model_name='WaveNet')

        if hparams.gin_channels > 0 and (step % args.embedding_interval == 0
                                         or step == args.wavenet_train_steps or step == 1):
            # Get current checkpoint state.
            checkpoint_state = tf.train.get_checkpoint_state(save_dir)

            # Update projector.
            log('\nSaving Model Speaker Embeddings visualization..')
            add_embedding_stats(summary_writer, [model.embedding_table.name],
                                [speaker_embedding_meta], checkpoint_state.model_checkpoint_path)
            log('WaveNet Speaker embeddings have been updated on tensorboard!')

    log('Wavenet training complete after {} global steps'.format(args.wavenet_train_steps), slack=True)

    # Shut down cleanly.
    coord.request_stop()
    coord.wait_for_stop()
    try:
        sess.close()
        tf.reset_default_graph()
    except Exception as e:
        log('Session cleanup failed: {}'.format(e))
    sleep(0.5)
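
# create_shadow_saver()/load_averaged_model() are imported elsewhere. A hedged
# sketch of the assumed idea: checkpoint the exponential-moving-average
# ("shadow") weights under the original variable names, so restoring yields
# the averaged model WaveNet uses at inference. `model.ema` (a
# tf.train.ExponentialMovingAverage) and `model.variables` are assumptions
# about the model object, not confirmed by this section.
def _create_shadow_saver_sketch(model, global_step=None):
    shadow_dict = {model.ema.average_name(v): v for v in model.variables}
    if global_step is not None:
        shadow_dict['global_step'] = global_step
    return tf.train.Saver(shadow_dict, max_to_keep=20)

def _load_averaged_model_sketch(sess, sh_saver, checkpoint_path):
    # Restoring through the shadow saver maps EMA weights onto live variables.
    sh_saver.restore(sess, checkpoint_path)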
def run_synthesis(args, checkpoint, caching_dir, hparams):
    output_dir = get_output_dir(caching_dir)

    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    log_dir = os.path.join(output_dir, 'plots')
    wav_dir = os.path.join(output_dir, 'wavs')

    # We assume the user provides the correct folder depending on the training method.
    log(hparams_debug_string())

    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    # Synthesize audio from the mels produced by the Tacotron eval step.
    evals_dir = get_evals_dir(args.caching_dir)
    metadata_filename = os.path.join(evals_dir, 'map.txt')
    with open(metadata_filename, encoding='utf-8') as f:
        metadata = np.array([line.strip().split('|') for line in f])

    speaker_ids = metadata[:, 2]
    mel_files = metadata[:, 1]
    texts = metadata[:, 0]

    speaker_ids = None if (speaker_ids == '<no_g>').all() else speaker_ids

    log('Starting synthesis! (this will take a while..)')
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)

    # Group inputs into batches.
    mel_files = [mel_files[i:i + hparams.wavenet_synthesis_batch_size]
                 for i in range(0, len(mel_files), hparams.wavenet_synthesis_batch_size)]
    speaker_ids = None if speaker_ids is None else [
        speaker_ids[i:i + hparams.wavenet_synthesis_batch_size]
        for i in range(0, len(speaker_ids), hparams.wavenet_synthesis_batch_size)]
    texts = None if texts is None else [
        texts[i:i + hparams.wavenet_synthesis_batch_size]
        for i in range(0, len(texts), hparams.wavenet_synthesis_batch_size)]

    with open(os.path.join(wav_dir, 'map.txt'), 'w') as file:
        for i, mel_batch in enumerate(tqdm(mel_files)):
            mel_spectros = [np.load(mel_file) for mel_file in mel_batch]
            basenames = [os.path.basename(mel_file).replace('.npy', '') for mel_file in mel_batch]
            speaker_id_batch = None if speaker_ids is None else speaker_ids[i]
            audio_files = synth.synthesize(mel_spectros, speaker_id_batch, basenames, wav_dir, log_dir)

            speaker_logs = ['<no_g>'] * len(mel_batch) if speaker_id_batch is None else speaker_id_batch

            for j, mel_file in enumerate(mel_batch):
                if texts is None:
                    # mel_file|audio_file|speaker
                    file.write('{}|{}|{}\n'.format(mel_file, audio_files[j], speaker_logs[j]))
                else:
                    # text|mel_file|audio_file|speaker
                    file.write('{}|{}|{}|{}\n'.format(texts[i][j], mel_file, audio_files[j], speaker_logs[j]))

    log('synthesized audio waveforms at {}'.format(wav_dir))
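
# Illustrative only: the map.txt written above has 3 fields per line without
# texts (mel|audio|speaker) and 4 with texts (text|mel|audio|speaker). A
# tolerant reader (hypothetical helper, not used here) can key on field count:
def _read_wav_map(path):
    rows = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split('|')
            text = fields[0] if len(fields) == 4 else None
            mel_file, audio_file, speaker = fields[-3:]
            rows.append((text, mel_file, audio_file, speaker))
    return rows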