def eval_model(attention, mel_prediction, target_spectrogram, input_seq, step,
               plot_dir, mel_output_dir, wav_dir, sample_num, loss, hparams):
    # Save some results for evaluation
    attention_path = str(plot_dir.joinpath("attention_step_{}_sample_{}".format(step, sample_num)))
    save_attention(attention, attention_path)

    # Save predicted mel spectrogram to disk (debug)
    mel_output_fpath = mel_output_dir.joinpath("mel-prediction-step-{}_sample_{}.npy".format(step, sample_num))
    np.save(str(mel_output_fpath), mel_prediction, allow_pickle=False)

    # Save Griffin-Lim inverted wav for debug (mel -> wav)
    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
    wav_fpath = wav_dir.joinpath("step-{}-wave-from-mel_sample_{}.wav".format(step, sample_num))
    audio.save_wav(wav, str(wav_fpath), sr=hparams.sample_rate)

    # Save real and predicted mel-spectrogram plot to disk (control purposes)
    spec_fpath = plot_dir.joinpath("step-{}-mel-spectrogram_sample_{}.png".format(step, sample_num))
    title_str = "{}, {}, step={}, loss={:.5f}".format("Tacotron", time_string(), step, loss)
    plot_spectrogram(mel_prediction, str(spec_fpath), title=title_str,
                     target_spectrogram=target_spectrogram,
                     max_len=target_spectrogram.size // hparams.num_mels)
    print("Input at step {}: {}".format(step, sequence_to_text(input_seq)))
def run_mel_strip():
    import numpy as np
    from pathlib import Path
    from matplotlib import pyplot as plt
    from tools.spec_processor import find_endpoint, find_silences
    from synthesizer.audio import inv_mel_spectrogram, save_wav
    from synthesizer.hparams import hparams

    inpath = Path(
        r'E:\lab\zhrtvc\zhrtvc\toolbox\saved_files\mels\wavs-P00173I-001_20170001P00173I0068.wav_1567509749_我家朵朵是世界上最漂亮的朵朵。。知道自己是什么样的人。要做什么。无需活在别人非议或期待里。你勤奋.npy'
    )
    data = np.load(inpath)
    data = data.T
    print(data.shape)

    # Locate silent regions; the slices below assume each entry marks a silent span.
    end_idx = find_silences(data, min_silence_sec=0.5, hop_silence_sec=0.2)
    print(end_idx, len(data))

    out_dir = Path(r'data/syns')
    out_dir.mkdir(parents=True, exist_ok=True)
    for i, (a, b) in enumerate(zip(end_idx[:-1], end_idx[1:]), 1):
        # Invert the mel segment between two silences back to a waveform with Griffin-Lim
        wav = inv_mel_spectrogram(data[a[-1]:b[0]].T, hparams)
        save_wav(wav, str(out_dir.joinpath(f'sil-{i:02d}.wav')), hparams.sample_rate)

    plt.imshow(data.T)
    plt.colorbar()
    plt.show()
def train(log_dir, args, hparams):
    save_dir = os.path.join(log_dir, "taco_pretrained")
    plot_dir = os.path.join(log_dir, "plots")
    wav_dir = os.path.join(log_dir, "wavs")
    mel_dir = os.path.join(log_dir, "mel-spectrograms")
    eval_dir = os.path.join(log_dir, "eval-dir")
    eval_plot_dir = os.path.join(eval_dir, "plots")
    eval_wav_dir = os.path.join(eval_dir, "wavs")
    tensorboard_dir = os.path.join(log_dir, "tacotron_events")
    meta_folder = os.path.join(log_dir, "metas")
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_fpath = os.path.join(save_dir, "tacotron_model.ckpt")
    if hparams.if_use_speaker_classifier:
        metadat_fpath = os.path.join(args.synthesizer_root, "train_augment_speaker.txt")
    else:
        metadat_fpath = os.path.join(args.synthesizer_root, "train.txt")

    log("Checkpoint path: {}".format(checkpoint_fpath))
    log("Loading training data from: {}".format(metadat_fpath))
    log("Using model: Tacotron")
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope("datafeeder") as scope:
        feeder = Feeder(coord, metadat_fpath, hparams)

    # Set up model
    global_step = tf.Variable(0, name="global_step", trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    # Embeddings metadata
    char_embedding_meta = os.path.join(meta_folder, "CharacterEmbeddings.tsv")
    if not os.path.isfile(char_embedding_meta):
        with open(char_embedding_meta, "w", encoding="utf-8") as f:
            for symbol in symbols:
                if symbol == " ":
                    symbol = "\\s"  # For visual purposes, swap space with \s
                f.write("{}\n".format(symbol))
    char_embedding_meta = char_embedding_meta.replace(log_dir, "..")

    # Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)

    log("Tacotron training set to a maximum of {} steps".format(args.tacotron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            # Saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                    if checkpoint_state and checkpoint_state.model_checkpoint_path:
                        log("Loading checkpoint {}".format(checkpoint_state.model_checkpoint_path), slack=True)
                        saver.restore(sess, checkpoint_state.model_checkpoint_path)
                    else:
                        log("No model to load at {}".format(save_dir), slack=True)
                        saver.save(sess, checkpoint_fpath, global_step=global_step)
                except tf.errors.OutOfRangeError as e:
                    log("Cannot restore checkpoint: {}".format(e), slack=True)
            else:
                log("Starting new training!", slack=True)
                saver.save(sess, checkpoint_fpath, global_step=global_step)

            # Initializing feeder
            feeder.start_threads(sess)

            # Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, adversial_loss, opt = sess.run([
                    global_step, model.loss, model.adversial_loss, model.optimize
                ])
                loss -= adversial_loss
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = "Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}, adv_loss={:.5f}]".format(
                    step, time_window.average, loss, loss_window.average, adversial_loss)
                log(message, end="\r", slack=(step % args.checkpoint_interval == 0))
                print(message)

                if loss > 100 or np.isnan(loss):
                    log("Loss exploded to {:.5f} at step {}".format(loss, step))
                    raise Exception("Loss exploded")

                if step % args.summary_interval == 0:
                    log("\nWriting summary at step {}".format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    # Run eval and save eval stats
                    log("\nRunning evaluation at step {}".format(step))

                    eval_losses = []
                    before_losses = []
                    after_losses = []
                    stop_token_losses = []
                    linear_losses = []
                    linear_loss = None
                    adversial_losses = []

                    if hparams.predict_linear:
                        for i in tqdm(range(feeder.test_steps)):
                            eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, \
                                mel_t, t_len, align, lin_p, lin_t = sess.run([
                                    eval_model.tower_loss[0],
                                    eval_model.tower_before_loss[0],
                                    eval_model.tower_after_loss[0],
                                    eval_model.tower_stop_token_loss[0],
                                    eval_model.tower_linear_loss[0],
                                    eval_model.tower_mel_outputs[0][0],
                                    eval_model.tower_mel_targets[0][0],
                                    eval_model.tower_targets_lengths[0][0],
                                    eval_model.tower_alignments[0][0],
                                    eval_model.tower_linear_outputs[0][0],
                                    eval_model.tower_linear_targets[0][0],
                                ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)
                            linear_losses.append(linear_loss)
                        linear_loss = sum(linear_losses) / len(linear_losses)

                        wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
                        audio.save_wav(
                            wav,
                            os.path.join(eval_wav_dir, "step-{}-eval-wave-from-linear.wav".format(step)),
                            sr=hparams.sample_rate)
                    else:
                        for i in tqdm(range(feeder.test_steps)):
                            eloss, before_loss, after_loss, stop_token_loss, adversial_loss, mel_p, mel_t, t_len, \
                                align = sess.run([
                                    eval_model.tower_loss[0],
                                    eval_model.tower_before_loss[0],
                                    eval_model.tower_after_loss[0],
                                    eval_model.tower_stop_token_loss[0],
                                    eval_model.tower_adversial_loss[0],
                                    eval_model.tower_mel_outputs[0][0],
                                    eval_model.tower_mel_targets[0][0],
                                    eval_model.tower_targets_lengths[0][0],
                                    eval_model.tower_alignments[0][0]
                                ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)
                            adversial_losses.append(adversial_loss)

                    eval_loss = sum(eval_losses) / len(eval_losses)
                    before_loss = sum(before_losses) / len(before_losses)
                    after_loss = sum(after_losses) / len(after_losses)
                    stop_token_loss = sum(stop_token_losses) / len(stop_token_losses)
                    adversial_loss = sum(adversial_losses) / len(adversial_losses)

                    log("Saving eval log to {}..".format(eval_dir))
                    # Save some log to monitor model improvement on the same unseen sequence
                    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(eval_wav_dir, "step-{}-eval-wave-from-mel.wav".format(step)),
                        sr=hparams.sample_rate)

                    plot.plot_alignment(
                        align,
                        os.path.join(eval_plot_dir, "step-{}-eval-align.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format("Tacotron", time_string(), step, eval_loss),
                        max_len=t_len // hparams.outputs_per_step)
                    plot.plot_spectrogram(
                        mel_p,
                        os.path.join(eval_plot_dir, "step-{}-eval-mel-spectrogram.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format("Tacotron", time_string(), step, eval_loss),
                        target_spectrogram=mel_t,
                        max_len=t_len)

                    if hparams.predict_linear:
                        plot.plot_spectrogram(
                            lin_p,
                            os.path.join(eval_plot_dir, "step-{}-eval-linear-spectrogram.png".format(step)),
                            title="{}, {}, step={}, loss={:.5f}".format("Tacotron", time_string(), step, eval_loss),
                            target_spectrogram=lin_t,
                            max_len=t_len,
                            auto_aspect=True)

                    log("Eval loss for global step {}: {:.3f}".format(step, eval_loss))
                    log("Writing eval summary!")
                    add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss,
                                   stop_token_loss, adversial_loss, eval_loss)

                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300:
                    # Save model and current global step
                    saver.save(sess, checkpoint_fpath, global_step=global_step)

                    log("\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..")
                    input_seq, mel_prediction, alignment, target, target_length = sess.run([
                        model.tower_inputs[0][0],
                        model.tower_mel_outputs[0][0],
                        model.tower_alignments[0][0],
                        model.tower_mel_targets[0][0],
                        model.tower_targets_lengths[0][0],
                    ])

                    # Save predicted mel spectrogram to disk (debug)
                    mel_filename = "mel-prediction-step-{}.npy".format(step)
                    np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False)

                    # Save griffin-lim inverted wav for debug (mel -> wav)
                    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(wav_dir, "step-{}-wave-from-mel.wav".format(step)),
                        sr=hparams.sample_rate)

                    # Save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir, "step-{}-align.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format("Tacotron", time_string(), step, loss),
                        max_len=target_length // hparams.outputs_per_step)

                    # Save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        mel_prediction,
                        os.path.join(plot_dir, "step-{}-mel-spectrogram.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format("Tacotron", time_string(), step, loss),
                        target_spectrogram=target,
                        max_len=target_length)
                    # log("Input at step {}: {}".format(step, sequence_to_text(input_seq)))

                if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
                    # Get current checkpoint state
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    # Update Projector
                    # log("\nSaving Model Character Embeddings visualization..")
                    # add_embedding_stats(summary_writer, [model.embedding_table.name],
                    #                     [char_embedding_meta],
                    #                     checkpoint_state.model_checkpoint_path)
                    # log("Tacotron Character embeddings have been updated on tensorboard!")

            log("Tacotron training complete after {} global steps!".format(args.tacotron_train_steps), slack=True)
            return save_dir

        except Exception as e:
            log("Exiting due to exception: {}".format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
def griffin_lim(mel):
    """
    Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been
    built with the same parameters present in hparams.py.
    """
    return audio.inv_mel_spectrogram(mel, hparams)
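# A minimal, hypothetical usage sketch (not part of the original repo): it loads a mel
# spectrogram saved as .npy by one of the training loops above, runs it through the griffin_lim
# helper, and writes the reconstructed waveform next to it. The file name is a placeholder, the
# array is assumed to already be in the (num_mels, frames) layout that audio.inv_mel_spectrogram
# expects, and `np`, `audio`, and `hparams` are assumed to be the module-level imports used above.
def demo_griffin_lim_roundtrip(mel_fpath="mel-prediction-step-1000.npy"):
    mel = np.load(mel_fpath)       # mel saved earlier with np.save
    wav = griffin_lim(mel)         # delegates to audio.inv_mel_spectrogram
    audio.save_wav(wav, mel_fpath.replace(".npy", "-gl.wav"), sr=hparams.sample_rate)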
def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames, embed_filenames):
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(",")]

    assert 0 == len(texts) % self._hparams.tacotron_num_gpus
    seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
    input_lengths = [len(seq) for seq in seqs]
    size_per_device = len(seqs) // self._hparams.tacotron_num_gpus

    # Pad inputs according to each GPU max length
    input_seqs = None
    split_infos = []
    for i in range(self._hparams.tacotron_num_gpus):
        device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
        device_input, max_seq_len = self._prepare_inputs(device_input)
        input_seqs = np.concatenate((input_seqs, device_input), axis=1) \
            if input_seqs is not None else device_input
        split_infos.append([max_seq_len, 0, 0, 0])

    feed_dict = {
        self.inputs: input_seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
    }

    if self.gta:
        np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
        target_lengths = [len(np_target) for np_target in np_targets]

        # Pad targets according to each GPU max length
        target_seqs = None
        for i in range(self._hparams.tacotron_num_gpus):
            device_target = np_targets[size_per_device * i:size_per_device * (i + 1)]
            device_target, max_target_len = self._prepare_targets(device_target, self._hparams.outputs_per_step)
            target_seqs = np.concatenate((target_seqs, device_target), axis=1) \
                if target_seqs is not None else device_target
            # Not really used, but set in case of future development
            split_infos[i][1] = max_target_len

        feed_dict[self.targets] = target_seqs
        assert len(np_targets) == len(texts)

    feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)
    feed_dict[self.speaker_embeddings] = [np.load(f) for f in embed_filenames]

    if self.gta or not hparams.predict_linear:
        mels, alignments, stop_tokens = self.session.run(
            [self.mel_outputs, self.alignments, self.stop_token_prediction],
            feed_dict=feed_dict)

        # Linearize outputs (1D arrays)
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
        stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]

        if not self.gta:
            # Natural batch synthesis
            # Get mel lengths for the entire batch from stop_tokens predictions
            target_lengths = self._get_output_lengths(stop_tokens)

        # Take off the batch-wise padding
        mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
        assert len(mels) == len(texts)
    else:
        linears, mels, alignments, stop_tokens = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignments, self.stop_token_prediction],
            feed_dict=feed_dict)

        # Linearize outputs (1D arrays)
        linears = [linear for gpu_linear in linears for linear in gpu_linear]
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
        stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]

        # Natural batch synthesis
        # Get mel/linear lengths for the entire batch from stop_tokens predictions
        # target_lengths = self._get_output_lengths(stop_tokens)
        target_lengths = [9999]

        # Take off the batch-wise padding
        mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
        linears = [linear[:target_length, :] for linear, target_length in zip(linears, target_lengths)]
        assert len(mels) == len(linears) == len(texts)

    if basenames is None:
        raise NotImplementedError()

    saved_mels_paths = []
    for i, mel in enumerate(mels):
        # Write the spectrogram to disk
        # Note: output mel-spectrogram files and target ones have the same names, just different folders
        mel_filename = os.path.join(out_dir, "mel-{}.npy".format(basenames[i]))
        np.save(mel_filename, mel, allow_pickle=False)
        saved_mels_paths.append(mel_filename)

        if log_dir is not None:
            # Save wav (mel -> wav)
            wav = audio.inv_mel_spectrogram(mel.T, hparams)
            audio.save_wav(wav,
                           os.path.join(log_dir, "wavs/wav-{}-mel.wav".format(basenames[i])),
                           sr=hparams.sample_rate)

            # Save alignments
            plot.plot_alignment(alignments[i],
                                os.path.join(log_dir, "plots/alignment-{}.png".format(basenames[i])),
                                title="{}".format(texts[i]),
                                split_title=True,
                                max_len=target_lengths[i])

            # Save mel spectrogram plot
            plot.plot_spectrogram(mel,
                                  os.path.join(log_dir, "plots/mel-{}.png".format(basenames[i])),
                                  title="{}".format(texts[i]),
                                  split_title=True)

            if hparams.predict_linear:
                # Save wav (linear -> wav)
                wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                audio.save_wav(wav,
                               os.path.join(log_dir, "wavs/wav-{}-linear.wav".format(basenames[i])),
                               sr=hparams.sample_rate)

                # Save linear spectrogram plot
                plot.plot_spectrogram(linears[i],
                                      os.path.join(log_dir, "plots/linear-{}.png".format(basenames[i])),
                                      title="{}".format(texts[i]),
                                      split_title=True,
                                      auto_aspect=True)

    return saved_mels_paths
# The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
# can concatenate the mel spectrograms into a single one.
mel = np.concatenate(mels, axis=1)

# The vocoder can take a callback function to display the generation. More on that later. For
# now we'll simply hide it like this:
no_action = lambda *args: None

print("\tTesting the vocoder...")
# For the sake of making this test short, we'll pass a short target length. The target length
# is the length of the wav segments that are processed in parallel. E.g. for audio sampled
# at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
# 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
# that has a detrimental effect on the quality of the audio. The default parameters are
# recommended in general.
# vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
generated_wav = audio.inv_mel_spectrogram(mel, hparams.hparams)

print("All tests passed! You can now synthesize speech.\n\n")

## Interactive speech generation
print("This is a GUI-less example of interface to SV2TTS. The purpose of this script is to "
      "show how you can interface this project easily with your own. See the source code for "
      "an explanation of what is happening.\n")
print("Interactive generation loop")
num_generated = 0
while True:
    try:
        # Get the reference audio filepath
        message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
                  "wav, m4a, flac, ...):\n"
def train(log_dir, args, hparams):
    save_dir = os.path.join(log_dir, "taco_pretrained")
    plot_dir = os.path.join(log_dir, "plots")
    wav_dir = os.path.join(log_dir, "wavs")
    mel_dir = os.path.join(log_dir, "mel-spectrograms")
    eval_dir = os.path.join(log_dir, "eval-dir")
    eval_plot_dir = os.path.join(eval_dir, "plots")
    eval_wav_dir = os.path.join(eval_dir, "wavs")
    tensorboard_dir = os.path.join(log_dir, "tacotron_events")
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)

    checkpoint_fpath = os.path.join(save_dir, "tacotron_model.ckpt")
    log("Checkpoint path: {}".format(checkpoint_fpath))
    log("Using model: Tacotron")
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope("datafeeder") as scope:
        feeder = Feeder(coord, hparams)

    # Set up model
    global_step = tf.Variable(0, name="global_step", trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    # eval_model = model_test_mode(args, feeder, hparams, global_step)

    # Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=2)

    log("Tacotron training set to a maximum of {} steps".format(args.tacotron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            # Saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                    if checkpoint_state and checkpoint_state.model_checkpoint_path:
                        log("Loading checkpoint {}".format(checkpoint_state.model_checkpoint_path), slack=True)
                        saver.restore(sess, checkpoint_state.model_checkpoint_path)
                    else:
                        log("No model to load at {}".format(save_dir), slack=True)
                        saver.save(sess, checkpoint_fpath, global_step=global_step)
                except tf.errors.OutOfRangeError as e:
                    log("Cannot restore checkpoint: {}".format(e), slack=True)
            else:
                log("Starting new training!", slack=True)
                saver.save(sess, checkpoint_fpath, global_step=global_step)

            # Initializing feeder
            feeder.start_threads(sess)
            print("Feeder is initialized and model is ready to train.......")

            # Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, opt = sess.run([global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = "Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]".format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end="\r", slack=(step % args.checkpoint_interval == 0))
                print(message)

                if loss > 100 or np.isnan(loss):
                    log("Loss exploded to {:.5f} at step {}".format(loss, step))
                    raise Exception("Loss exploded")

                if step % args.summary_interval == 0:
                    log("\nWriting summary at step {}".format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    pass

                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300:
                    # Save model and current global step
                    saver.save(sess, checkpoint_fpath, global_step=global_step)

                    log("\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..")
                    input_seq, mel_prediction, alignment, target, target_length = sess.run([
                        model.tower_inputs[0][0],
                        model.tower_mel_outputs[0][0],
                        model.tower_alignments[0][0],
                        model.tower_mel_targets[0][0],
                        model.tower_targets_lengths[0][0],
                    ])

                    # Save predicted mel spectrogram to disk (debug)
                    mel_filename = "mel-prediction-step-{}.npy".format(step)
                    np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False)

                    # Save griffin-lim inverted wav for debug (mel -> wav)
                    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(wav_dir, "step-{}-wave-from-mel.wav".format(step)),
                        sr=hparams.sample_rate)

                    # Save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir, "step-{}-align.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format("Tacotron", time_string(), step, loss),
                        max_len=target_length // hparams.outputs_per_step)

                    # Save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        mel_prediction,
                        os.path.join(plot_dir, "step-{}-mel-spectrogram.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format("Tacotron", time_string(), step, loss),
                        target_spectrogram=target,
                        max_len=target_length)

                if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
                    # Get current checkpoint state
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

            log("Tacotron training complete after {} global steps!".format(args.tacotron_train_steps), slack=True)
            return save_dir

        except Exception as e:
            log("Exiting due to exception: {}".format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
def griffin_lim(mel):
    return audio.inv_mel_spectrogram(mel, hparams)
def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_truth: bool,
          save_every: int, backup_every: int, force_restart: bool):
    # Check to make sure the hop length is correctly factorised
    # assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    # Instantiate the model
    print("Initializing the model...")
    # model = WaveRNN(
    #     rnn_dims=hp.voc_rnn_dims,
    #     fc_dims=hp.voc_fc_dims,
    #     bits=hp.bits,
    #     pad=hp.voc_pad,
    #     upsample_factors=hp.voc_upsample_factors,
    #     feat_dims=hp.num_mels,
    #     compute_dims=hp.voc_compute_dims,
    #     res_out_dims=hp.voc_res_out_dims,
    #     res_blocks=hp.voc_res_blocks,
    #     hop_length=hp.hop_length,
    #     sample_rate=hp.sample_rate,
    #     mode=hp.voc_mode
    # ).cuda()
    model = model_VC(32, 256, 512, 32).cuda()

    # Initialize the optimizer
    optimizer = optim.Adam(model.parameters())
    for p in optimizer.param_groups:
        p["lr"] = hp.voc_lr
    loss_recon = nn.MSELoss()
    loss_content = nn.L1Loss()

    # Load the weights
    model_dir = models_dir.joinpath(run_id)
    model_dir.mkdir(exist_ok=True)
    weights_fpath = model_dir.joinpath(run_id + ".pt")
    if force_restart or not weights_fpath.exists():
        print("\nStarting the training of AutoVC from scratch\n")
        model.save(weights_fpath, optimizer)
    else:
        print("\nLoading weights at %s" % weights_fpath)
        model.load(weights_fpath, optimizer)
        print("AutoVC weights loaded from step %d" % model.step)

    # Initialize the dataset
    metadata_fpath = syn_dir.joinpath("train.txt") if ground_truth else \
        voc_dir.joinpath("synthesized.txt")
    mel_dir = syn_dir.joinpath("mels") if ground_truth else voc_dir.joinpath("mels_gta")
    wav_dir = syn_dir.joinpath("audio")
    # 2019.11.26
    embed_dir = syn_dir.joinpath("embeds")
    dataset = VocoderDataset(metadata_fpath, mel_dir, wav_dir, embed_dir)
    test_loader = DataLoader(dataset, batch_size=1, shuffle=True, pin_memory=True)

    # Begin the training
    simple_table([('Batch size', hp.voc_batch_size),
                  ('LR', hp.voc_lr),
                  ('Sequence Len', hp.voc_seq_len)])

    for epoch in range(1, 350):
        model.train()
        data_loader = DataLoader(dataset,
                                 collate_fn=collate_vocoder,
                                 batch_size=hp.voc_batch_size,
                                 num_workers=2,
                                 shuffle=True,
                                 pin_memory=True)
        start = time.time()
        running_loss = 0.

        for i, (m, e, _) in enumerate(data_loader, 1):
            model.train()
            m, e = m.cuda(), e.cuda()

            # Forward pass. Reference shapes:
            #   c_org: (batch, 256, 1), x: (batch, 80, frames), c_org_expand: (batch, 256, frames)
            #   encoder_outputs: (batch, frames, 320), C: (batch, frames, 64), X: (batch, 1, frames, 80)
            C, X_C, X_before, X_after, _ = model(m, e, e)
            X_after = X_after.squeeze(1).permute(0, 2, 1)
            X_before = X_before.squeeze(1).permute(0, 2, 1)

            # Backward pass: reconstruction losses before/after the postnet plus the content loss
            loss_rec_before = loss_recon(X_before, m)
            loss_rec_after = loss_recon(X_after, m)
            loss_c = loss_content(C, X_C)
            loss = loss_rec_before + loss_rec_after + loss_c

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            speed = i / (time.time() - start)
            avg_loss = running_loss / i

            step = model.get_step()
            if hp.decay_learning_rate:
                # Note: `p` is the (single) param group left over from the loop above
                p["lr"] = _learning_rate_decay(p["lr"], step)
            k = step // 1000

            if step % 100 == 0 and step != 0:
                # Periodically invert a generated mel with Griffin-Lim and plot generated vs. original
                model.eval()
                plt.figure(1)
                C, X_C, X_before, X_after, _ = model(m, e, e)
                X_after = X_after.squeeze(1).permute(0, 2, 1)
                mel_out = X_after.detach().cpu().numpy()

                from synthesizer import audio
                from synthesizer.hparams import hparams
                wav = audio.inv_mel_spectrogram(mel_out[0, :, :], hparams)
                librosa.output.write_wav("out.wav", np.float32(wav), hparams.sample_rate)

                mel_out = mel_out[0, :, :].transpose(1, 0)
                plt.imshow(mel_out.T, interpolation='nearest', aspect='auto')
                plt.title("Generated Spectrogram")
                save_path = model_dir
                p_path = save_path.joinpath("generate.png")
                plt.savefig(p_path)

                plt.figure(2)
                m_out = m.detach().cpu().numpy()
                m_out = m_out[0, :, :].transpose(1, 0)
                plt.imshow(m_out.T, interpolation='nearest', aspect='auto')
                plt.title("Original Spectrogram")
                o_path = save_path.joinpath("original.png")
                plt.savefig(o_path)

            if backup_every != 0 and step % backup_every == 0:
                model.checkpoint(model_dir, optimizer)
            if save_every != 0 and step % save_every == 0:
                model.save(weights_fpath, optimizer)
                torch.save(model, "model_ttsdb_48_48.pkl")

            msg = f"| Epoch: {epoch} ({i}/{len(data_loader)}) | " \
                  f"Loss: {avg_loss:.4f} | {speed:.1f} " \
                  f"steps/s | Step: {k}k | "
            stream(msg)

        # gen_testset(model, test_loader, hp.voc_gen_at_checkpoint, hp.voc_gen_batched,
        #             hp.voc_target, model_dir)
        print("")
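# Hypothetical sketch only: the _learning_rate_decay helper referenced in the loop above is not
# shown here, so this illustrates one common schedule (Noam-style warmup followed by 1/sqrt(step)
# decay with a floor) as found in similar Tacotron codebases. The constants are placeholders, not
# the repo's actual values, and the real helper may use the initial rather than the current lr.
def _learning_rate_decay(lr, step, warmup_steps=4000, min_lr=1e-5):
    step = max(step, 1)
    decayed = lr * (warmup_steps ** 0.5) * min(step * warmup_steps ** -1.5, step ** -0.5)
    return max(decayed, min_lr)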
mel = torch.from_numpy(mel[None, ...])
embedding_tr = embedding_tr[np.newaxis, :, np.newaxis]
embedding_tr = torch.tensor(embedding_tr)
mel, e1, embedding_tr = mel.cuda(), e1.cuda(), embedding_tr.cuda()
# print("mel shape:", mel.shape)
# print("e1 shape:", e1.shape)
# print("e2 shape:", e2.shape)

C, X_C, X_before, X_after, _ = model(mel, e1, embedding_tr)
mel_out = X_after.detach().cpu().numpy()
# print("mel_out shape:", mel_out.shape)

if use_wavrnn:
    wav = vocoder_wavrnn.infer_waveform(mel_out[0, 0, :, :].T)
else:
    wav = audio.inv_mel_spectrogram(mel_out[0, 0, :, :].T, hparams)

# Resample the generated waveform from 16 kHz to 24 kHz before writing it out
wav = librosa.resample(wav, 16000, 24000)

out_dir = "/data/VCTK/out_v5/vcc2020-teams:00004/"
if not os.path.exists(out_dir):
    os.mkdir(out_dir)
fname = t + "_" + s + "_" + name[:-4] + ".wav"
out_dir_fpath = out_dir + "/" + fname
librosa.output.write_wav(out_dir_fpath, wav.astype(np.float32), 24000)
print("write: {}".format(fname))