示例#1
0
    def load_pd(self, pd_file_path):
        """Load a frozen Tacotron graph (.pb) for inference and cache handles
        to its input/output tensors on this synthesizer instance.

        Args:
            pd_file_path: Path to the frozen, optimized GraphDef file, e.g.
                os.path.join(input_dir, 'optimized_frozen_tacotron_o.pb').
        """
        # Memory allocation on the GPU as needed; allow TF to fall back to CPU
        # placement when a GPU kernel is unavailable.
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        self.sess = tf.Session(config=config)
        # NOTE(review): runs before the graph is imported, so there are no
        # variables yet -- presumably a harmless no-op since a frozen graph
        # holds constants, not variables; confirm.
        self.sess.run(tf.global_variables_initializer())

        # Deserialize the frozen graph and merge it into the default graph with
        # no name prefix (name='') so tensor names below match the originals.
        graph_def = tf.GraphDef()
        with gfile.FastGFile(pd_file_path, 'rb') as fin:
            graph_def.ParseFromString(fin.read())
            tf.import_graph_def(graph_def, name='')
        self._hp = hparams

        # Input sequences are padded with the <pad_token> 0 ( _ ).
        self._pad = 0
        if self._hp.GL_on_GPU:
            # Optional GPU Griffin-Lim inversion graph (linear spec -> waveform).
            # self.GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, self._hp.num_mels), name='GLGPU_mel_inputs')
            self.GLGPU_lin_inputs = tf.placeholder(tf.float32,
                                                   (None, self._hp.num_freq),
                                                   name='GLGPU_lin_inputs')
            # self.GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(self.GLGPU_mel_inputs, self._hp)
            self.GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(
                self.GLGPU_lin_inputs, self._hp)

        # Tensor names are baked into the frozen graph at export time, so they
        # must match the exporting model's node names exactly.
        output_node_name = 'Tacotron_model/inference/cbhg_linear_specs_projection/projection_cbhg_linear_specs_projection/BiasAdd:0'
        self.linear_outputs = self.sess.graph.get_tensor_by_name(
            output_node_name)

        self.inputs = self.sess.graph.get_tensor_by_name('inputs:0')
        self.input_lengths = self.sess.graph.get_tensor_by_name(
            'input_lengths:0')
        self.split_infos = self.sess.graph.get_tensor_by_name('split_infos:0')
        # Splits the batched inputs per device using split_infos[:, 0]
        # (assumes split_func is a Python helper suited to tf.py_func -- TODO confirm).
        self.p_inputs = tf.py_func(split_func,
                                   [self.inputs, self.split_infos[:, 0]],
                                   tf.int32)
示例#2
0
	def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
		"""Build the Tacotron inference graph and restore weights from a checkpoint.

		Args:
			checkpoint_path: Path of the TF checkpoint to restore.
			hparams: Hyper-parameter object (num_mels, num_freq,
				tacotron_num_gpus, predict_linear, GL_on_GPU, ...).
			gta: If True, build in Ground-Truth-Aligned mode (mel targets fed in).
			model_name: Name passed to create_model (default 'Tacotron').
		"""
		log('Constructing model: %s' % model_name)
		#Force the batch size to be known in order to use attention masking in batch synthesis
		inputs = tf.placeholder(tf.int32, (None, None), name='inputs')
		#BUGFIX: the shape was written as (None), which is just None (not a
		#tuple) and declared a fully-unknown shape; (None,) declares the
		#intended rank-1 (one length per batch element) shape.
		input_lengths = tf.placeholder(tf.int32, (None,), name='input_lengths')
		targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name='mel_targets')
		split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name='split_infos')
		with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope:
			self.model = create_model(model_name, hparams)
			if gta:
				self.model.initialize(inputs, input_lengths, targets, gta=gta, split_infos=split_infos)
			else:
				self.model.initialize(inputs, input_lengths, split_infos=split_infos)

			self.mel_outputs = self.model.tower_mel_outputs
			#Linear outputs only exist in natural synthesis mode with predict_linear enabled
			self.linear_outputs = self.model.tower_linear_outputs if (hparams.predict_linear and not gta) else None
			self.alignments = self.model.tower_alignments
			self.stop_token_prediction = self.model.tower_stop_token_prediction

		if hparams.GL_on_GPU:
			#Griffin-Lim inversion graphs on GPU (mel/linear spectrogram -> waveform)
			self.GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels), name='GLGPU_mel_inputs')
			self.GLGPU_lin_inputs = tf.placeholder(tf.float32, (None, hparams.num_freq), name='GLGPU_lin_inputs')

			self.GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(self.GLGPU_mel_inputs, hparams)
			self.GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(self.GLGPU_lin_inputs, hparams)

		self.gta = gta
		self._hparams = hparams
		#pad input sequences with the <pad_token> 0 ( _ )
		self._pad = 0
		#explicitly setting the padding to a value that doesn't originally exist in the spectrogram
		#to avoid any possible conflicts, without affecting the output range of the model too much
		if hparams.symmetric_mels:
			self._target_pad = -hparams.max_abs_value
		else:
			self._target_pad = 0.

		self.inputs = inputs
		self.input_lengths = input_lengths
		#previously assigned twice (also inside the variable scope); once is enough
		self.targets = targets
		self.split_infos = split_infos

		log('Loading checkpoint: %s' % checkpoint_path)
		#Memory allocation on the GPUs as needed
		config = tf.ConfigProto()
		config.gpu_options.allow_growth = True
		config.allow_soft_placement = True

		self.session = tf.Session(config=config)
		self.session.run(tf.global_variables_initializer())

		saver = tf.train.Saver()
		saver.restore(self.session, checkpoint_path)
示例#3
0
    def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron2', freezer=False):
        """Build the Tacotron2 inference graph and restore weights from a checkpoint.

        Args:
            checkpoint_path: Checkpoint file, or (when freezer=True) a
                checkpoint *directory* resolved via get_checkpoint_state.
            hparams: Hyper-parameter object (num_mels, predict_linear, ...).
            gta: If True, build in Ground-Truth-Aligned mode (mel targets fed in).
            model_name: Model name used for logging only.
            freezer: If True, prepare graph outputs for freezing (single-batch
                alignments and a GPU Griffin-Lim waveform output).

        Raises:
            RuntimeError: If freezer=True and no checkpoint state is found.
        """
        log('Constructing model: %s' % model_name)
        if freezer:
            #BUGFIX: a bare `except:` also swallowed SystemExit/KeyboardInterrupt
            #and discarded the original error; narrow it and chain the cause.
            try:
                checkpoint_path = tf.train.get_checkpoint_state(checkpoint_path).model_checkpoint_path
            except Exception as e:
                raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint_path)) from e
        # Force the batch size to be known in order to use attention masking in batch synthesis
        inputs = tf.placeholder(tf.int32, (None, None), name='inputs')
        input_lengths = tf.placeholder(tf.int32, (None,), name='input_lengths')
        targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name='mel_targets')
        with tf.variable_scope('model', reuse=tf.AUTO_REUSE):
            self.model = create_model(hparams)
            if gta:
                self.model.initialize(inputs, input_lengths, targets, gta=gta)
            else:
                self.model.initialize(inputs, input_lengths)

            self.mel_outputs = self.model.mel_outputs
            # Linear outputs only exist in natural synthesis mode with predict_linear enabled
            self.linear_outputs = self.model.linear_outputs if (hparams.predict_linear and not gta) else None
            if freezer:
                self.alignments = tf.identity(self.model.alignments, name="alignments")[0]
                self.linear_outputs = inv_linear_spectrogram_tensorflow(self.model.linear_outputs[0], hparams=hparams)
            else:
                self.alignments = tf.identity(self.model.alignments, name="alignments")
            self.stop_token_prediction = self.model.stop_token_prediction

        self.gta = gta
        self._hparams = hparams
        # pad input sequences with the <pad_token> 0 ( _ )
        self._pad = 0
        # explicitly setting the padding to a value that doesn't originally exist in the spectrogram
        # to avoid any possible conflicts, without affecting the output range of the model too much
        if hparams.symmetric_mels:
            self._target_pad = -hparams.max_abs_value
        else:
            self._target_pad = 0.

        self.inputs = inputs
        self.input_lengths = input_lengths
        # previously assigned twice (also inside the variable scope); once is enough
        self.targets = targets

        log('Loading checkpoint: %s' % checkpoint_path)
        # Memory allocation on the GPUs as needed
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        self.session = tf.Session(config=config)
        self.session.run(tf.global_variables_initializer())

        saver = tf.train.Saver()
        saver.restore(self.session, checkpoint_path)
示例#4
0
def train(log_dir, args, hparams):
    """Run the Tacotron training loop.

    Builds the data feeder, train/eval models and summary writers, optionally
    restores a previous checkpoint, then trains for up to
    args.tacotron_train_steps steps, periodically writing summaries, running
    evaluation, saving checkpoints with debug artifacts (plots, wavs, npy
    spectrograms) and updating character-embedding projector metadata.

    Args:
        log_dir: Root output directory; subfolders are created beneath it.
        args: Parsed CLI args (base_dir, tacotron_input, model, restore,
            *_interval, tacotron_train_steps, ...).
        hparams: Hyper-parameter object.

    Returns:
        save_dir (the checkpoint directory) on normal completion; implicitly
        None if an exception aborts training (the error is logged and the
        coordinator is stopped instead of re-raising).
    """
    save_dir = os.path.join(log_dir, 'taco_pretrained')
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    mel_dir = os.path.join(log_dir, 'mel-spectrograms')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    tensorboard_dir = os.path.join(log_dir, 'tacotron_events')
    meta_folder = os.path.join(log_dir, 'metas')
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
    input_path = os.path.join(args.base_dir, args.tacotron_input)

    if hparams.predict_linear:
        linear_dir = os.path.join(log_dir, 'linear-spectrograms')
        os.makedirs(linear_dir, exist_ok=True)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    #Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    #Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, input_path, hparams)

    #Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    #Embeddings metadata (one symbol per line, for the TensorBoard projector)
    char_embedding_meta = os.path.join(meta_folder, 'CharacterEmbeddings.tsv')
    if not os.path.isfile(char_embedding_meta):
        with open(char_embedding_meta, 'w', encoding='utf-8') as f:
            for symbol in symbols:
                if symbol == ' ':
                    symbol = '\\s'  #For visual purposes, swap space with \s

                f.write('{}\n'.format(symbol))

    #Make the metadata path relative so TensorBoard resolves it from log_dir
    char_embedding_meta = char_embedding_meta.replace(log_dir, '..')

    #Potential Griffin-Lim GPU setup (mel/linear spectrogram -> waveform graphs)
    if hparams.GL_on_GPU:
        GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels),
                                          name='GLGPU_mel_inputs')
        GLGPU_lin_inputs = tf.placeholder(tf.float32, (None, hparams.num_freq),
                                          name='GLGPU_lin_inputs')

        GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(
            GLGPU_mel_inputs, hparams)
        GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(
            GLGPU_lin_inputs, hparams)

    #Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=20)

    log('Tacotron training set to a maximum of {} steps'.format(
        args.tacotron_train_steps))

    #Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    #Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)

            sess.run(tf.global_variables_initializer())

            #saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    if (checkpoint_state
                            and checkpoint_state.model_checkpoint_path):
                        log('Loading checkpoint {}'.format(
                            checkpoint_state.model_checkpoint_path),
                            slack=True)
                        # NOTE(review): ckpt/variables are loaded but only used
                        # by the commented-out partial-restore experiment below.
                        ckpt = tf.train.load_checkpoint(
                            checkpoint_state.model_checkpoint_path)
                        variables = list(
                            ckpt.get_variable_to_shape_map().keys())
                        #print('=====================PRINTING VARS===============================')
                        #print(variables)
                        #drop_source_layers = ['Tacotron_model/inference/inputs_embedding','Tacotron_model/Tacotron_model/inference/inputs_embedding/Adam_1','Tacotron_model/Tacotron_model/inference/inputs_embedding/Adam']
                        #for v in tf.global_variables():
                        #	if not any(layer in v.op.name for layer in drop_source_layers):
                        #		print('Loading', v.op.name)
                        #		v.load(ckpt.get_tensor(v.op.name), session=sess)

                        # Initialize all variables needed for DS, but not loaded from ckpt
                        #init_op = tf.variables_initializer([v for v in tf.global_variables() if any(layer in v.op.name for layer in drop_source_layers)])
                        #sess.run(init_op)
                        saver.restore(sess,
                                      checkpoint_state.model_checkpoint_path)

                    else:
                        log('No model to load at {}'.format(save_dir),
                            slack=True)
                        saver.save(sess,
                                   checkpoint_path,
                                   global_step=global_step)

                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e), slack=True)
            else:
                log('Starting new training!', slack=True)
                saver.save(sess, checkpoint_path, global_step=global_step)

            #initializing feeder (starts the background enqueue threads)
            feeder.start_threads(sess)

            #Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, opt = sess.run(
                    [global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message,
                    end='\r',
                    slack=(step % args.checkpoint_interval == 0))

                if np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(
                        loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    #Run eval and save eval stats
                    log('\nRunning evaluation at step {}'.format(step))

                    eval_losses = []
                    before_losses = []
                    after_losses = []
                    stop_token_losses = []
                    linear_losses = []
                    linear_loss = None

                    #Only the last batch's sample tensors (mel_p, align, ...)
                    #are kept after the loop and used for the debug artifacts.
                    if hparams.predict_linear:
                        for i in tqdm(range(feeder.test_steps)):
                            eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p, lin_t = sess.run(
                                [
                                    eval_model.tower_loss[0],
                                    eval_model.tower_before_loss[0],
                                    eval_model.tower_after_loss[0],
                                    eval_model.tower_stop_token_loss[0],
                                    eval_model.tower_linear_loss[0],
                                    eval_model.tower_mel_outputs[0][0],
                                    eval_model.tower_mel_targets[0][0],
                                    eval_model.tower_targets_lengths[0][0],
                                    eval_model.tower_alignments[0][0],
                                    eval_model.tower_linear_outputs[0][0],
                                    eval_model.tower_linear_targets[0][0],
                                ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)
                            linear_losses.append(linear_loss)
                        linear_loss = sum(linear_losses) / len(linear_losses)

                        if hparams.GL_on_GPU:
                            wav = sess.run(GLGPU_lin_outputs,
                                           feed_dict={GLGPU_lin_inputs: lin_p})
                            wav = audio.inv_preemphasis(
                                wav, hparams.preemphasis, hparams.preemphasize)
                        else:
                            wav = audio.inv_linear_spectrogram(
                                lin_p.T, hparams)
                        audio.save_wav(
                            wav,
                            os.path.join(
                                eval_wav_dir,
                                'step-{}-eval-wave-from-linear.wav'.format(
                                    step)),
                            sr=hparams.sample_rate)

                    else:
                        for i in tqdm(range(feeder.test_steps)):
                            eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, align = sess.run(
                                [
                                    eval_model.tower_loss[0],
                                    eval_model.tower_before_loss[0],
                                    eval_model.tower_after_loss[0],
                                    eval_model.tower_stop_token_loss[0],
                                    eval_model.tower_mel_outputs[0][0],
                                    eval_model.tower_mel_targets[0][0],
                                    eval_model.tower_targets_lengths[0][0],
                                    eval_model.tower_alignments[0][0]
                                ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)

                    eval_loss = sum(eval_losses) / len(eval_losses)
                    before_loss = sum(before_losses) / len(before_losses)
                    after_loss = sum(after_losses) / len(after_losses)
                    stop_token_loss = sum(stop_token_losses) / len(
                        stop_token_losses)

                    log('Saving eval log to {}..'.format(eval_dir))
                    #Save some log to monitor model improvement on same unseen sequence
                    if hparams.GL_on_GPU:
                        wav = sess.run(GLGPU_mel_outputs,
                                       feed_dict={GLGPU_mel_inputs: mel_p})
                        wav = audio.inv_preemphasis(wav, hparams.preemphasis,
                                                    hparams.preemphasize)
                    else:
                        wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(
                            eval_wav_dir,
                            'step-{}-eval-wave-from-mel.wav'.format(step)),
                        sr=hparams.sample_rate)

                    plot.plot_alignment(
                        align,
                        os.path.join(eval_plot_dir,
                                     'step-{}-eval-align.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, eval_loss),
                        max_len=t_len // hparams.outputs_per_step)
                    plot.plot_spectrogram(
                        mel_p,
                        os.path.join(
                            eval_plot_dir,
                            'step-{}-eval-mel-spectrogram.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, eval_loss),
                        target_spectrogram=mel_t,
                        max_len=t_len)

                    if hparams.predict_linear:
                        plot.plot_spectrogram(
                            lin_p,
                            os.path.join(
                                eval_plot_dir,
                                'step-{}-eval-linear-spectrogram.png'.format(
                                    step)),
                            title='{}, {}, step={}, loss={:.5f}'.format(
                                args.model, time_string(), step, eval_loss),
                            target_spectrogram=lin_t,
                            max_len=t_len,
                            auto_aspect=True)

                    log('Eval loss for global step {}: {:.3f}'.format(
                        step, eval_loss))
                    log('Writing eval summary!')
                    add_eval_stats(summary_writer, step, linear_loss,
                                   before_loss, after_loss, stop_token_loss,
                                   eval_loss)

                #step == 300 forces one early checkpoint shortly after start,
                #presumably for quick sanity checking -- confirm before removing
                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300:
                    #Save model and current global step
                    saver.save(sess, checkpoint_path, global_step=global_step)

                    log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..'
                        )
                    if hparams.predict_linear:
                        input_seq, mel_prediction, linear_prediction, alignment, target, target_length, linear_target = sess.run(
                            [
                                model.tower_inputs[0][0],
                                model.tower_mel_outputs[0][0],
                                model.tower_linear_outputs[0][0],
                                model.tower_alignments[0][0],
                                model.tower_mel_targets[0][0],
                                model.tower_targets_lengths[0][0],
                                model.tower_linear_targets[0][0],
                            ])

                        #save predicted linear spectrogram to disk (debug)
                        linear_filename = 'linear-prediction-step-{}.npy'.format(
                            step)
                        np.save(os.path.join(linear_dir, linear_filename),
                                linear_prediction.T,
                                allow_pickle=False)

                        #save griffin lim inverted wav for debug (linear -> wav)
                        if hparams.GL_on_GPU:
                            wav = sess.run(GLGPU_lin_outputs,
                                           feed_dict={
                                               GLGPU_lin_inputs:
                                               linear_prediction
                                           })
                            wav = audio.inv_preemphasis(
                                wav, hparams.preemphasis, hparams.preemphasize)
                        else:
                            wav = audio.inv_linear_spectrogram(
                                linear_prediction.T, hparams)
                        audio.save_wav(
                            wav,
                            os.path.join(
                                wav_dir,
                                'step-{}-wave-from-linear.wav'.format(step)),
                            sr=hparams.sample_rate)

                        #Save real and predicted linear-spectrogram plot to disk (control purposes)
                        plot.plot_spectrogram(
                            linear_prediction,
                            os.path.join(
                                plot_dir,
                                'step-{}-linear-spectrogram.png'.format(step)),
                            title='{}, {}, step={}, loss={:.5f}'.format(
                                args.model, time_string(), step, loss),
                            target_spectrogram=linear_target,
                            max_len=target_length,
                            auto_aspect=True)

                    else:
                        input_seq, mel_prediction, alignment, target, target_length = sess.run(
                            [
                                model.tower_inputs[0][0],
                                model.tower_mel_outputs[0][0],
                                model.tower_alignments[0][0],
                                model.tower_mel_targets[0][0],
                                model.tower_targets_lengths[0][0],
                            ])

                    #save predicted mel spectrogram to disk (debug)
                    mel_filename = 'mel-prediction-step-{}.npy'.format(step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            mel_prediction.T,
                            allow_pickle=False)

                    #save griffin lim inverted wav for debug (mel -> wav)
                    if hparams.GL_on_GPU:
                        wav = sess.run(
                            GLGPU_mel_outputs,
                            feed_dict={GLGPU_mel_inputs: mel_prediction})
                        wav = audio.inv_preemphasis(wav, hparams.preemphasis,
                                                    hparams.preemphasize)
                    else:
                        wav = audio.inv_mel_spectrogram(
                            mel_prediction.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(wav_dir,
                                     'step-{}-wave-from-mel.wav'.format(step)),
                        sr=hparams.sample_rate)

                    #save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir,
                                     'step-{}-align.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, loss),
                        max_len=target_length // hparams.outputs_per_step)
                    #save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        mel_prediction,
                        os.path.join(
                            plot_dir,
                            'step-{}-mel-spectrogram.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, loss),
                        target_spectrogram=target,
                        max_len=target_length)
                    log('Input at step {}: {}'.format(
                        step, sequence_to_text(input_seq)))

                if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
                    #Get current checkpoint state
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    #Update Projector
                    log('\nSaving Model Character Embeddings visualization..')
                    add_embedding_stats(summary_writer,
                                        [model.embedding_table.name],
                                        [char_embedding_meta],
                                        checkpoint_state.model_checkpoint_path)
                    log('Tacotron Character embeddings have been updated on tensorboard!'
                        )

            log('Tacotron training complete after {} global steps!'.format(
                args.tacotron_train_steps),
                slack=True)
            return save_dir

        except Exception as e:
            #Log and stop the coordinator rather than re-raising; note this
            #makes the function return None on failure.
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
示例#5
0
	def load(self, checkpoint_path, hparams, gta=False, vae_code_mode='auto', model_name='Tacotron'):
		"""Build the VAE-Tacotron inference graph and restore weights from a checkpoint.

		Args:
			checkpoint_path: Path of the TF checkpoint to restore.
			hparams: Hyper-parameter object (num_mels, num_freq, vae_dim,
				use_vae, tacotron_num_gpus, ...).
			gta: If True, build in Ground-Truth-Aligned mode (mel targets fed in).
			vae_code_mode: One of 'auto', 'feed', 'modify', 'inference' --
				controls how the VAE latent code is obtained (see branch comments).
			model_name: Name passed to create_model (default 'Tacotron').
		"""
		log('Constructing model: %s' % model_name)
		#Force the batch size to be known in order to use attention masking in batch synthesis
		inputs = tf.placeholder(tf.int32, (None, None), name='inputs')
		#BUGFIX: shapes written as (None) are just None (not a tuple), which
		#declared fully-unknown shapes; (None,) declares the intended rank-1
		#(one entry per batch element) shape for the three length placeholders.
		input_lengths = tf.placeholder(tf.int32, (None,), name='input_lengths')
		targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name='mel_targets')
		#NOTE(review): these length placeholders are tf.float32 -- looks like
		#they should be integer counts; left unchanged for feed compatibility.
		lengths = tf.placeholder(tf.float32, (None,), name='target_lengths')
		mel_references = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name='mel_references')
		references_lengths = tf.placeholder(tf.float32, (None,), name='reference_lengths')
		vae_codes = tf.placeholder(tf.float32, (None, hparams.vae_dim), name='vae_codes')
		split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name='split_infos')
		with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope:
			self.model = create_model(model_name, hparams)
			if gta:
				if hparams.use_vae:
					#Generate vae_code by Gaussian sampling given the mean and variance, which are generated by the VAE network given the mel_targets. Used in GTA synthesis mode.
					self.model.initialize(inputs, input_lengths, mel_targets=targets, targets_lengths=lengths, gta=gta, use_vae=True, split_infos=split_infos)
				else:
					self.model.initialize(inputs, input_lengths, mel_targets=targets, gta=gta, split_infos=split_infos)
			else:
				if hparams.use_vae:
					if vae_code_mode == 'auto':
						#To generate vae_code by Gaussian sampling given the mean and variance, which are generated by the VAE network given the mel_references. Used in natural synthesis mode without args.modify_vae_dim specified.
						self.model.initialize(inputs, input_lengths, mel_references=mel_references, references_lengths=references_lengths, gta=gta, use_vae=True, split_infos=split_infos)
					elif vae_code_mode == 'feed':
						#Directly feed in specified vae_code into the Tacotron decoder network while the VAE network are not used. Used in eval mode when mel_reference is not given, no matter args.modify_vae_dim is specified or not.
						self.model.initialize(inputs, input_lengths, vae_codes=vae_codes, gta=gta, use_vae=True, split_infos=split_infos)
					elif vae_code_mode == 'modify':
						#Directly use the mean generated by the VAE network as vae_code, but with some modification according to the variance. Used in natural synthesis mode with args.modify_vae_dim specified.
						self.model.initialize(inputs, input_lengths, mel_references=mel_references, references_lengths=references_lengths, vae_codes=vae_codes, gta=gta, use_vae=True, split_infos=split_infos)
					elif vae_code_mode == 'inference':
						#To get the vae_code(mean) generated by the VAE given the mel_references. Useful when you wish to check the quality of your VAE latent embedding
						self.model.initialize(mel_references=mel_references, references_lengths=references_lengths, gta=gta, use_vae=True, split_infos=split_infos)
				else:
					self.model.initialize(inputs, input_lengths, gta=gta, split_infos=split_infos)

			self.mu = self.model.tower_mu
			self.log_var = self.model.tower_log_var
			self.mel_outputs = self.model.tower_mel_outputs
			self.linear_outputs = self.model.tower_linear_outputs if (hparams.predict_linear) else None
			self.alignments = self.model.tower_alignments
			self.stop_token_prediction = self.model.tower_stop_token_prediction

		if hparams.GL_on_GPU:
			#Griffin-Lim inversion graphs on GPU (mel/linear spectrogram -> waveform)
			self.GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels), name='GLGPU_mel_inputs')
			self.GLGPU_lin_inputs = tf.placeholder(tf.float32, (None, hparams.num_freq), name='GLGPU_lin_inputs')

			self.GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(self.GLGPU_mel_inputs, hparams)
			self.GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(self.GLGPU_lin_inputs, hparams)

		self.gta = gta
		#force feeding vae codes into the tacotron decoder(for eval mode) or generating the vae codes from the reference mel spectrograms
		self.vae_code_mode = vae_code_mode
		self._hparams = hparams
		#pad input sequences with the <pad_token> 0 ( _ )
		self._pad = 0
		#explicitly setting the padding to a value that doesn't originally exist in the spectrogram
		#to avoid any possible conflicts, without affecting the output range of the model too much
		if hparams.symmetric_mels:
			self._target_pad = -hparams.max_abs_value
		else:
			self._target_pad = 0.

		self.inputs = inputs
		self.input_lengths = input_lengths
		self.targets = targets
		self.lengths = lengths
		self.vae_codes = vae_codes
		self.mel_references = mel_references
		self.references_lengths = references_lengths
		self.split_infos = split_infos

		log('Loading checkpoint: %s' % checkpoint_path)
		#Memory allocation on the GPUs as needed
		config = tf.ConfigProto()
		config.gpu_options.allow_growth = True
		config.allow_soft_placement = True

		self.session = tf.Session(config=config)
		self.session.run(tf.global_variables_initializer())

		saver = tf.train.Saver()
		saver.restore(self.session, checkpoint_path)
示例#6
0
    def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
        """Build the Tacotron inference graph and restore weights from a checkpoint.

        Args:
            checkpoint_path: path of the checkpoint to restore from.
            hparams: hyper-parameter object (reads num_mels, num_freq,
                tacotron_num_gpus, predict_linear, GL_on_GPU, symmetric_mels,
                max_abs_value).
            gta: if True, initialize the model in ground-truth-aligned mode
                (mel targets are teacher-forced and linear outputs are disabled).
            model_name: model identifier forwarded to create_model().

        Side effects:
            Creates placeholders and output tensors on self, opens a
            tf.Session stored on self.session, and restores variables into it.
        """
        log('Constructing model: %s' % model_name)
        #Force the batch size to be known in order to use attention masking in batch synthesis
        inputs = tf.placeholder(tf.int32, (None, None), name='inputs')
        input_lengths = tf.placeholder(tf.int32, (None), name='input_lengths')
        # Mel targets are only consumed in GTA mode, but the placeholder is
        # created unconditionally.
        targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels),
                                 name='mel_targets')
        # Per-GPU split sizes used to scatter a batch across tacotron_num_gpus devices.
        split_infos = tf.placeholder(tf.int32,
                                     shape=(hparams.tacotron_num_gpus, None),
                                     name='split_infos')
        with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope:
            self.model = create_model(model_name, hparams)
            if gta:
                # GTA: decode with teacher-forced mel targets.
                self.model.initialize(inputs,
                                      input_lengths,
                                      targets,
                                      gta=gta,
                                      split_infos=split_infos)
            else:
                self.model.initialize(inputs,
                                      input_lengths,
                                      split_infos=split_infos)

            self.mel_outputs = self.model.tower_mel_outputs
            # Linear-spectrogram outputs only exist in natural synthesis mode
            # with predict_linear enabled; otherwise callers see None.
            self.linear_outputs = self.model.tower_linear_outputs if (
                hparams.predict_linear and not gta) else None
            self.alignments = self.model.tower_alignments
            self.stop_token_prediction = self.model.tower_stop_token_prediction
            self.targets = targets

        if hparams.GL_on_GPU:
            # Optional GPU Griffin-Lim: placeholders plus TF inversion graphs so
            # spectrogram-to-waveform runs in the same session.
            self.GLGPU_mel_inputs = tf.placeholder(tf.float32,
                                                   (None, hparams.num_mels),
                                                   name='GLGPU_mel_inputs')
            self.GLGPU_lin_inputs = tf.placeholder(tf.float32,
                                                   (None, hparams.num_freq),
                                                   name='GLGPU_lin_inputs')

            self.GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(
                self.GLGPU_mel_inputs, hparams)
            self.GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(
                self.GLGPU_lin_inputs, hparams)

        self.gta = gta
        self._hparams = hparams
        #pad input sequences with the <pad_token> 0 ( _ )
        self._pad = 0
        #explicitely setting the padding to a value that doesn't originally exist in the spectogram
        #to avoid any possible conflicts, without affecting the output range of the model too much
        if hparams.symmetric_mels:
            self._target_pad = -hparams.max_abs_value
        else:
            self._target_pad = 0.

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.targets = targets
        self.split_infos = split_infos
        # Checkpoint variable-renaming shim: maps old variable names in the
        # stored checkpoint to the names the current graph expects.
        OLD_CHECKPOINT_FILE = checkpoint_path
        # NOTE(review): NEW_CHECKPOINT_FILE is never used below — the renamed
        # variables are never saved out. Presumably a leftover from the usual
        # "load old names / resave under new names" recipe; confirm intent.
        NEW_CHECKPOINT_FILE = 'logs-Tacotron/taco_pretrained/new_model.ckpt-189500'
        log('Loading checkpoint: %s' % checkpoint_path)
        #UPDATE CHECKPOINT FILE VARS
        vars_to_rename = {
            "Tacotron_model/inference/CBHG_postnet/CBHG_postnet_highwaynet_1/H/biases":
            "Tacotron_model/inference/CBHG_postnet/CBHG_postnet_highwaynet_1/H/bias",
        }
        new_checkpoint_vars = {}
        reader = tf.train.NewCheckpointReader(OLD_CHECKPOINT_FILE)
        for old_name in reader.get_variable_to_shape_map():
            if old_name in vars_to_rename:
                new_name = vars_to_rename[old_name]
            else:
                new_name = old_name
            # Each checkpoint tensor is materialized as a fresh tf.Variable
            # initialized with the stored value (no explicit name= is given, so
            # these variables are separate from the model graph's own variables).
            new_checkpoint_vars[new_name] = tf.Variable(
                reader.get_tensor(old_name))
        #Memory allocation on the GPUs as needed
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        self.session = tf.Session(config=config)
        self.session.run(tf.global_variables_initializer())

        # NOTE(review): this saver's var_list is keyed on the RENAMED name
        # ('.../H/bias'), yet restore() reads checkpoint_path, whose stored key
        # is the OLD name ('.../H/biases') — restore should fail to find the
        # renamed key unless the checkpoint already contains the new names.
        # Also note the model's own graph variables are not in this var_list,
        # so they keep their random-initializer values. Verify against a real
        # checkpoint before relying on this path.
        saver = tf.train.Saver(new_checkpoint_vars)
        saver.restore(self.session, checkpoint_path)
示例#7
0
def train(log_dir, args, hparams):
    save_dir = os.path.join(log_dir, 'taco_pretrained')
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    mel_dir = os.path.join(log_dir, 'mel-spectrograms')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    tensorboard_dir = os.path.join(log_dir, 'tacotron_events')
    meta_folder = os.path.join(log_dir, 'metas')
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
    input_path = os.path.join(args.base_dir, args.tacotron_input)

    if hparams.predict_linear:
        linear_dir = os.path.join(log_dir, 'linear-spectrograms')
        os.makedirs(linear_dir, exist_ok=True)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    #Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    #Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, input_path, hparams, args)

    #Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, hparams, model)
    # if args.TEST:
    # 	for v in tf.global_variables():
    # 		print(v)

    #Embeddings metadata
    char_embedding_meta = os.path.join(meta_folder, 'CharacterEmbeddings.tsv')
    if not os.path.isfile(char_embedding_meta):
        with open(char_embedding_meta, 'w', encoding='utf-8') as f:
            for symbol in symbols:
                if symbol == ' ':
                    symbol = '\\s'  #For visual purposes, swap space with \s

                f.write('{}\n'.format(symbol))

    char_embedding_meta = char_embedding_meta.replace(log_dir, '..')

    #Potential Griffin-Lim GPU setup
    if hparams.GL_on_GPU:
        GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels),
                                          name='GLGPU_mel_inputs')
        GLGPU_lin_inputs = tf.placeholder(tf.float32, (None, hparams.num_freq),
                                          name='GLGPU_lin_inputs')

        GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(
            GLGPU_mel_inputs, hparams)
        GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(
            GLGPU_lin_inputs, hparams)

    #Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    loss_bef_window = ValueWindow(100)
    loss_aft_window = ValueWindow(100)
    loss_stop_window = ValueWindow(100)
    loss_reg_window = ValueWindow(100)
    loss_emt_window = ValueWindow(100)
    loss_spk_window = ValueWindow(100)
    loss_orthog_window = ValueWindow(100)
    loss_up_emt_window = ValueWindow(100)
    loss_up_spk_window = ValueWindow(100)
    loss_mo_up_emt_window = ValueWindow(100)
    loss_mo_up_spk_window = ValueWindow(100)
    if args.nat_gan:
        d_loss_t_window = ValueWindow(100)
        d_loss_p_window = ValueWindow(100)
        d_loss_up_window = ValueWindow(100)
        g_loss_p_window = ValueWindow(100)
        g_loss_up_window = ValueWindow(100)

    saver = tf.train.Saver(max_to_keep=args.max_to_keep)

    if args.opt_ref_no_mo and not (args.restart_optimizer_r):
        print(
            "WILL ATTEMPT TO RESTORE OPTIMIZER R - SET ARGS.RESTART_OPTIMIZER_R IF RETRAINING A MODEL THAT DIDN'T HAVE THE OPTIMIZER R"
        )

    assert (not (args.restart_nat_gan_d and args.restore_nat_gan_d_sep))

    var_list = tf.global_variables()
    var_list = [v for v in var_list if not ('pretrained' in v.name)]
    var_list = [
        v for v in var_list
        if not ('nat_gan' in v.name or 'optimizer_n' in v.name)
    ] if (args.restart_nat_gan_d or args.restore_nat_gan_d_sep) else var_list
    var_list = [
        v for v in var_list
        if not ('optimizer_r' in v.name or 'optimizer_3' in v.name)
    ] if args.restart_optimizer_r else var_list
    saver_restore = tf.train.Saver(var_list=var_list)

    if args.unpaired and args.pretrained_emb_disc:
        saver_restore_emt_disc = tf.train.Saver(var_list=[
            v for v in tf.global_variables()
            if ('pretrained_ref_enc_emt' in v.name)
        ])
        saver_restore_spk_disc = tf.train.Saver(var_list=[
            v for v in tf.global_variables()
            if ('pretrained_ref_enc_spk' in v.name)
        ])
    elif args.unpaired and args.pretrained_emb_disc_all:
        saver_restore_emt_disc = tf.train.Saver(var_list=[
            v for v in tf.global_variables() if ('refnet_emt' in v.name)
        ])
        saver_restore_spk_disc = tf.train.Saver(var_list=[
            v for v in tf.global_variables() if ('refnet_spk' in v.name)
        ])

    if args.nat_gan:
        saver_nat_gan = tf.train.Saver(var_list=[
            v for v in tf.global_variables()
            if ('nat_gan' in v.name or 'optimizer_n' in v.name)
        ])
        save_dir_nat_gan = r'nat_gan/pretrained_model'

    log('Tacotron training set to a maximum of {} steps'.format(
        args.tacotron_train_steps))
    if hparams.tacotron_fine_tuning:
        print('FINE TUNING SET TO TRUE - MAKE SURE THIS IS WHAT YOU WANT!')

    #Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    eval_feed_dict, emt_labels, spk_labels, \
    basenames, basenames_refs = get_eval_feed_dict(hparams, args.synth_metadata_filename,
                  eval_model, args.input_dir, args.flip_spk_emt)

    #Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
            # for x in tf.global_variables():
            # 	print(x)

            sess.run(tf.global_variables_initializer())
            #saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    if (checkpoint_state
                            and checkpoint_state.model_checkpoint_path):
                        log('Loading checkpoint {}'.format(
                            checkpoint_state.model_checkpoint_path),
                            slack=True)
                        saver_restore.restore(
                            sess, checkpoint_state.model_checkpoint_path)

                    else:
                        raise ValueError(
                            'No model to load at {}'.format(save_dir))

                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e), slack=True)
            else:
                log('Starting new training!', slack=True)
                saver.save(sess, checkpoint_path, global_step=global_step)

            if args.unpaired and (args.pretrained_emb_disc
                                  or args.pretrained_emb_disc_all):
                save_dir_emt = r'spk_disc/pretrained_model_emt_disc'
                checkpoint_state_emt = tf.train.get_checkpoint_state(
                    save_dir_emt)
                saver_restore_emt_disc.restore(
                    sess, checkpoint_state_emt.model_checkpoint_path)
                log('Loaded Emotion Discriminator from checkpoint {}'.format(
                    checkpoint_state_emt.model_checkpoint_path),
                    slack=True)

                save_dir_spk = r'spk_disc/pretrained_model_spk_disc'
                checkpoint_state_spk = tf.train.get_checkpoint_state(
                    save_dir_spk)
                saver_restore_spk_disc.restore(
                    sess, checkpoint_state_spk.model_checkpoint_path)
                log('Loaded Speaker Discriminator from checkpoint {}'.format(
                    checkpoint_state_spk.model_checkpoint_path),
                    slack=True)

            if args.nat_gan and args.restore_nat_gan_d_sep:
                checkpoint_state_nat_gan = tf.train.get_checkpoint_state(
                    save_dir_nat_gan)
                saver_nat_gan.restore(
                    sess, checkpoint_state_nat_gan.model_checkpoint_path)
                log('Loaded Nat Gan Discriminator from checkpoint {}'.format(
                    checkpoint_state_nat_gan.model_checkpoint_path),
                    slack=True)

            #initializing feeder
            feeder.start_threads(sess)

            #Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                # vars = [global_step, model.loss, model.optimize,model.before_loss, model.after_loss,model.stop_token_loss,
                # 				model.regularization_loss,model.style_emb_loss_emt, model.style_emb_loss_spk, model.style_emb_orthog_loss]
                # out = [step, loss, opt, bef, aft, stop, reg, loss_emt, loss_spk, loss_orthog]
                # message = 'Step {:7d} {:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}, bef={:.5f}, aft={:.5f}, stop={:.5f},' \
                # 					'reg={:.5f}, emt={:.5f}, spk={:.5f}, orthog={:.5f}'.format(step, time_window.average, loss, loss_window.average,
                # 																																		 loss_bef_window.average, loss_aft_window.average,
                # 																																		 loss_stop_window.average, loss_reg_window.average,
                # 																																		 loss_emt_window.average, loss_spk_window.average,
                # 																																		 loss_orthog_window.average)
                # if args.unpaired:
                # 	vars += [model.style_emb_loss_up_emt, model.style_emb_loss_up_spk,model.style_emb_loss_mel_out_up_emt, model.style_emb_loss_mel_out_up_spk]
                # 	out += [loss_up_emt, loss_up_spk, loss_mo_up_emt, loss_mo_up_spk]
                # 	message += ' up_emt={:.5f}, up_spk={:.5f}, mo_up_emt={:.5f}, mo_up_spk={:.5f}]'.format(loss_up_emt_window.average,
                # 																																												loss_up_spk_window.average,
                # 																																												loss_mo_up_emt_window.average,
                # 																																												loss_mo_up_spk_window.average)
                # if False:
                # 	vars += [model.tower_style_emb_logit_emt[0], model.tower_emt_labels[0],model.tower_style_emb_logit_up_emt[0],
                # 					model.tower_emt_up_labels[0],model.tower_spk_labels[0]]
                # 	out += [emt_logit, emt_labels, emt_up_logit, emt_up_labels, spk_labels]
                #
                # out = sess.run([vars])

                if args.nat_gan and (args.restart_nat_gan_d
                                     or not (args.restore)) and step == 0:
                    log("Will start with Training Nat GAN Discriminator",
                        end='\r')
                    disc_epochs = 300 if args.unpaired else 200
                    disc_epochs = 0 if args.TEST else disc_epochs
                    for i in range(disc_epochs + 1):
                        d_loss_t, d_loss_p, d_loss_up,\
                        d_loss_t_emt, d_loss_p_emt, d_loss_up_emt, \
                        d_loss_t_spk, d_loss_p_spk, d_loss_up_spk, \
                        opt_n = sess.run([model.d_loss_targ, model.d_loss_p, model.d_loss_up,
                                                 model.d_loss_targ_emt, model.d_loss_p_emt, model.d_loss_up_emt,
                                                 model.d_loss_targ_spk, model.d_loss_p_spk, model.d_loss_up_spk,
                                                 model.optimize_n])
                        message = 'step: {}, d_loss_t={:.5f}, d_loss_p ={:.5f}, d_loss_up ={:.5f},' \
                             ' d_loss_t_emt={:.5f}, d_loss_p_emt ={:.5f}, d_loss_up_emt ={:.5f},' \
                             ' d_loss_t_spk={:.5f}, d_loss_p_spk ={:.5f}, d_loss_up_spk ={:.5f}'.format(i, d_loss_t, d_loss_p, d_loss_up,
                                                                 d_loss_t_emt, d_loss_p_emt, d_loss_up_emt,
                                                                 d_loss_t_spk, d_loss_p_spk, d_loss_up_spk)
                        log(message, end='\r')
                    os.makedirs(r'nat_gan', exist_ok=True)
                    os.makedirs(r'nat_gan/pretrained_model', exist_ok=True)
                    checkpoint_path_nat_gan = os.path.join(
                        save_dir_nat_gan, 'nat_gan_model.ckpt')
                    saver_nat_gan.save(sess,
                                       checkpoint_path_nat_gan,
                                       global_step=i)

                if args.nat_gan:
                    d_loss_t, d_loss_p, d_loss_up, opt_n = sess.run([
                        model.d_loss_targ, model.d_loss_p, model.d_loss_up,
                        model.optimize_n
                    ])

                if args.unpaired:
                    step, tfr, loss, opt, bef, aft, stop, reg, loss_emt, loss_spk, loss_orthog, \
                    loss_up_emt, loss_up_spk, loss_mo_up_emt, loss_mo_up_spk, g_loss_p, g_loss_up, mels, opt_r\
                    = sess.run([global_step, model.ratio, model.loss, model.optimize,model.before_loss, model.after_loss,model.stop_token_loss,
                        model.regularization_loss, model.style_emb_loss_emt, model.style_emb_loss_spk, model.style_emb_orthog_loss,
                        model.style_emb_loss_up_emt, model.style_emb_loss_up_spk,model.style_emb_loss_mel_out_up_emt,
                          model.style_emb_loss_mel_out_up_spk,model.g_loss_p, model.g_loss_up, model.tower_mel_outputs[0], model.optimize_r])

                else:
                    step, tfr, loss, opt, bef, aft, stop, reg, loss_emt, loss_spk, loss_orthog, \
                    loss_up_emt, loss_up_spk, loss_mo_up_emt, loss_mo_up_spk, g_loss_p, g_loss_up, mels,dec_out,opt_r = sess.run([global_step, model.helper._ratio, model.loss,
                        model.optimize, model.before_loss, model.after_loss, model.stop_token_loss,
                        model.regularization_loss, model.style_emb_loss_emt, model.style_emb_loss_spk, model.style_emb_orthog_loss,
                        model.style_emb_loss_up_emt, model.style_emb_loss_up_spk,model.style_emb_loss_mel_out_up_emt,
                        model.style_emb_loss_mel_out_up_spk, model.g_loss_p, model.g_loss_up, model.tower_mel_outputs[0],model.tower_decoder_output[0],model.optimize_r])

                    # step, loss, opt, bef, aft, stop, reg, loss_emt, loss_spk, loss_orthog, \
                    # loss_up_emt, loss_up_spk, loss_mo_up_emt, loss_mo_up_spk, g_loss_p, g_loss_up, mels,ref_emt,ref_spk,ref_up_emt,ref_up_spk,emb,enc_out,enc_out_up,\
                    # stop_pred, targ, inp, inp_len,targ_len,stop_targ,mels_up,dec_out,dec_out_up,opt_r\
                    # = sess.run([global_step, model.loss, model.optimize,model.before_loss, model.after_loss,model.stop_token_loss,
                    # 				model.regularization_loss, model.style_emb_loss_emt, model.style_emb_loss_spk, model.style_emb_orthog_loss,
                    # 				model.style_emb_loss_up_emt, model.style_emb_loss_up_spk,model.style_emb_loss_mel_out_up_emt,
                    # 						model.style_emb_loss_mel_out_up_spk,model.g_loss_p, model.g_loss_up, model.tower_mel_outputs[0],
                    # 						model.tower_refnet_out_emt[0],model.tower_refnet_out_spk[0],model.tower_refnet_out_up_emt[0],model.tower_refnet_out_up_spk[0],
                    # 						model.tower_embedded_inputs[0], model.tower_encoder_outputs[0],model.tower_encoder_outputs_up[0],model.tower_stop_token_prediction[0],
                    # 						model.tower_mel_targets[0],model.tower_inputs[0],model.tower_input_lengths[0],model.tower_targets_lengths[0],
                    # 						model.tower_stop_token_targets[0],model.tower_mel_outputs_up[0],model.tower_decoder_output[0],model.tower_decoder_output_up[0],model.optimize_r])
                    #
                    # if args.save_output_vars:
                    # 	import pandas as pd
                    # 	pd.DataFrame(emb[:, 0, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\emb.csv')
                    # 	pd.DataFrame(enc_out[:, 0, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\enc_out.csv')
                    # 	pd.DataFrame(enc_out_up[:, 0, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\enc_out_up.csv')
                    # 	pd.DataFrame(stop_pred[:, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\stop.csv')
                    # 	pd.DataFrame(targ[:, 0, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\targ.csv')
                    # 	pd.DataFrame(inp[:, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\inp.csv')
                    # 	pd.DataFrame(inp_len[:]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\inp_len.csv')
                    # 	pd.DataFrame(targ_len[:]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\targ_len.csv')
                    # 	pd.DataFrame(stop_targ[:, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\stop_targ.csv')
                    # 	pd.DataFrame(mels_up[:, 0, 0:5]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\mels_up.csv')
                    # 	pd.DataFrame(dec_out_up[:, 0, 0:5]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\dec_out_up.csv')

                    if args.save_output_vars:
                        import pandas as pd
                        pd.DataFrame(mels[:, 0, 0:5]).to_csv(
                            r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\mels.csv'
                        )
                        pd.DataFrame(dec_out[:, 0, 0:5]).to_csv(
                            r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\dec_out.csv'
                        )

                # import pandas as pd
                # print(emt_logit.shape, emt_labels.shape)
                # if len(emt_logit.shape)>2:
                # 	emt_logit = emt_logit.squeeze(1)
                # 	emt_up_logit = emt_up_logit.squeeze(1)
                # emt_labels = emt_labels.reshape(-1,1)
                # emt_up_labels = emt_up_labels.reshape(-1, 1)
                # spk_labels = spk_labels.reshape(-1, 1)
                # df = np.concatenate((emt_logit,emt_labels,spk_labels,emt_up_logit,emt_up_labels),axis=1)
                # print(emt_labels)
                # print(emt_logit)
                # print(emt_up_labels)
                # print(emt_up_logit)
                #
                # pd.DataFrame(df).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\emt_logit_.001_up_10k.csv')
                # raise

                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                loss_bef_window.append(bef)
                loss_aft_window.append(aft)
                loss_stop_window.append(stop)
                loss_reg_window.append(reg)
                loss_emt_window.append(loss_emt)
                loss_spk_window.append(loss_spk)
                loss_orthog_window.append(loss_orthog)
                loss_up_emt_window.append(loss_up_emt)
                loss_up_spk_window.append(loss_up_spk)
                loss_mo_up_emt_window.append(loss_mo_up_emt)
                loss_mo_up_spk_window.append(loss_mo_up_spk)

                if args.nat_gan:
                    d_loss_t_window.append(d_loss_t)
                    d_loss_p_window.append(d_loss_p)
                    d_loss_up_window.append(d_loss_up)
                    g_loss_p_window.append(g_loss_p)
                    g_loss_up_window.append(g_loss_up)

                message = 'Step {:7d} {:.3f} sec/step, tfr={:.3f}, loss={:.5f}, avg_loss={:.5f}, bef={:.5f}, aft={:.5f}, stop={:.5f}, reg={:.5f}'.format(
                    step, time_window.average, tfr, loss, loss_window.average,
                    loss_bef_window.average, loss_aft_window.average,
                    loss_stop_window.average, loss_reg_window.average)
                if args.emt_attn:
                    message += ' emt={:.5f}, spk={:.5f}, spk_l2={:.5f}'.format(
                        loss_emt_window.average, loss_spk_window.average,
                        loss_orthog_window.average)
                else:
                    message += ' emt={:.5f}, spk={:.5f}, orthog={:.5f},'.format(
                        loss_emt_window.average, loss_spk_window.average,
                        loss_orthog_window.average)
                if args.unpaired:
                    message += ' up_emt={:.5f}, up_spk={:.5f}, mo_up_emt={:.5f}, mo_up_spk={:.5f}'.format(
                        loss_up_emt_window.average, loss_up_spk_window.average,
                        loss_mo_up_emt_window.average,
                        loss_mo_up_spk_window.average)
                if args.nat_gan:
                    message += ' d_loss_t={:.5f}, d_loss_p ={:.5f}, d_loss_up ={:.5f}, g_loss_p ={:.5f}, g_loss_up ={:.5f}'.format(
                        d_loss_t_window.average, d_loss_p_window.average,
                        d_loss_up_window.average, g_loss_p_window.average,
                        g_loss_up_window.average)

                log(message,
                    end='\r',
                    slack=(step % args.checkpoint_interval == 0))

                if np.isnan(loss) or loss > 100.:
                    log('Loss exploded to {:.5f} at step {}'.format(
                        loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                # if step % args.eval_interval == 0:
                # 	#Run eval and save eval stats
                # 	log('\nRunning evaluation and saving model at step {}'.format(step))
                # 	saver.save(sess, checkpoint_path, global_step=global_step)
                #
                # 	eval_losses = []
                # 	before_losses = []
                # 	after_losses = []
                # 	stop_token_losses = []
                # 	linear_losses = []
                # 	linear_loss = None
                #
                # 	if hparams.predict_linear:
                # 		for i in tqdm(range(feeder.test_steps)):
                # 			eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p, lin_t = sess.run([
                # 				eval_model.tower_loss[0], eval_model.tower_before_loss[0], eval_model.tower_after_loss[0],
                # 				eval_model.tower_stop_token_loss[0], eval_model.tower_linear_loss[0], eval_model.tower_mel_outputs[0][0],
                # 				eval_model.tower_mel_targets[0][0], eval_model.tower_targets_lengths[0][0],
                # 				eval_model.tower_alignments[0][0], eval_model.tower_linear_outputs[0][0],
                # 				eval_model.tower_linear_targets[0][0],
                # 				])
                # 			eval_losses.append(eloss)
                # 			before_losses.append(before_loss)
                # 			after_losses.append(after_loss)
                # 			stop_token_losses.append(stop_token_loss)
                # 			linear_losses.append(linear_loss)
                # 		linear_loss = sum(linear_losses) / len(linear_losses)
                #
                # 		if hparams.GL_on_GPU:
                # 			wav = sess.run(GLGPU_lin_outputs, feed_dict={GLGPU_lin_inputs: lin_p})
                # 			wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
                # 		else:
                # 			wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
                # 		audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate)
                #
                # 	else:
                # 		for i in tqdm(range(feeder.test_steps)):
                # 			eloss, before_loss, after_loss, stop_token_loss, input_seq, mel_p, mel_t, t_len, align = sess.run([
                # 				eval_model.tower_loss[0], eval_model.tower_before_loss[0], eval_model.tower_after_loss[0],
                # 				eval_model.tower_stop_token_loss[0],eval_model.tower_inputs[0][0], eval_model.tower_mel_outputs[0][0],
                # 				eval_model.tower_mel_targets[0][0],
                # 				eval_model.tower_targets_lengths[0][0], eval_model.tower_alignments[0][0]
                # 				])
                # 			eval_losses.append(eloss)
                # 			before_losses.append(before_loss)
                # 			after_losses.append(after_loss)
                # 			stop_token_losses.append(stop_token_loss)
                #
                # 	eval_loss = sum(eval_losses) / len(eval_losses)
                # 	before_loss = sum(before_losses) / len(before_losses)
                # 	after_loss = sum(after_losses) / len(after_losses)
                # 	stop_token_loss = sum(stop_token_losses) / len(stop_token_losses)
                #
                # 	# log('Saving eval log to {}..'.format(eval_dir))
                # 	#Save some log to monitor model improvement on same unseen sequence
                # 	if hparams.GL_on_GPU:
                # 		wav = sess.run(GLGPU_mel_outputs, feed_dict={GLGPU_mel_inputs: mel_p})
                # 		wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
                # 	else:
                # 		wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                # 	audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate)
                #
                # 	input_seq = sequence_to_text(input_seq)
                # 	plot.plot_alignment(align, os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)),
                # 		title='{}, {}, step={}, loss={:.5f}\n{}'.format(args.model, time_string(), step, eval_loss, input_seq),
                # 		max_len=t_len // hparams.outputs_per_step)
                # 	plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)),
                # 		title='{}, {}, step={}, loss={:.5f}\n{}'.format(args.model, time_string(), step, eval_loss,input_seq), target_spectrogram=mel_t,
                # 		max_len=t_len)
                #
                # 	if hparams.predict_linear:
                # 		plot.plot_spectrogram(lin_p, os.path.join(eval_plot_dir, 'step-{}-eval-linear-spectrogram.png'.format(step)),
                # 			title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss), target_spectrogram=lin_t,
                # 			max_len=t_len, auto_aspect=True)
                #
                # 	log('Step {:7d} [eval loss: {:.3f}, before loss: {:.3f}, after loss: {:.3f}, stop loss: {:.3f}]'.format(step, eval_loss, before_loss, after_loss, stop_token_loss))
                # 	# log('Writing eval summary!')
                # 	add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, eval_loss)

                # Periodically persist the model: every `checkpoint_interval`
                # steps, at the final training step, and once at step 300.
                # NOTE(review): the hard-coded `step == 300` forces one early
                # checkpoint shortly after start — presumably a sanity check;
                # confirm intent and consider making it a named constant/arg.
                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300:
                    #Save model and current global step
                    saver.save(sess, checkpoint_path, global_step=global_step)

                    log('\nSaved model at step {}'.format(step))

                # Periodic evaluation: run the eval model on one batch and dump
                # Griffin-Lim-inverted wavs plus alignment/spectrogram plots to
                # disk for manual inspection. Relies on names bound earlier in
                # the (not shown) enclosing function: `eval_model`,
                # `eval_feed_dict`, `basenames`, `basenames_refs`,
                # `get_output_lengths`, `wav_dir`, `plot_dir`, and (when
                # GL_on_GPU) `GLGPU_mel_inputs`/`GLGPU_mel_outputs`.
                if step % args.eval_interval == 0:

                    if hparams.predict_linear:
                        # The linear-spectrogram eval path was never ported to
                        # this setup; fail loudly instead of running stale code.
                        raise ValueError('predict linear not implemented')
                        # input_seq, mel_prediction, linear_prediction, alignment, target, target_length, linear_target = sess.run([
                        # 	model.tower_inputs[0][0],
                        # 	model.tower_mel_outputs[0][0],
                        # 	model.tower_linear_outputs[0][0],
                        # 	model.tower_alignments[0][0],
                        # 	model.tower_mel_targets[0][0],
                        # 	model.tower_targets_lengths[0][0],
                        # 	model.tower_linear_targets[0][0],
                        # 	])
                        #
                        # #save predicted linear spectrogram to disk (debug)
                        # linear_filename = 'linear-prediction-step-{}.npy'.format(step)
                        # np.save(os.path.join(linear_dir, linear_filename), linear_prediction.T, allow_pickle=False)
                        #
                        # #save griffin lim inverted wav for debug (linear -> wav)
                        # if hparams.GL_on_GPU:
                        # 	wav = sess.run(GLGPU_lin_outputs, feed_dict={GLGPU_lin_inputs: linear_prediction})
                        # 	wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
                        # else:
                        # 	wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams)
                        # audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate)
                        #
                        # #Save real and predicted linear-spectrogram plot to disk (control purposes)
                        # plot.plot_spectrogram(linear_prediction, os.path.join(plot_dir, 'step-{}-linear-spectrogram.png'.format(step)),
                        # 	title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss), target_spectrogram=linear_target,
                        # 	max_len=target_length, auto_aspect=True)

                    else:
                        # Fetch one eval batch. Each fetched value is
                        # tower-major: a per-GPU list of per-example outputs.
                        input_seqs, mels, alignments,\
                        stop_tokens = sess.run([eval_model.tower_inputs,
                              eval_model.tower_mel_outputs,
                              eval_model.tower_alignments,
                              eval_model.tower_stop_token_prediction],
                                 feed_dict=eval_feed_dict)

                        # num_evals = len(input_seqs) if False else 1
                        # for i in range(num_evals):
                        # 	input_seq = input_seqs[i]
                        # 	mel_prediction = mel_predictions[i]
                        # 	alignment = alignments[i]
                        # 	target = targets[i]
                        # 	target_length = target_lengths[i]
                        # 	emt = emts[i]
                        # 	spk = spks[i]
                        # 	if args.emt_attn and args.attn=='simple':
                        # 		alignment_emt = alignments_emt[0][i]

                        # Linearize outputs (n_gpus -> 1D)
                        inp = [
                            inp for gpu_inp in input_seqs for inp in gpu_inp
                        ]
                        mels = [mel for gpu_mels in mels for mel in gpu_mels]
                        # targets = [target for gpu_targets in targets for target in gpu_targets]
                        alignments = [
                            align for gpu_aligns in alignments
                            for align in gpu_aligns
                        ]
                        stop_tokens = [
                            token for gpu_token in stop_tokens
                            for token in gpu_token
                        ]

                        # Best-effort synthesis: any failure below is reported
                        # and swallowed so training is never interrupted.
                        try:
                            # Infer each example's true output length from its
                            # stop-token predictions.
                            target_lengths = get_output_lengths(stop_tokens)

                            # Take off the batch wise padding
                            mels = [
                                mel[:target_length, :]
                                for mel, target_length in zip(
                                    mels, target_lengths)
                            ]

                            # Clip predictions to the Tacotron-2 output range
                            # implied by the hparams (symmetric or positive).
                            T2_output_range = (
                                -hparams.max_abs_value, hparams.max_abs_value
                            ) if hparams.symmetric_mels else (
                                0, hparams.max_abs_value)
                            mels = [
                                np.clip(m, T2_output_range[0],
                                        T2_output_range[1]) for m in mels
                            ]

                            # Bucket outputs into per-500-step subfolders so a
                            # long run does not flood a single directory.
                            folder_bucket = 'step_{}'.format(step // 500)
                            folder_wavs_save = os.path.join(
                                wav_dir, folder_bucket)
                            folder_plot_save = os.path.join(
                                plot_dir, folder_bucket)
                            os.makedirs(folder_wavs_save, exist_ok=True)
                            os.makedirs(folder_plot_save, exist_ok=True)

                            for i, (mel, align, basename,
                                    basename_ref) in enumerate(
                                        zip(mels, alignments, basenames,
                                            basenames_refs)):

                                #save griffin lim inverted wav for debug (mel -> wav)
                                if hparams.GL_on_GPU:
                                    wav = sess.run(
                                        GLGPU_mel_outputs,
                                        feed_dict={GLGPU_mel_inputs: mel})
                                    wav = audio.inv_preemphasis(
                                        wav, hparams.preemphasis,
                                        hparams.preemphasize)
                                else:
                                    wav = audio.inv_mel_spectrogram(
                                        mel.T, hparams)
                                audio.save_wav(
                                    wav,
                                    os.path.join(
                                        folder_wavs_save,
                                        'step_{}_wav_{}_{}_{}.wav'.format(
                                            step, i, basename, basename_ref)),
                                    sr=hparams.sample_rate)

                                input_seq = sequence_to_text(inp[i])
                                #save alignment plot to disk (control purposes)
                                try:
                                    plot.plot_alignment(
                                        align,
                                        os.path.join(
                                            folder_plot_save,
                                            'step_{}_wav_{}_{}_{}_align.png'.
                                            format(step, i, basename,
                                                   basename_ref)),
                                        title='{}, {}, step={}\n{}'.format(
                                            args.model, time_string(), step,
                                            input_seq),
                                        max_len=target_lengths[i] //
                                        hparams.outputs_per_step)
                                except:
                                    # NOTE(review): bare except also swallows
                                    # KeyboardInterrupt/SystemExit — narrow to
                                    # `except Exception:` and log the error.
                                    print("failed to plot alignment")
                                try:
                                    #save real and predicted mel-spectrogram plot to disk (control purposes)
                                    plot.plot_spectrogram(
                                        mel,
                                        os.path.join(
                                            folder_plot_save,
                                            'step-{}-{}-mel-spectrogram.png'.
                                            format(step, i)),
                                        title='{}, {}, step={}\n{}'.format(
                                            args.model, time_string(), step,
                                            input_seq))
                                    # target_spectrogram=targets[i],
                                    # max_len=target_lengths[i])
                                except:
                                    # NOTE(review): bare except — same issue as
                                    # above; prefer `except Exception:`.
                                    print("failed to plot spectrogram")

                            log('Saved synthesized samples for step {}'.format(
                                step),
                                end='\r')
                        except:
                            # NOTE(review): bare except hides the actual error;
                            # at minimum print the exception/traceback here.
                            print("Couldn't synthesize samples")
                        # log('Input at step {}: {}'.format(step, input_seq), end='\r')

                # if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
                # 	#Get current checkpoint state
                # 	checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                #
                # 	#Update Projector
                # 	log('\nSaving Model Character Embeddings visualization..')
                # 	add_embedding_stats(summary_writer, [model.embedding_table.name], [char_embedding_meta], checkpoint_state.model_checkpoint_path)
                # 	log('Tacotron Character embeddings have been updated on tensorboard!')

            # Normal completion: announce it (slack=True routes the message to
            # the configured Slack hook as well) and return the checkpoint
            # directory to the caller.
            log('Tacotron training complete after {} global steps!'.format(
                args.tacotron_train_steps),
                slack=True)
            return save_dir

        # Top-level boundary for the whole training loop (the matching `try:`
        # is above this chunk): log the failure, dump the traceback, and pass
        # the exception to the coordinator so feeder/queue threads stop cleanly.
        except Exception as e:
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)