def mel_to_wav(self, mels):
        if self._sess is None:
            self._sess = tf.Session(graph=self._graph,
                                    config=_get_config_proto())

        time_1 = time.time()

        audio_lengths, b_mels = process_mels(mels)

        wavs = self._sess.run(self.output_op.outputs[0],
                              feed_dict={self.input_op.outputs[0]: b_mels})
        time_2 = time.time()

        if hparams.input_type == 'mulaw-quantize':
            wavs = util.inv_mulaw_quantize(wavs, mu=hparams.quantize_channels)

        wavs = [wav[:length] for wav, length in zip(wavs, audio_lengths)]

        #log('WaveNet synthesise {} samples in {:.2f} seconds'.format(sum(len(wav) for wav in wavs), time_2 - time_1))
        if self.volatile:
            self._sess.close()

        return wavs
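A note on the decode step above: a minimal NumPy sketch of the inverse mu-law companding that util.inv_mulaw_quantize is assumed to implement here (some implementations pass mu = quantize_channels - 1 instead; check the imported util module):

import numpy as np

def inv_mulaw(y, mu=256):
    # Inverse mu-law companding: map y in [-1, 1] back to the linear domain
    return np.sign(y) / mu * ((1.0 + mu) ** np.abs(y) - 1.0)

def inv_mulaw_quantize(y, mu=256):
    # Integer bins [0, mu] -> [-1, 1], then expand with inverse mu-law
    y = 2.0 * np.asarray(y, dtype=np.float32) / mu - 1.0
    return inv_mulaw(y, mu)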
Example #2
    def initialize(self,
                   y,
                   c,
                   g,
                   input_lengths,
                   x=None,
                   synthesis_length=None):
        '''Initialize wavenet graph for train, eval and test cases.'''
        hparams = self._hparams
        self.is_training = x is not None
        self.is_evaluating = not self.is_training and y is not None
        #Set all convolutions to corresponding mode
        self.set_mode(self.is_training)

        log('Initializing Wavenet model.  Dimensions (? = dynamic shape): ')
        log('  Train mode:                {}'.format(self.is_training))
        log('  Eval mode:                 {}'.format(self.is_evaluating))
        log('  Synthesis mode:            {}'.format(not (
            self.is_training or self.is_evaluating)))
        with tf.variable_scope('inference') as scope:
            #Training
            if self.is_training:
                batch_size = tf.shape(x)[0]
                #[batch_size, time_length, 1]
                self.mask = self.get_mask(
                    input_lengths,
                    maxlen=tf.shape(x)[-1])  #To be used in loss computation
                #[batch_size, channels, time_length]
                y_hat = self.step(
                    x, c, g, softmax=False
                )  #softmax is automatically computed inside softmax_cross_entropy if needed

                if is_mulaw_quantize(hparams.input_type):
                    #[batch_size, time_length, channels]
                    self.y_hat_q = tf.transpose(y_hat, [0, 2, 1])

                self.y_hat = y_hat
                self.y = y
                self.input_lengths = input_lengths

                #Add mean and scale stats if using Gaussian distribution output (logging every logistic parameter would be too much if using MoL)
                if self._hparams.out_channels == 2:
                    self.means = self.y_hat[:, 0, :]
                    self.log_scales = self.y_hat[:, 1, :]
                else:
                    self.means = None

                #Graph extension for log saving
                #[batch_size, time_length]
                shape_control = (batch_size, tf.shape(x)[-1], 1)
                with tf.control_dependencies(
                    [tf.assert_equal(tf.shape(y), shape_control)]):
                    y_log = tf.squeeze(y, [-1])
                    if is_mulaw_quantize(hparams.input_type):
                        self.y = y_log

                y_hat_log = tf.cond(tf.equal(tf.rank(y_hat), 4),
                                    lambda: tf.squeeze(y_hat, [-1]),
                                    lambda: y_hat)
                y_hat_log = tf.reshape(y_hat_log,
                                       [batch_size, hparams.out_channels, -1])

                if is_mulaw_quantize(hparams.input_type):
                    #[batch_size, time_length]
                    y_hat_log = tf.argmax(tf.nn.softmax(y_hat_log, axis=1), 1)

                    y_hat_log = util.inv_mulaw_quantize(
                        y_hat_log, hparams.quantize_channels)
                    y_log = util.inv_mulaw_quantize(y_log,
                                                    hparams.quantize_channels)

                else:
                    #[batch_size, time_length]
                    if hparams.out_channels == 2:
                        y_hat_log = sample_from_gaussian(
                            y_hat_log,
                            log_scale_min_gauss=hparams.log_scale_min_gauss)
                    else:
                        y_hat_log = sample_from_discretized_mix_logistic(
                            y_hat_log, log_scale_min=hparams.log_scale_min)

                    if is_mulaw(hparams.input_type):
                        y_hat_log = util.inv_mulaw(y_hat_log,
                                                   hparams.quantize_channels)
                        y_log = util.inv_mulaw(y_log,
                                               hparams.quantize_channels)

                self.y_hat_log = y_hat_log
                self.y_log = y_log

                log('  inputs:                    {}'.format(x.shape))
                if self.local_conditioning_enabled():
                    log('  local_condition:           {}'.format(c.shape))
                if self.has_speaker_embedding():
                    log('  global_condition:          {}'.format(g.shape))
                log('  targets:                   {}'.format(y_log.shape))
                log('  outputs:                   {}'.format(y_hat_log.shape))

            #evaluating
            elif self.is_evaluating:
                #[time_length, ]
                idx = 0
                length = input_lengths[idx]
                y_target = tf.reshape(y[idx], [-1])[:length]

                if c is not None:
                    c = tf.expand_dims(c[idx, :, :length], axis=0)
                    with tf.control_dependencies(
                        [tf.assert_equal(tf.rank(c), 3)]):
                        c = tf.identity(c, name='eval_assert_c_rank_op')
                if g is not None:
                    g = tf.expand_dims(g[idx], axis=0)

                batch_size = tf.shape(c)[0]

                #Start silence frame
                if is_mulaw_quantize(hparams.input_type):
                    initial_value = mulaw_quantize(0,
                                                   hparams.quantize_channels)
                elif is_mulaw(hparams.input_type):
                    initial_value = mulaw(0.0, hparams.quantize_channels)
                else:
                    initial_value = 0.0

                #[channels, ]
                if is_mulaw_quantize(hparams.input_type):
                    initial_input = tf.one_hot(indices=initial_value,
                                               depth=hparams.quantize_channels,
                                               dtype=tf.float32)
                    initial_input = tf.tile(
                        tf.reshape(initial_input,
                                   [1, 1, hparams.quantize_channels]),
                        [batch_size, 1, 1])
                else:
                    initial_input = tf.ones([batch_size, 1, 1],
                                            tf.float32) * initial_value

                #Fast eval
                y_hat = self.incremental(initial_input,
                                         c=c,
                                         g=g,
                                         time_length=length,
                                         softmax=False,
                                         quantize=True,
                                         log_scale_min=hparams.log_scale_min)

                #Save targets and length for eval loss computation
                if is_mulaw_quantize(hparams.input_type):
                    self.y_eval = tf.reshape(y[idx], [1, -1])[:, :length]
                else:
                    self.y_eval = tf.expand_dims(y[idx], axis=0)[:, :length, :]
                self.eval_length = length

                if is_mulaw_quantize(hparams.input_type):
                    y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1])
                    y_hat = inv_mulaw_quantize(y_hat,
                                               hparams.quantize_channels)
                    y_target = inv_mulaw_quantize(y_target,
                                                  hparams.quantize_channels)
                elif is_mulaw(hparams.input_type):
                    y_hat = inv_mulaw(tf.reshape(y_hat, [-1]),
                                      hparams.quantize_channels)
                    y_target = inv_mulaw(y_target, hparams.quantize_channels)
                else:
                    y_hat = tf.reshape(y_hat, [-1])

                self.y_hat = y_hat
                self.y_target = y_target

                if self.local_conditioning_enabled():
                    log('  local_condition:           {}'.format(c.shape))
                if self.has_speaker_embedding():
                    log('  global_condition:          {}'.format(g.shape))
                log('  targets:                   {}'.format(y_target.shape))
                log('  outputs:                   {}'.format(y_hat.shape))

            #synthesizing
            else:
                batch_size = tf.shape(c)[0]
                if c is None:
                    assert synthesis_length is not None
                else:
                    #[batch_size, local_condition_time, local_condition_dimension(num_mels)]
                    message = (
                        'Expected 3 dimension shape [batch_size(1), time_length, {}] for local condition features but found {}'
                        .format(hparams.cin_channels, c.shape))
                    with tf.control_dependencies(
                        [tf.assert_equal(tf.rank(c), 3, message=message)]):
                        c = tf.identity(c, name='synthesis_assert_c_rank_op')

                    Tc = tf.shape(c)[1]
                    upsample_factor = audio.get_hop_size(self._hparams)

                    #Overwrite length with respect to local condition features
                    synthesis_length = Tc * upsample_factor

                    #[batch_size, local_condition_dimension, local_condition_time]
                    #time_length will be corrected using the upsample network
                    c = tf.transpose(c, [0, 2, 1])

                #Start silence frame
                if is_mulaw_quantize(hparams.input_type):
                    initial_value = mulaw_quantize(0,
                                                   hparams.quantize_channels)
                elif is_mulaw(hparams.input_type):
                    initial_value = mulaw(0.0, hparams.quantize_channels)
                else:
                    initial_value = 0.0

                if is_mulaw_quantize(hparams.input_type):
                    assert initial_value >= 0 and initial_value < hparams.quantize_channels
                    initial_input = tf.one_hot(indices=initial_value,
                                               depth=hparams.quantize_channels,
                                               dtype=tf.float32)
                    initial_input = tf.tile(
                        tf.reshape(initial_input,
                                   [1, 1, hparams.quantize_channels]),
                        [batch_size, 1, 1])
                else:
                    initial_input = tf.ones([batch_size, 1, 1],
                                            tf.float32) * initial_value

                y_hat = self.incremental(initial_input,
                                         c=c,
                                         g=g,
                                         time_length=synthesis_length,
                                         softmax=False,
                                         quantize=True,
                                         log_scale_min=hparams.log_scale_min)

                if is_mulaw_quantize(hparams.input_type):
                    y_hat = tf.reshape(tf.argmax(y_hat, axis=1),
                                       [batch_size, -1])
                    self.out_node = y_hat
                    y_hat = util.inv_mulaw_quantize(y_hat,
                                                    hparams.quantize_channels)
                elif is_mulaw(hparams.input_type):
                    y_hat = util.inv_mulaw(tf.reshape(y_hat, [batch_size, -1]),
                                           hparams.quantize_channels)
                else:
                    y_hat = tf.reshape(y_hat, [batch_size, -1])

                self.y_hat = y_hat

                if self.local_conditioning_enabled():
                    log('  local_condition:            {}'.format(c.shape))
                if self.has_speaker_embedding():
                    log('  global_condition:           {}'.format(g.shape))
                log('  outputs:                    {}'.format(y_hat.shape))

        self.variables = tf.trainable_variables()
        self.ema = tf.train.ExponentialMovingAverage(
            decay=hparams.wavenet_ema_decay)
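Which branch initialize takes is determined entirely by which arguments are non-None. A hypothetical sketch of the three call patterns (the model instance and tensor names are assumptions):

# Training: teacher-forced inputs x are provided
model.initialize(y, c, g, input_lengths, x=x)

# Evaluation: targets y are provided but x is None
model.initialize(y, c, g, input_lengths)

# Synthesis: neither x nor y; length comes from c (upsampled) or synthesis_length
model.initialize(None, c, g, None, synthesis_length=32000)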
Example #3
    D = torch.load(f'model_gan/wD_epoch{model_epoch}step{model_step}.model', map_location='cpu')
    G = torch.load(f'model_gan/wG_epoch{model_epoch}step{model_step}.model', map_location='cpu')
    print(f'loaded last model: epoch{model_epoch} step{model_step}')
if use_cuda:
    G.cuda()
    D.cuda()
else:
    G.cpu()
    D.cpu()


mel = np.load('/Users/edz/Desktop/work/tjTaco2/tacotron_output/gta/speech-mel-{0:0>5}.npy'.format(4))
mel = mel / 6  # ad-hoc scaling of the GTA mel values
mel = torch.FloatTensor(mel).unsqueeze(0)

# Concatenate a noise vector with the mel condition along the feature axis
z_ = torch.randn((1, mel.shape[1], 80))
G_input = torch.cat([z_, mel], 2)
G_result = G(G_input)
G_result2 = G_result

# Map generator output from [-1, 1] to integer mu-law bins
G_result = (G_result.view(-1) + 1) * 128
G_result = G_result.type(torch.LongTensor)
mu_gen = inv_mulaw_quantize(G_result.detach().numpy(), 256)
wav_name = f'waveGAN_results/{model_epoch}epo.wav'
import librosa

# Note: this linear rescale overwrites the mu-law expansion computed above
mu_gen = G_result.type(torch.FloatTensor).detach().numpy() / 128. - 1.

librosa.output.write_wav(wav_name, mu_gen, sr=22050)
save_image(mu_gen[:20000], model_epoch, 2, image_len=400)
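Note that librosa.output.write_wav was deprecated in librosa 0.7 and removed in 0.8. On newer installs the equivalent write goes through the soundfile package; a drop-in sketch:

import soundfile as sf

# Same write as above for librosa >= 0.8, where librosa.output no longer exists
sf.write(wav_name, mu_gen, 22050)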

Example #4
	def initialize(self, y, c, g, input_lengths, x=None, synthesis_length=None):
		'''Initialize wavenet graph for train, eval and test cases.
		'''
		hparams = self._hparams
		self.is_training = x is not None
		self.is_evaluating = not self.is_training and y is not None
		#Set all convolutions to corresponding mode
		self.set_mode(self.is_training)

		log('Initializing Wavenet model.  Dimensions (? = dynamic shape): ')
		log('  Train mode:                {}'.format(self.is_training))
		log('  Eval mode:                 {}'.format(self.is_evaluating))
		log('  Synthesis mode:            {}'.format(not (self.is_training or self.is_evaluating)))
		with tf.variable_scope('inference') as scope:
			#Training
			if self.is_training:
				batch_size = tf.shape(x)[0]
				#[batch_size, time_length, 1]
				self.mask = self.get_mask(input_lengths, maxlen=tf.shape(x)[-1]) #To be used in loss computation
				#[batch_size, channels, time_length]
				y_hat = self.step(x, c, g, softmax=False) #softmax is automatically computed inside softmax_cross_entropy if needed

				if is_mulaw_quantize(hparams.input_type):
					#[batch_size, time_length, channels]
					self.y_hat_q = tf.transpose(y_hat, [0, 2, 1])

				self.y_hat = y_hat
				self.y = y
				self.input_lengths = input_lengths

				#Graph extension for log saving
				#[batch_size, time_length]
				shape_control = (batch_size, tf.shape(x)[-1], 1)
				with tf.control_dependencies([tf.assert_equal(tf.shape(y), shape_control)]):
					y_log = tf.squeeze(y, [-1])
					if is_mulaw_quantize(hparams.input_type):
						self.y = y_log

				y_hat_log = tf.cond(tf.equal(tf.rank(y_hat), 4),
					lambda: tf.squeeze(y_hat, [-1]),
					lambda: y_hat)
				y_hat_log = tf.reshape(y_hat_log, [batch_size, hparams.out_channels, -1])

				if is_mulaw_quantize(hparams.input_type):
					#[batch_size, time_length]
					y_hat_log = tf.argmax(tf.nn.softmax(y_hat_log, axis=1), 1)

					y_hat_log = util.inv_mulaw_quantize(y_hat_log, hparams.quantize_channels)
					y_log = util.inv_mulaw_quantize(y_log, hparams.quantize_channels)

				else:
					#[batch_size, time_length]
					y_hat_log = sample_from_discretized_mix_logistic(
						y_hat_log, log_scale_min=hparams.log_scale_min)

					if is_mulaw(hparams.input_type):
						y_hat_log = util.inv_mulaw(y_hat_log, hparams.quantize_channels)
						y_log = util.inv_mulaw(y_log, hparams.quantize_channels)

				self.y_hat_log = y_hat_log
				self.y_log = y_log
				
				log('  inputs:                    {}'.format(x.shape))
				if self.local_conditioning_enabled():
					log('  local_condition:           {}'.format(c.shape))
				if self.has_speaker_embedding():
					log('  global_condition:          {}'.format(g.shape))
				log('  targets:                   {}'.format(y_log.shape))
				log('  outputs:                   {}'.format(y_hat_log.shape))


			#evaluating
			elif self.is_evaluating: 
				#[time_length, ]
				idx = 0
				length = input_lengths[idx]
				y_target = tf.reshape(y[idx], [-1])[:length]

				if c is not None:
					c = tf.expand_dims(c[idx, :, :length], axis=0)
					with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3)]):
						c = tf.identity(c, name='eval_assert_c_rank_op')
				if g is not None:
					g = g[idx]

				#Start silence frame
				if is_mulaw_quantize(hparams.input_type):
					initial_value = mulaw_quantize(0, hparams.quantize_channels)
				elif is_mulaw(hparams.input_type):
					initial_value = mulaw(0.0, hparams.quantize_channels)
				else:
					initial_value = 0.0

				#[channels, ]
				if is_mulaw_quantize(hparams.input_type):
					initial_input = tf.one_hot(indices=initial_value, depth=hparams.quantize_channels, dtype=tf.float32)
					initial_input = tf.reshape(initial_input, [1, 1, hparams.quantize_channels])
				else:
					initial_input = tf.ones([1, 1, 1], tf.float32) * initial_value

				#Fast eval
				y_hat = self.incremental(initial_input, c=c, g=g, time_length=length,
					softmax=True, quantize=True, log_scale_min=hparams.log_scale_min)

				#Save targets and length for eval loss computation
				if is_mulaw_quantize(hparams.input_type):
					self.y_eval = tf.reshape(y[idx], [1, -1])[:, :length]
				else:
					self.y_eval = tf.expand_dims(y[idx], axis=0)[:, :length, :]
				self.eval_length = length

				if is_mulaw_quantize(hparams.input_type):
					y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1])
					y_hat = inv_mulaw_quantize(y_hat, hparams.quantize_channels)
					y_target = inv_mulaw_quantize(y_target, hparams.quantize_channels)
				elif is_mulaw(hparams.input_type):
					y_hat = inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels)
					y_target = inv_mulaw(y_target, hparams.quantize_channels)
				else:
					y_hat = tf.reshape(y_hat, [-1])

				self.y_hat = y_hat
				self.y_target = y_target

				if self.local_conditioning_enabled():
					log('  local_condition:           {}'.format(c.shape))
				if self.has_speaker_embedding():
					log('  global_condition:          {}'.format(g.shape))
				log('  targets:                   {}'.format(y_target.shape))
				log('  outputs:                   {}'.format(y_hat.shape))

			#synthesizing
			else:
				if c is None:
					assert synthesis_length is not None
				else:
					#[batch_size, local_condition_time, local_condition_dimension(num_mels)]
					message = ('Expected 3 dimension shape [batch_size(1), time_length, {}] for local condition features but found {}'.format(
							hparams.cin_channels, c.shape))
					with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3, message=message)]):
						c = tf.identity(c, name='synthesis_assert_c_rank_op')

					Tc = tf.shape(c)[1]
					upsample_factor = audio.get_hop_size(self._hparams)

					#Overwrite length with respect to local condition features
					synthesis_length = Tc * upsample_factor

					#[batch_size, local_condition_dimension, local_condition_time]
					#time_length will be corrected using the upsample network
					c = tf.transpose(c, [0, 2, 1])

				#Start silence frame
				if is_mulaw_quantize(hparams.input_type):
					initial_value = mulaw_quantize(0, hparams.quantize_channels)
				elif is_mulaw(hparams.input_type):
					initial_value = mulaw(0.0, hparams.quantize_channels)
				else:
					initial_value = 0.0

				if is_mulaw_quantize(hparams.input_type):
					assert initial_value >= 0 and initial_value < hparams.quantize_channels
					initial_input = tf.one_hot(indices=initial_value, depth=hparams.quantize_channels, dtype=tf.float32)
					initial_input = tf.reshape(initial_input, [1, 1, hparams.quantize_channels])
				else:
					initial_input = tf.ones([1, 1, 1], tf.float32) * initial_value

				y_hat = self.incremental(initial_input, c=c, g=g, time_length=synthesis_length,
					softmax=True, quantize=True, log_scale_min=hparams.log_scale_min)

				if is_mulaw_quantize(hparams.input_type):
					y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1])
					y_hat = util.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
				elif is_mulaw(hparams.input_type):
					y_hat = util.inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels)
				else:
					y_hat = tf.reshape(y_hat, [-1])

				self.y_hat = y_hat

				if self.local_conditioning_enabled():
					log('  local_condition:            {}'.format(c.shape))
				if self.has_speaker_embedding():
					log('  global_condition:           {}'.format(g.shape))
				log('  outputs:                    {}'.format(y_hat.shape))

		self.variables = tf.trainable_variables()
		self.ema = tf.train.ExponentialMovingAverage(decay=hparams.wavenet_ema_decay)
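The initial_input built in the eval and synthesis branches above is a single "silence" sample, one-hot encoded when the input is mu-law quantized. A minimal NumPy sketch of the same construction, assuming mulaw_quantize(0, 256) lands in the middle bin:

import numpy as np

quantize_channels = 256
initial_value = 128  # mulaw_quantize(0, 256): mu-law of 0 is 0, which maps to bin 128

# One-hot [1, 1, channels] frame that seeds incremental generation
initial_input = np.zeros((1, 1, quantize_channels), dtype=np.float32)
initial_input[0, 0, initial_value] = 1.0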
Example #5
    def initialize(self,
                   y,
                   c,
                   g,
                   input_lengths,
                   x=None,
                   synthesis_length=None):
        '''Initialize wavenet graph for train, eval and test cases.'''
        hparams = self._hparams
        self.is_training = x is not None
        self.is_evaluating = not self.is_training and y is not None
        #Set all convolutions to corresponding mode
        self.set_mode(self.is_training)

        # split_device = '/cpu:0' if self._hparams.wavenet_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:0'
        # with tf.device(split_device):
        # 	hp = self._hparams
        # 	lout_int = [tf.int32] * hp.wavenet_num_gpus
        # 	lout_float = [tf.float32] * hp.wavenet_num_gpus
        #
        # 	tower_input_lengths = tf.split(input_lengths, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if input_lengths is not None else [input_lengths] * hp.wavenet_num_gpus
        #
        # 	tower_y = tf.split(y, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if y is not None else [y] * hp.wavenet_num_gpus
        # 	tower_x = tf.split(x, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if x is not None else [x] * hp.wavenet_num_gpus
        # 	tower_c = tf.split(c, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if self.local_conditioning_enabled() else [None] * hp.wavenet_num_gpus
        # 	tower_g = tf.split(g, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if self.global_conditioning_enabled() else [None] * hp.wavenet_num_gpus
        # 	tower_test_inputs = tf.split(test_inputs, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if test_inputs is not None else [test_inputs] * hp.wavenet_num_gpus
        #
        # self.tower_y_hat_q = []
        # self.tower_y_hat_train = []
        # self.tower_y = []
        # self.tower_input_lengths = []
        # self.tower_means = []
        # self.tower_log_scales = []
        # self.tower_y_hat_log = []
        # self.tower_y_log = []
        # self.tower_c = []
        # self.tower_y_eval = []
        # self.tower_eval_length = []
        # self.tower_y_hat = []
        # self.tower_y_target = []
        # self.tower_eval_c = []
        # self.tower_mask = []
        # self.tower_upsampled_local_features = []
        # self.tower_eval_upsampled_local_features = []
        # self.tower_synth_upsampled_local_features = []
        #
        log('Initializing Wavenet model.  Dimensions (? = dynamic shape): ')
        log('  Train mode:                {}'.format(self.is_training))
        log('  Eval mode:                 {}'.format(self.is_evaluating))
        log('  Synthesis mode:            {}'.format(not (
            self.is_training or self.is_evaluating)))

        #1. Declare GPU devices
        #gpus = ['/gpu:{}'.format(i) for i in range(hp.wavenet_num_gpus)]
        #for i in range(hp.wavenet_num_gpus):
        #with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device='/cpu:0', worker_device=gpus[i])):
        with tf.variable_scope('inference') as scope:
            #log('  device:                    {}'.format(i))
            #Training
            if self.is_training:
                batch_size = tf.shape(x)[0]
                #[batch_size, time_length, 1]
                self.mask = self.get_mask(
                    input_lengths,
                    maxlen=tf.shape(x)[-1])  #To be used in loss computation
                #[batch_size, channels, time_length]
                y_hat = self.step(
                    x, c, g, softmax=False
                )  #softmax is automatically computed inside softmax_cross_entropy if needed

                if is_mulaw_quantize(hparams.input_type):
                    #[batch_size, time_length, channels]
                    self.y_hat_q = tf.transpose(y_hat, [0, 2, 1])

                self.y_hat = y_hat
                self.y = y
                self.input_lengths = input_lengths

                #Add mean and scale stats if using Gaussian distribution output (logging every logistic parameter would be too much if using MoL)
                if self._hparams.out_channels == 2:
                    self.means = self.y_hat[:, 0, :]
                    self.log_scales = y_hat[:, 1, :]
                else:
                    self.means = None

                #Graph extension for log saving
                #[batch_size, time_length]
                shape_control = (batch_size, tf.shape(x)[-1], 1)
                with tf.control_dependencies(
                    [tf.assert_equal(tf.shape(y), shape_control)]):
                    y_log = tf.squeeze(y, [-1])
                    if is_mulaw_quantize(hparams.input_type):
                        self.y = y_log

                y_hat_log = tf.cond(tf.equal(tf.rank(y_hat), 4),
                                    lambda: tf.squeeze(y_hat, [-1]),
                                    lambda: y_hat)
                y_hat_log = tf.reshape(y_hat_log,
                                       [batch_size, hparams.out_channels, -1])

                if is_mulaw_quantize(hparams.input_type):
                    #[batch_size, time_length]
                    y_hat_log = tf.argmax(tf.nn.softmax(y_hat_log, axis=1), 1)

                    y_hat_log = util.inv_mulaw_quantize(
                        y_hat_log, hparams.quantize_channels)
                    y_log = util.inv_mulaw_quantize(y_log,
                                                    hparams.quantize_channels)

                else:
                    #[batch_size, time_length]
                    if hparams.out_channels == 2:
                        y_hat_log = sample_from_gaussian(
                            y_hat_log,
                            log_scale_min_gauss=hparams.log_scale_min_gauss)
                    else:
                        y_hat_log = sample_from_discretized_mix_logistic(
                            y_hat_log, log_scale_min=hparams.log_scale_min)

                    if is_mulaw(hparams.input_type):
                        y_hat_log = util.inv_mulaw(y_hat_log,
                                                   hparams.quantize_channels)
                        y_log = util.inv_mulaw(y_log,
                                               hparams.quantize_channels)

                self.y_hat_log = y_hat_log
                self.y_log = y_log
                # self.tower_c.append(tower_c[i])
                # self.tower_upsampled_local_features.append(self.upsampled_local_features)

                log('  inputs:                    {}'.format(x.shape))
                if self.local_conditioning_enabled():
                    log('  local_condition:           {}'.format(c.shape))
                if self.has_speaker_embedding():
                    log('  global_condition:          {}'.format(g.shape))
                log('  targets:                   {}'.format(y_log.shape))
                log('  outputs:                   {}'.format(y_hat_log.shape))

            #evaluating
            elif self.is_evaluating:
                #[time_length, ]
                idx = 0
                length = input_lengths[idx]
                y_target = tf.reshape(y[idx], [-1])[:length]
                #test_inputs = tf.reshape(y_target, [1, -1, 1]) if not hparams.wavenet_natural_eval else None

                if c is not None:
                    c = tf.expand_dims(c[idx, :, :length], axis=0)
                    with tf.control_dependencies(
                        [tf.assert_equal(tf.rank(c), 3)]):
                        c = tf.identity(c, name='eval_assert_c_rank_op')

                if g is not None:
                    g = tf.expand_dims(g[idx], axis=0)

                batch_size = tf.shape(c)[0]

                #Start silence frame
                if is_mulaw_quantize(hparams.input_type):
                    initial_value = mulaw_quantize(0,
                                                   hparams.quantize_channels)
                elif is_mulaw(hparams.input_type):
                    initial_value = mulaw(0.0, hparams.quantize_channels)
                else:
                    initial_value = 0.0

                #[channels, ]
                if is_mulaw_quantize(hparams.input_type):
                    initial_input = tf.one_hot(indices=initial_value,
                                               depth=hparams.quantize_channels,
                                               dtype=tf.float32)
                    initial_input = tf.tile(
                        tf.reshape(initial_input,
                                   [1, 1, hparams.quantize_channels]),
                        [batch_size, 1, 1])
                else:
                    initial_input = tf.ones([batch_size, 1, 1],
                                            tf.float32) * initial_value

                #Fast eval
                y_hat = self.incremental(
                    initial_input,
                    c=c,
                    g=g,
                    time_length=length,
                    softmax=False,
                    quantize=True,
                    log_scale_min=hparams.log_scale_min,
                    log_scale_min_gauss=hparams.log_scale_min_gauss)

                #Save targets and length for eval loss computation
                if is_mulaw_quantize(hparams.input_type):
                    self.y_eval = tf.reshape(y[idx], [1, -1])[:, :length]
                else:
                    self.y_eval = tf.expand_dims(y[idx], axis=0)[:, :length, :]
                self.eval_length = length

                if is_mulaw_quantize(hparams.input_type):
                    y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1])
                    y_hat = inv_mulaw_quantize(y_hat,
                                               hparams.quantize_channels)
                    y_target = inv_mulaw_quantize(y_target,
                                                  hparams.quantize_channels)
                elif is_mulaw(hparams.input_type):
                    y_hat = inv_mulaw(tf.reshape(y_hat, [-1]),
                                      hparams.quantize_channels)
                    y_target = inv_mulaw(y_target, hparams.quantize_channels)
                else:
                    y_hat = tf.reshape(y_hat, [-1])

                self.y_hat = y_hat
                self.y_target = y_target
                # self.tower_eval_c.append(tower_c[i][idx])
                # self.tower_eval_upsampled_local_features.append(self.upsampled_local_features[idx])

                if self.local_conditioning_enabled():
                    log('  local_condition:           {}'.format(c.shape))
                if self.has_speaker_embedding():
                    log('  global_condition:          {}'.format(g.shape))
                log('  targets:                   {}'.format(y_target.shape))
                log('  outputs:                   {}'.format(y_hat.shape))

            #synthesizing
            else:
                batch_size = tf.shape(c)[0]
                if c is None:
                    assert synthesis_length is not None
                else:
                    #[batch_size, local_condition_time, local_condition_dimension(num_mels)]
                    message = (
                        'Expected 3 dimension shape [batch_size(1), time_length, {}] for local condition features but found {}'
                        .format(hparams.cin_channels, c.shape))
                    with tf.control_dependencies(
                        [tf.assert_equal(tf.rank(c), 3, message=message)]):
                        c = tf.identity(c, name='synthesis_assert_c_rank_op')

                    Tc = tf.shape(c)[1]
                    upsample_factor = audio.get_hop_size(self._hparams)

                    #Overwrite length with respect to local condition features
                    synthesis_length = Tc * upsample_factor

                    #[batch_size, local_condition_dimension, local_condition_time]
                    #time_length will be corrected using the upsample network
                    c = tf.transpose(c, [0, 2, 1])

                if g is not None:
                    assert g.shape == (batch_size, 1)

                #Start silence frame
                if is_mulaw_quantize(hparams.input_type):
                    initial_value = mulaw_quantize(0,
                                                   hparams.quantize_channels)
                elif is_mulaw(hparams.input_type):
                    initial_value = mulaw(0.0, hparams.quantize_channels)
                else:
                    initial_value = 0.0

                if is_mulaw_quantize(hparams.input_type):
                    assert initial_value >= 0 and initial_value < hparams.quantize_channels
                    initial_input = tf.one_hot(indices=initial_value,
                                               depth=hparams.quantize_channels,
                                               dtype=tf.float32)
                    initial_input = tf.tile(
                        tf.reshape(initial_input,
                                   [1, 1, hparams.quantize_channels]),
                        [batch_size, 1, 1])
                else:
                    initial_input = tf.ones([batch_size, 1, 1],
                                            tf.float32) * initial_value

                y_hat = self.incremental(
                    initial_input,
                    c=c,
                    g=g,
                    time_length=synthesis_length,
                    softmax=False,
                    quantize=True,
                    log_scale_min=hparams.log_scale_min,
                    log_scale_min_gauss=hparams.log_scale_min_gauss)

                if is_mulaw_quantize(hparams.input_type):
                    y_hat = tf.reshape(tf.argmax(y_hat, axis=1),
                                       [batch_size, -1])
                    y_hat = util.inv_mulaw_quantize(y_hat,
                                                    hparams.quantize_channels)
                elif is_mulaw(hparams.input_type):
                    y_hat = util.inv_mulaw(tf.reshape(y_hat, [batch_size, -1]),
                                           hparams.quantize_channels)
                else:
                    y_hat = tf.reshape(y_hat, [batch_size, -1])

                self.y_hat = y_hat
                #self.tower_synth_upsampled_local_features.append(self.upsampled_local_features)

                if self.local_conditioning_enabled():
                    log('  local_condition:           {}'.format(c.shape))
                if self.has_speaker_embedding():
                    log('  global_condition:          {}'.format(g.shape))
                log('  outputs:                   {}'.format(y_hat.shape))

        self.variables = tf.trainable_variables()
        log('  Receptive Field:           ({} samples / {:.1f} ms)'.format(
            self.receptive_field,
            self.receptive_field / hparams.sample_rate * 1000.))

        #1_000_000 is causing syntax problems for some people?! Python please :)
        log('  WaveNet Parameters:        {:.3f} Million.'.format(
            np.sum([np.prod(v.get_shape().as_list())
                    for v in self.variables]) / 1000000))

        self.ema = tf.train.ExponentialMovingAverage(
            decay=hparams.wavenet_ema_decay)
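The ExponentialMovingAverage created at the end only fixes the decay; the wiring lives outside this snippet. A typical TF1 companion pattern, sketched under the assumption of an optimizer op named optimizer_step:

# Update the shadow variables after each optimizer step (sketch)
with tf.control_dependencies([optimizer_step]):
    train_op = self.ema.apply(self.variables)

# Restore the averaged weights for inference (sketch)
shadow_dict = {self.ema.average_name(v): v for v in self.variables}
saver = tf.train.Saver(shadow_dict)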