def initialize(self, inputs, inputs_tone_stress, speaker_labels, language_labels,
               input_lengths, mel_targets=None, stop_token_targets=None,
               linear_targets=None, targets_lengths=None, gta=False,
               global_step=None, is_training=False, is_evaluating=False,
               split_infos=None):
    """
    Initializes the model for inference.

    Sets "mel_outputs" and "alignments" fields.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
          steps in the input time series, and values are character IDs
        - inputs_tone_stress: int32 Tensor with shape [N, T_in] holding the tone/stress ID of
          each input step
        - speaker_labels: int32 Tensor holding the speaker id of each example
        - language_labels: int32 Tensor holding the language id of each example
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
          lengths of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
          number of steps in the output time series, M is num_mels, and values are entries in
          the mel spectrogram. Only needed for training.
    """
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError(
            'no multi targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError(
            'Mel targets are provided without corresponding token_targets')
    if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training:
        raise ValueError(
            'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
        )
    if gta and linear_targets is not None:
        raise ValueError(
            'Linear spectrogram prediction is not supported in GTA mode!')
    if is_training and self._hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError(
            'Model set to mask paddings but no targets lengths provided for the mask!'
        )
    if is_training and is_evaluating:
        raise RuntimeError(
            'Model can not be in training and evaluation modes at the same time!'
        )

    # self.inputs_printout = inputs
    # self.inputs_tone_stress_printout = inputs_tone_stress

    split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:{}'.format(
        self._hparams.tacotron_gpu_start_idx)
    with tf.device(split_device):
        hp = self._hparams
        lout_int = [tf.int32] * hp.tacotron_num_gpus
        lout_float = [tf.float32] * hp.tacotron_num_gpus

        tower_input_lengths = tf.split(
            input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
        tower_targets_lengths = tf.split(
            targets_lengths, num_or_size_splits=hp.tacotron_num_gpus,
            axis=0) if targets_lengths is not None else targets_lengths
        tower_speaker_labels = tf.split(
            speaker_labels, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
        # tower_language_labels = tf.split(language_labels, num_or_size_splits=hp.tacotron_num_gpus, axis=0)

        p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int)
        p_inputs_tone_stress = tf.py_func(
            split_func, [inputs_tone_stress, split_infos[:, 0]], lout_int)
        p_language_labels = tf.py_func(
            split_func, [language_labels, split_infos[:, 0]], lout_int)
        p_mel_targets = tf.py_func(
            split_func, [mel_targets, split_infos[:, 1]],
            lout_float) if mel_targets is not None else mel_targets
        p_stop_token_targets = tf.py_func(
            split_func, [stop_token_targets, split_infos[:, 2]], lout_float
        ) if stop_token_targets is not None else stop_token_targets
        p_linear_targets = tf.py_func(
            split_func, [linear_targets, split_infos[:, 3]],
            lout_float) if linear_targets is not None else linear_targets

        tower_inputs = []
        tower_inputs_tone_stress = []
        tower_language_labels = []
        tower_mel_targets = []
        tower_stop_token_targets = []
        tower_linear_targets = []

        batch_size = tf.shape(inputs)[0]
        mel_channels = hp.num_mels
        linear_channels = hp.num_freq
        for i in range(hp.tacotron_num_gpus):
            tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
            tower_inputs_tone_stress.append(
                tf.reshape(p_inputs_tone_stress[i], [batch_size, -1]))
            tower_language_labels.append(
                tf.reshape(p_language_labels[i], [batch_size, -1]))
            if p_mel_targets is not None:
                tower_mel_targets.append(
                    tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels]))
            if p_stop_token_targets is not None:
                tower_stop_token_targets.append(
                    tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
            if p_linear_targets is not None:
                tower_linear_targets.append(
                    tf.reshape(p_linear_targets[i], [batch_size, -1, linear_channels]))

    self.tower_decoder_output = []
    self.tower_alignments = []
    self.tower_stop_token_prediction = []
    self.tower_mel_outputs = []
    self.tower_linear_outputs = []
    self.tower_predict_speaker_labels = []
    # Separate phoneme embeddings, tone/stress embeddings, and the concatenated input embeddings
    tower_embedded_inputs_phoneme = []
    tower_embedded_inputs_tone_stress = []
    tower_embedded_inputs_concat = []
    tower_enc_conv_output_shape = []
    tower_encoder_outputs = []
    tower_residual = []
    tower_projected_residual = []

    # 1. Declare GPU Devices
    gpus = [
        "/gpu:{}".format(i)
        for i in range(hp.tacotron_gpu_start_idx,
                       hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)
    ]
    for i in range(hp.tacotron_num_gpus):
        with tf.device(
                tf.train.replica_device_setter(ps_tasks=1,
                                               ps_device="/cpu:0",
                                               worker_device=gpus[i])):
            with tf.variable_scope('inference') as scope:
                assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
                if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                    assert global_step is not None

                # GTA is only used for predicting mels to train the WaveNet vocoder,
                # so we omit post processing when doing GTA synthesis
                post_condition = hp.predict_linear and not gta

                # tf.print(tower_inputs[i])
                # tf.print(tower_inputs[i])

                # phoneme Embeddings ==> [batch_size, sequence_length, embedding_dim], 512
                self.phoneme_embedding_table = tf.get_variable(
                    'inputs_phoneme_embedding',
                    [len(symbols), hp.phoneme_embedding_dim],
                    dtype=tf.float32)
                embedded_inputs_phoneme = tf.nn.embedding_lookup(
                    self.phoneme_embedding_table, tower_inputs[i])

                # tone and stress Embeddings ==> [batch_size, sequence_length, embedding_dim], 16
                self.tone_stress_embedding_table = tf.get_variable(
                    'inputs_tone_stress_embedding',
                    [tone_stress_symbols_max_no, hp.tone_stress_embedding_dim],
                    dtype=tf.float32)
                embedded_inputs_tone_stress = tf.nn.embedding_lookup(
                    self.tone_stress_embedding_table, tower_inputs_tone_stress[i])

                # Concatenate phoneme and tone/stress embeddings, 512 + 16
                embedded_inputs_concat = tf.concat(
                    [embedded_inputs_phoneme, embedded_inputs_tone_stress], axis=-1)

                self.speaker_embedding_table = tf.get_variable(
                    'speaker_embedding', [hp.speaker_num, hp.speaker_dim],
                    dtype=tf.float32)
                embedded_speaker_label = tf.nn.embedding_lookup(
                    self.speaker_embedding_table, tower_speaker_labels[i])

                self.language_embedding_table = tf.get_variable(
                    'language_embedding', [hp.language_num, hp.language_dim],
                    dtype=tf.float32)
                embedded_language_label = tf.nn.embedding_lookup(
                    self.language_embedding_table, tower_language_labels[i])

                # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                encoder_cell = TacotronEncoderCell(
                    EncoderConvolutions(is_training, hparams=hp,
                                        scope='encoder_convolutions'),
                    EncoderRNN(is_training,
                               size=hp.encoder_lstm_units,
                               zoneout=hp.tacotron_zoneout_rate,
                               scope='encoder_LSTM'))
                encoder_outputs = encoder_cell(embedded_inputs_concat,
                                               tower_input_lengths[i])

                # For shape visualization purpose
                enc_conv_output_shape = encoder_cell.conv_output_shape

                # language concat
                # input_seq_len = tf.shape(encoder_outputs)[1]
                # _embedded_language_label = tf.expand_dims(embedded_language_label, axis=1)
                # _embedded_language_label = tf.tile(_embedded_language_label, multiples=[1, input_seq_len, 1])
                self.input_seq_len = tf.shape(encoder_outputs)[1]
                self.language_len = tf.shape(embedded_language_label)[1]
                self.language_id_print = tower_language_labels[i]
                self.tone_stress_print = tower_inputs_tone_stress[i]
                LID_encoder_outputs = tf.concat(
                    [encoder_outputs, embedded_language_label], axis=-1)

                # Adversarial speaker classifier, input: encoder_output, output: predicted speaker_label
                speaker_classify = Speaker_Classifier(
                    is_training,
                    layer_size=hp.softmax_hidden_layer,
                    speaker_size=hp.speaker_num)
                predict_speaker_labels = speaker_classify(
                    LID_encoder_outputs, hp.grad_rev_scale)

                # Variational AutoEncoder
                if is_training:
                    VAE_cell = VAECell(
                        VAEConvolutions(is_training, hparams=hp,
                                        scope='VAE_convolutions'),
                        VAERNN(is_training,
                               layers=hp.VAE_lstm_num_layers,
                               size=hp.VAE_lstm_layer_size,
                               zoneout=hp.tacotron_zoneout_rate,
                               scope='VAE_LSTM'),
                        hp.VAE_pool_size, hp.VAE_D_size)
                    residual_encoding,
self.kl_div, self.D_mean, self.D_var = VAE_cell( tower_mel_targets[i], hp.tacotron_batch_size) elif is_evaluating: residual_encoding, self.kl_div = tf.zeros( [hp.tacotron_batch_size, hp.VAE_D_size], dtype=tf.float32), 0 else: residual_encoding = tf.zeros( [hp.tacotron_synthesis_batch_size, hp.VAE_D_size], dtype=tf.float32) self.residual_encoding = residual_encoding #Decoder Parts #Attention Decoder Prenet prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet') #Attention Mechanism attention_mechanism = LocationSensitiveAttention( hp.attention_dim, LID_encoder_outputs, hparams=hp, mask_encoder=hp.mask_encoder, memory_sequence_length=tf.reshape( tower_input_lengths[i], [-1]), smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights) #Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_LSTM') #Frames Projection layer frame_projection = FrameProjection( hp.num_mels * hp.outputs_per_step, scope='linear_transform_projection') #<stop_token> projection layer stop_projection = StopProjection( is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection') #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell( prenet, attention_mechanism, decoder_lstm, embedded_speaker_label, # embedded_language_label, residual_encoding, frame_projection, stop_projection) #Define the helper for our decoder if is_training or is_evaluating or gta: self.helper = TacoTrainingHelper( batch_size, tower_mel_targets[i], hp, gta, is_evaluating, global_step) else: self.helper = TacoTestHelper(batch_size, hp) #initial decoder state decoder_init_state = decoder_cell.zero_state( batch_size=batch_size, dtype=tf.float32) #Only use max iterations at synthesis time max_iters = hp.max_iters if not ( is_training or is_evaluating) else None #Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=False, maximum_iterations=max_iters, swap_memory=hp.tacotron_swap_with_cpu) # Reshape outputs to be one output per entry #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) #Postnet postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions') #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(decoder_output) #Project residual to same dimension as mel spectrogram #==> [batch_size, decoder_steps * r, num_mels] residual_projection = FrameProjection( hp.num_mels, scope='postnet_projection') projected_residual = residual_projection(residual) #Compute the mel spectrogram mel_outputs = decoder_output + projected_residual if post_condition: # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs. 
post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size, [hp.cbhg_projection, hp.num_mels], hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers, hp.cbhg_highway_units, hp.cbhg_rnn_units, is_training, name='CBHG_postnet') #[batch_size, decoder_steps(mel_frames), cbhg_channels] post_outputs = post_cbhg(mel_outputs, None) #Linear projection of extracted features to make linear spectrogram linear_specs_projection = FrameProjection( hp.num_freq, scope='cbhg_linear_specs_projection') #[batch_size, decoder_steps(linear_frames), num_freq] linear_outputs = linear_specs_projection(post_outputs) #Grab alignments from the final decoder state alignments = tf.transpose( final_decoder_state.alignment_history.stack(), [1, 2, 0]) self.tower_decoder_output.append(decoder_output) self.tower_alignments.append(alignments) self.tower_stop_token_prediction.append( stop_token_prediction) self.tower_mel_outputs.append(mel_outputs) self.tower_predict_speaker_labels.append( predict_speaker_labels) tower_embedded_inputs_phoneme.append( embedded_inputs_phoneme) tower_embedded_inputs_tone_stress.append( embedded_inputs_tone_stress) tower_embedded_inputs_concat.append(embedded_inputs_concat) tower_enc_conv_output_shape.append(enc_conv_output_shape) tower_encoder_outputs.append(LID_encoder_outputs) tower_residual.append(residual) tower_projected_residual.append(projected_residual) if post_condition: self.tower_linear_outputs.append(linear_outputs) log('initialisation done {}'.format(gpus[i])) if is_training: self.ratio = self.helper._ratio self.tower_inputs = tower_inputs self.tower_inputs_tone_stress = tower_inputs_tone_stress self.tower_input_lengths = tower_input_lengths self.tower_mel_targets = tower_mel_targets self.tower_linear_targets = tower_linear_targets self.tower_targets_lengths = tower_targets_lengths self.tower_stop_token_targets = tower_stop_token_targets self.tower_speaker_labels = tower_speaker_labels self.tower_language_labels = tower_language_labels self.all_vars = tf.trainable_variables() log('Initialized Tacotron model. Dimensions (? = dynamic shape): ') log(' Train mode: {}'.format(is_training)) log(' Eval mode: {}'.format(is_evaluating)) log(' GTA mode: {}'.format(gta)) log(' Synthesis mode: {}'.format(not ( is_training or is_evaluating))) log(' Input: {}'.format(inputs.shape)) for i in range(hp.tacotron_num_gpus + hp.tacotron_gpu_start_idx): log(' device: {}'.format(i)) log(' phoneme embedding: {}'.format( tower_embedded_inputs_phoneme[i].shape)) log(' tone stress embedding: {}'.format( tower_embedded_inputs_tone_stress[i].shape)) log(' concat embedding: {}'.format( tower_embedded_inputs_concat[i].shape)) log(' enc conv out: {}'.format( tower_enc_conv_output_shape[i])) log(' encoder out: {}'.format( tower_encoder_outputs[i].shape)) log(' decoder out: {}'.format( self.tower_decoder_output[i].shape)) log(' residual out: {}'.format( tower_residual[i].shape)) log(' projected residual out: {}'.format( tower_projected_residual[i].shape)) log(' mel out: {}'.format( self.tower_mel_outputs[i].shape)) if post_condition: log(' linear out: {}'.format( self.tower_linear_outputs[i].shape)) log(' <stop_token> out: {}'.format( self.tower_stop_token_prediction[i].shape)) #1_000_000 is causing syntax problems for some people?! Python please :) log(' Tacotron Parameters {:.3f} Million.'.format( np.sum( [np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
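# NOTE: the tf.py_func calls above assume a `split_func` helper that is not shown in this
# snippet. A minimal sketch of what such a helper usually looks like in this family of
# Tacotron-2 forks (an assumption, not necessarily the exact original): it slices the
# time-concatenated batch back into one chunk per GPU, using the per-device lengths
# carried in `split_infos`.
def split_func(x, split_pos):
    # x: numpy array handed over by tf.py_func, shaped [batch, time] or [batch, time, channels]
    # split_pos: numpy array with one time-length per device
    rst = []
    start = 0
    for i in range(split_pos.shape[0]):
        rst.append(x[:, start:start + split_pos[i]])
        start += split_pos[i]
    return rst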
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None):
    with tf.variable_scope('inference') as scope:
        is_training = False  # Not training
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Input symbol embedding table
        embedding_table = tf.get_variable('inputs_embedding',
                                          [len(symbols), hp.embedding_dim],
                                          dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        # Encoder: convolutional layers and LSTM
        encoder_cell = TacotronEncoderCell(
            EncoderConvolutions(is_training,
                                kernel_size=(5, ),
                                channels=512,
                                scope='encoder_convolutions'),
            EncoderRNN(is_training,
                       size=hp.encoder_lstm_units,
                       zoneout=hp.tacotron_zoneout_rate,
                       scope='encoder_LSTM'))

        # Encoder output
        encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

        # Decoder
        ### Define the decoder components
        # Prenet: two 256-unit ReLU layers
        prenet = Prenet(is_training, layer_sizes=[256, 256], scope='decoder_prenet')

        # Attention network
        print("PARAMS ATTENTION", hp.attention_dim, encoder_outputs)
        attention_mechanism = LocationSensitiveAttention(hp.attention_dim,
                                                         encoder_outputs)

        # Decoder LSTM
        decoder_lstm = DecoderRNN(is_training,
                                  layers=hp.decoder_layers,
                                  size=hp.decoder_lstm_units,
                                  zoneout=hp.tacotron_zoneout_rate,
                                  scope='decoder_lstm')

        # Frame projection: project outputs to 80 mels
        frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step,
                                           scope='linear_transform')

        # Stop projection: predict the <stop_token> that marks the end of decoding
        stop_projection = StopProjection(is_training, scope='stop_token_projection')

        # Put everything together
        decoder_cell = TacotronDecoderCell(prenet,
                                           attention_mechanism,
                                           decoder_lstm,
                                           frame_projection,
                                           stop_projection,
                                           mask_finished=hp.mask_finished)

        # self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets, hp.num_mels, hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio)
        # Synthesis mode
        self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        # Put the decoder in its initial (zero) state
        decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                     dtype=tf.float32)

        max_iters = hp.max_iters if not is_training else None

        (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
            CustomDecoder(decoder_cell, self.helper, decoder_init_state),
            impute_finished=hp.impute_finished,
            maximum_iterations=max_iters)

        # Reshape outputs: one output per entry
        decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
        stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

        # Postnet: five convolutional layers
        postnet = Postnet(is_training,
                          kernel_size=hp.postnet_kernel_size,
                          channels=hp.postnet_channels,
                          scope='postnet_convolutions')

        # Postnet results
        results = postnet(decoder_output)

        # Project results to 80 mels
        results_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
        projected_results = results_projection(results)

        # Compute the mel spectrogram
        mel_outputs = decoder_output + projected_results

        # Grab alignments from the final decoder state
        alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.stop_token_prediction = stop_token_prediction
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = mel_outputs
        self.mel_targets = mel_targets
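# A hedged usage sketch for the inference-only variant above. The `Tacotron` wrapper
# class, the `hparams` object, and the placeholder names are assumptions for
# illustration; only initialize()'s signature comes from the code above.
import numpy as np
import tensorflow as tf

inputs_ph = tf.placeholder(tf.int32, shape=[None, None], name='inputs')
input_lengths_ph = tf.placeholder(tf.int32, shape=[None], name='input_lengths')

model = Tacotron(hparams)  # assumed model class exposing initialize()
model.initialize(inputs_ph, input_lengths_ph)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Feed integer symbol IDs and their lengths; fetch the predicted mel spectrogram.
    mels = sess.run(model.mel_outputs,
                    feed_dict={inputs_ph: np.array([[11, 5, 42, 7]], dtype=np.int32),
                               input_lengths_ph: np.array([4], dtype=np.int32)})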
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False, style_transfer=True, global_step=None, is_training=False, is_evaluating=False, split_infos=None): """ Initializes the model for inference sets "mel_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. """ if mel_targets is None and stop_token_targets is not None: raise ValueError( 'no multi targets were provided but token_targets were given') if mel_targets is not None and stop_token_targets is None and not gta and not style_transfer: raise ValueError( 'Mel targets are provided without corresponding token_targets') if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training: raise ValueError( 'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!' ) if gta and linear_targets is not None: raise ValueError( 'Linear spectrogram prediction is not supported in GTA mode!') if is_training and self._hparams.mask_decoder and targets_lengths is None: raise RuntimeError( 'Model set to mask paddings but no targets lengths provided for the mask!' ) if is_training and is_evaluating: raise RuntimeError( 'Model can not be in training and evaluation modes at the same time!' ) if style_transfer and (not is_training) and (not is_evaluating): assert self._hparams.tacotron_style_reference_audio is not None or \ (self._hparams.tacotron_style_alignment is not None and len(self._hparams.tacotron_style_alignment) == self._hparams.tacotron_n_style_token) split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:{}'.format( self._hparams.tacotron_gpu_start_idx) with tf.device(split_device): hp = self._hparams lout_int = [tf.int32] * hp.tacotron_num_gpus lout_float = [tf.float32] * hp.tacotron_num_gpus tower_input_lengths = tf.split( input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) tower_targets_lengths = tf.split( targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if targets_lengths is not None else targets_lengths p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int) p_mel_targets = tf.py_func( split_func, [mel_targets, split_infos[:, 1]], lout_float) if mel_targets is not None else mel_targets p_stop_token_targets = tf.py_func( split_func, [stop_token_targets, split_infos[:, 2]], lout_float ) if stop_token_targets is not None else stop_token_targets p_linear_targets = tf.py_func( split_func, [linear_targets, split_infos[:, 3]], lout_float) if linear_targets is not None else linear_targets tower_inputs = [] tower_mel_targets = [] tower_stop_token_targets = [] tower_linear_targets = [] batch_size = tf.shape(inputs)[0] mel_channels = hp.num_mels linear_channels = hp.num_freq for i in range(hp.tacotron_num_gpus): tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1])) if p_mel_targets is not None: tower_mel_targets.append( tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels])) if p_stop_token_targets is not None: 
tower_stop_token_targets.append( tf.reshape(p_stop_token_targets[i], [batch_size, -1])) if p_linear_targets is not None: tower_linear_targets.append( tf.reshape(p_linear_targets[i], [batch_size, -1, linear_channels])) self.tower_decoder_output = [] self.tower_alignments = [] self.tower_stop_token_prediction = [] self.tower_mel_outputs = [] self.tower_linear_outputs = [] tower_embedded_inputs = [] tower_enc_conv_output_shape = [] tower_encoder_outputs = [] tower_residual = [] tower_projected_residual = [] # 1. Declare GPU Devices gpus = [ "/gpu:{}".format(i) for i in range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus) ] for i in range(hp.tacotron_num_gpus): with tf.device( tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", worker_device=gpus[i])): with tf.variable_scope('inference') as scope: assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled') if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training: assert global_step is not None # GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis post_condition = hp.predict_linear and not gta # Embeddings ==> [batch_size, sequence_length, embedding_dim] self.embedding_table = tf.get_variable( 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup( self.embedding_table, tower_inputs[i]) # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i]) # For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape # style token layers self.style_embedding_table = tf.get_variable( 'style_token_embedding', [hp.tacotron_n_style_token, hp.embedding_dim], dtype=tf.float32, initializer=tf.truncated_normal_initializer( stddev=0.5)) # in order to synthese audio in random weights, style_encoder_outputs[-1]==style_embedding_table[-1] if (is_training or is_evaluating) and style_transfer: style_encoder_cell = TacotronReferenceEncoderCell( ReferenceEncoder( hp, layer_sizes=hp.tacotron_reference_layer_size, is_training=is_training, activation=tf.nn.relu), tf.nn.rnn_cell.GRUCell( num_units=hp.tacotron_reference_gru_hidden_size ), StyleTokenLayer( output_size=hp. 
tacotron_style_encoder_outputs_size, is_training=is_training), hparams=hp) # style_encoder_outputs: [batch_size,1,hp.encoder_lstm_units*2] style_encoder_outputs = style_encoder_cell( tower_mel_targets[i], input_lengths=tower_targets_lengths[i], style_token_embedding=self.style_embedding_table) # [batch_size,seq_len,hp.encoder_lstm_units*2] # encoder_outputs = encoder_outputs + style_encoder_outputs # element-wise add seq_len = tf.shape(encoder_outputs)[1] style_encoder_outputs = tf.tile( style_encoder_outputs, multiples=[1, seq_len, 1]) encoder_outputs = tf.concat( [encoder_outputs, style_encoder_outputs], axis=-1) # concat elif (not is_training) and ( not is_evaluating ) and style_transfer: # synthesis with style transfer if hp.tacotron_style_alignment is not None and \ len(hp.tacotron_style_alignment) == hp.tacotron_n_style_token: # random weights # [n_style_tokens,] style_alignment = tf.convert_to_tensor( hp.tacotron_style_alignment, dtype=tf.float32) # [n_style_tokens,1]*[n_style_tokens, embed_size] -> [n_style_tokens,embed_size] -> [embed_size,] style_context = tf.reduce_sum( tf.expand_dims(style_alignment, axis=-1) * self.style_embedding_table, axis=0) # [1,embed_size] style_context = tf.expand_dims(style_context, axis=0) # [batch_size,1,embed_size], all synthesis audio should be one style style_context = tf.tile( tf.expand_dims(style_context, axis=0), multiples=[batch_size, 1, 1]) # encoder_outputs = encoder_outputs + style_context # element-wise add seq_len = tf.shape(encoder_outputs)[1] style_context = tf.tile(style_context, multiples=[1, seq_len, 1]) # dense? # style_context = tf.layers.dense(style_context, units=hp.tacotron_style_encoder_outputs_size, # activation=tf.nn.tanh) encoder_outputs = tf.concat( [encoder_outputs, style_context], axis=-1) # concat elif len(tower_mel_targets ) > 0 and tower_mel_targets[i] is not None: # reference audio style_encoder_cell = TacotronReferenceEncoderCell( ReferenceEncoder(hp, layer_sizes=hp. tacotron_reference_layer_size, is_training=is_training, activation=tf.nn.relu), tf.nn.rnn_cell.GRUCell( num_units=hp. tacotron_reference_gru_hidden_size), StyleTokenLayer( output_size=hp. tacotron_style_encoder_outputs_size, is_training=is_training), hparams=hp) # style_encoder_outputs: [batch_size,1,hp.encoder_lstm_units*2(hp.tacotron_style_encoder_outputs_size)] style_encoder_outputs = style_encoder_cell( tower_mel_targets[i], # audio style reference audio input_lengths=tower_targets_lengths[i], style_token_embedding=self. 
style_embedding_table) # [batch_size,seq_len,hp.encoder_lstm_units*2] # encoder_outputs = encoder_outputs + style_encoder_outputs # element-wise add seq_len = tf.shape(encoder_outputs)[1] style_encoder_outputs = tf.tile( style_encoder_outputs, multiples=[1, seq_len, 1]) encoder_outputs = tf.concat( [encoder_outputs, style_encoder_outputs], axis=-1) # concat # Decoder Parts # Attention Decoder Prenet prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet') # Attention Mechanism attention_mechanism = LocationSensitiveAttention( hp.attention_dim, encoder_outputs, hparams=hp, mask_encoder=hp.mask_encoder, memory_sequence_length=tf.reshape( tower_input_lengths[i], [-1]), smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights) # Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_LSTM') # Frames Projection layer frame_projection = FrameProjection( hp.num_mels * hp.outputs_per_step, scope='linear_transform_projection') # <stop_token> projection layer stop_projection = StopProjection( is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection') # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) # only generate r frames(num_mels*r) in one decoder step, [batch_size,1,num_mels*r] # then put this decoder_cell with helper into dynamic_decode to generate [batch_size,decoder_steps,num_mels*r] decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection) # Define the helper for our decoder if is_training or is_evaluating or gta: self.helper = TacoTrainingHelper( batch_size, tower_mel_targets[i], hp, gta, is_evaluating, global_step) else: self.helper = TacoTestHelper(batch_size, hp) # initial decoder state decoder_init_state = decoder_cell.zero_state( batch_size=batch_size, dtype=tf.float32) # Only use max iterations at synthesis time max_iters = hp.max_iters if not ( is_training or is_evaluating) else None # Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=False, maximum_iterations=max_iters, swap_memory=hp.tacotron_swap_with_cpu) # Reshape outputs to be one output per entry # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) # Postnet postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions') # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(decoder_output) # Project residual to same dimension as mel spectrogram # ==> [batch_size, decoder_steps * r, num_mels] residual_projection = FrameProjection( hp.num_mels, scope='postnet_projection') projected_residual = residual_projection(residual) # Compute the mel spectrogram mel_outputs = decoder_output + projected_residual if post_condition: # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs. 
post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size, [hp.cbhg_projection, hp.num_mels], hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers, hp.cbhg_highway_units, hp.cbhg_rnn_units, is_training, name='CBHG_postnet') # [batch_size, decoder_steps(mel_frames), cbhg_channels] post_outputs = post_cbhg(mel_outputs, None) # Linear projection of extracted features to make linear spectrogram linear_specs_projection = FrameProjection( hp.num_freq, scope='cbhg_linear_specs_projection') # [batch_size, decoder_steps(linear_frames), num_freq] linear_outputs = linear_specs_projection(post_outputs) # Grab alignments from the final decoder state alignments = tf.transpose( final_decoder_state.alignment_history.stack(), [1, 2, 0]) self.tower_decoder_output.append(decoder_output) self.tower_alignments.append(alignments) self.tower_stop_token_prediction.append( stop_token_prediction) self.tower_mel_outputs.append(mel_outputs) tower_embedded_inputs.append(embedded_inputs) tower_enc_conv_output_shape.append(enc_conv_output_shape) tower_encoder_outputs.append(encoder_outputs) tower_residual.append(residual) tower_projected_residual.append(projected_residual) if post_condition: self.tower_linear_outputs.append(linear_outputs) log('initialisation done {}'.format(gpus[i])) if is_training: self.ratio = self.helper._ratio self.tower_inputs = tower_inputs self.tower_input_lengths = tower_input_lengths self.tower_mel_targets = tower_mel_targets self.tower_linear_targets = tower_linear_targets self.tower_targets_lengths = tower_targets_lengths self.tower_stop_token_targets = tower_stop_token_targets self.all_vars = tf.trainable_variables() log('Initialized Tacotron model. Dimensions (? = dynamic shape): ') log(' Train mode: {}'.format(is_training)) log(' Eval mode: {}'.format(is_evaluating)) log(' GTA mode: {}'.format(gta)) log(' Synthesis mode: {}'.format(not ( is_training or is_evaluating))) log(' Input: {}'.format(inputs.shape)) for i in range(hp.tacotron_num_gpus + hp.tacotron_gpu_start_idx): log(' device: {}'.format(i)) log(' embedding: {}'.format( tower_embedded_inputs[i].shape)) log(' enc conv out: {}'.format( tower_enc_conv_output_shape[i])) log(' encoder out: {}'.format( tower_encoder_outputs[i].shape)) log(' decoder out: {}'.format( self.tower_decoder_output[i].shape)) log(' residual out: {}'.format( tower_residual[i].shape)) log(' projected residual out: {}'.format( tower_projected_residual[i].shape)) log(' mel out: {}'.format( self.tower_mel_outputs[i].shape)) if post_condition: log(' linear out: {}'.format( self.tower_linear_outputs[i].shape)) log(' <stop_token> out: {}'.format( self.tower_stop_token_prediction[i].shape)) # 1_000_000 is causing syntax problems for some people?! Python please :) log(' Tacotron Parameters {:.3f} Million.'.format( np.sum( [np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
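# The "random weights" synthesis branch above reduces the style token table to a single
# context vector before tiling it over the encoder steps. The same arithmetic in plain
# NumPy, as a sketch with illustrative dimensions (names here are not part of the
# original code):
import numpy as np

n_style_tokens, embed_size = 10, 512
style_embedding_table = np.random.randn(n_style_tokens, embed_size).astype(np.float32)
style_alignment = np.full([n_style_tokens], 1.0 / n_style_tokens, dtype=np.float32)

# [n_style_tokens, 1] * [n_style_tokens, embed_size] -> sum over tokens -> [embed_size]
style_context = np.sum(style_alignment[:, None] * style_embedding_table, axis=0)

# Tile the single style vector over every encoder step of every utterance before concat:
batch_size, seq_len = 2, 37
tiled_style = np.broadcast_to(style_context, (batch_size, seq_len, embed_size))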
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None): with tf.variable_scope('inference') as scope: is_training = mel_targets is not None batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings ==> [batch_size, sequence_length, embedding_dim] embedding_table = tf.get_variable('inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, kernel_size=(5, ), channels=512, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) encoder_outputs = encoder_cell(embedded_inputs, input_lengths) # Define elements for decoder prenet = Prenet(is_training, layer_sizes=[256, 256], scope='decoder_prenet') # Attention Mechanism attention_mechanism = LocationSensitiveAttention( hp.attention_dim, encoder_outputs, mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights) # Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm') # Frames Projection layer frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform') # <stop_token> projection layer stop_projection = StopProjection(is_training, scope='stop_token_projection') decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection, mask_finished=hp.mask_finished) if is_training is True: self.helper = TacoTrainingHelper( batch_size, mel_targets, stop_token_targets, hp.num_mels, hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio) else: self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) max_iters = hp.max_iters if not is_training else None (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=hp.impute_finished, maximum_iterations=max_iters) # Reshape outputs to be one output per entry decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) # Postnet postnet = Postnet(is_training, kernel_size=hp.postnet_kernel_size, channels=hp.postnet_channels, scope='postnet_convolutions') # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(decoder_output) # Project residual to same dimension as mel spectrogram #==> [batch_size, decoder_steps * r, num_mels] residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection') projected_residual = residual_projection(residual) # Compute the mel spectrogram mel_outputs = decoder_output + projected_residual # Grab alignments from the final decoder state alignments = tf.transpose( final_decoder_state.alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.decoder_output = decoder_output self.alignments = alignments self.stop_token_prediction = stop_token_prediction self.stop_token_targets = stop_token_targets self.mel_outputs = mel_outputs self.mel_targets = mel_targets
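# Shape bookkeeping for the alignment transpose used above (and in the other variants):
# alignment_history.stack() is [decoder_steps, batch_size, encoder_steps], and the
# [1, 2, 0] transpose yields [batch_size, encoder_steps, decoder_steps], i.e. one
# attention image per utterance. A NumPy sketch with illustrative sizes:
import numpy as np

decoder_steps, batch_size, encoder_steps = 400, 2, 75
stacked_history = np.zeros((decoder_steps, batch_size, encoder_steps), dtype=np.float32)
alignments = np.transpose(stacked_history, (1, 2, 0))
assert alignments.shape == (batch_size, encoder_steps, decoder_steps)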
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, split_infos=None, linear_targets=None, targets_lengths=None, gta=False, global_step=None, is_training=False, is_evaluating=False): """ Initializes the model for inference sets "mel_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. """ if mel_targets is None and stop_token_targets is not None: raise ValueError( 'no multi targets were provided but token_targets were given') if mel_targets is not None and stop_token_targets is None and not gta: raise ValueError( 'Mel targets are provided without corresponding token_targets') if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training: raise ValueError( 'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!' ) if gta and linear_targets is not None: raise ValueError( 'Linear spectrogram prediction is not supported in GTA mode!') if is_training and self._hparams.mask_decoder and targets_lengths is None: raise RuntimeError( 'Model set to mask paddings but no targets lengths provided for the mask!' ) if is_training and is_evaluating: raise RuntimeError( 'Model can not be in training and evaluation modes at the same time!' ) with tf.device('/cpu:0'): hp = self._hparams lout_int = [tf.int32] * hp.num_gpus lout_float = [tf.float32] * hp.num_gpus tower_input_lengths = tf.split(input_lengths, num_or_size_splits=hp.num_gpus, axis=0) tower_targets_lengths = tf.split( targets_lengths, num_or_size_splits=hp.num_gpus, axis=0) if targets_lengths is not None else targets_lengths p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int) p_mel_targets = tf.py_func(split_func, [mel_targets, split_infos[:, 1]], lout_float) p_stop_token_targets = tf.py_func( split_func, [stop_token_targets, split_infos[:, 2]], lout_float ) if stop_token_targets is not None else stop_token_targets p_linear_targets = tf.py_func( split_func, [linear_targets, split_infos[:, 3]], lout_float) if linear_targets is not None else linear_targets tower_inputs = [] tower_mel_targets = [] tower_stop_token_targets = [] tower_linear_targets = [] batch_size = tf.shape(inputs)[0] mel_channels = hp.num_mels linear_channels = hp.num_freq for i in range(hp.num_gpus): tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1])) tower_mel_targets.append( tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels])) if p_stop_token_targets is not None: tower_stop_token_targets.append( tf.reshape(p_stop_token_targets[i], [batch_size, -1])) if p_linear_targets is not None: tower_linear_targets.append( tf.reshape(p_linear_targets[i], [batch_size, -1, linear_channels])) self.tower_decoder_output = [] self.tower_alignments = [] self.tower_stop_token_prediction = [] self.tower_mel_outputs = [] self.tower_linear_outputs = [] self.tower_linear_targets = [] tower_embedded_inputs = [] tower_enc_conv_output_shape = [] tower_encoder_outputs = [] tower_residual = [] tower_projected_residual = [] # 1. 
Declare GPU Devices gpus = [ "/gpu:{}".format(i) for i in range(hp.gpu_start_idx, hp.gpu_start_idx + hp.num_gpus) ] for i in range(hp.num_gpus): with tf.device( tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", worker_device=gpus[i])): with tf.variable_scope('inference') as scope: assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled') if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training: assert global_step is not None #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis post_condition = hp.predict_linear and not gta # Embeddings ==> [batch_size, sequence_length, embedding_dim] embedding_table = tf.get_variable( 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup( embedding_table, tower_inputs[i]) #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i]) #For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape #Decoder Parts #Attention Decoder Prenet prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet') #Attention Mechanism attention_mechanism = LocationSensitiveAttention( hp.attention_dim, encoder_outputs, hparams=hp, mask_encoder=hp.mask_encoder, memory_sequence_length=tower_input_lengths[i], smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights) #Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm') #Frames Projection layer frame_projection = FrameProjection( hp.num_mels * hp.outputs_per_step, scope='linear_transform') #<stop_token> projection layer stop_projection = StopProjection( is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection') #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection) #Define the helper for our decoder if is_training or is_evaluating or gta: self.helper = TacoTrainingHelper( batch_size, tower_mel_targets[i], tower_stop_token_targets[i], hp, gta, is_evaluating, global_step) else: self.helper = TacoTestHelper(batch_size, hp) #initial decoder state decoder_init_state = decoder_cell.zero_state( batch_size=batch_size, dtype=tf.float32) #Only use max iterations at synthesis time max_iters = hp.max_iters if not ( is_training or is_evaluating) else None #Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=False, maximum_iterations=max_iters, swap_memory=hp.tacotron_swap_with_cpu) # Reshape outputs to be one output per entry #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) #Postnet postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions') #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] 
                residual = postnet(decoder_output)

                # Project residual to same dimension as mel spectrogram
                # ==> [batch_size, decoder_steps * r, num_mels]
                residual_projection = FrameProjection(
                    hp.num_mels, scope='postnet_projection')
                projected_residual = residual_projection(residual)

                # Compute the mel spectrogram
                mel_outputs = decoder_output + projected_residual

                if post_condition:
                    # Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
                    # Post-processing network to map mels to linear spectrograms, using the same architecture as the encoder
                    post_processing_cell = TacotronEncoderCell(
                        EncoderConvolutions(is_training, hparams=hp,
                                            scope='post_processing_convolutions'),
                        EncoderRNN(is_training,
                                   size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope='post_processing_LSTM'))
                    expand_outputs = post_processing_cell(mel_outputs)
                    linear_outputs = FrameProjection(
                        hp.num_freq,
                        scope='post_processing_projection')(expand_outputs)

                # Grab alignments from the final decoder state
                alignments = tf.transpose(
                    final_decoder_state.alignment_history.stack(), [1, 2, 0])

                self.tower_decoder_output.append(decoder_output)
                self.tower_alignments.append(alignments)
                self.tower_stop_token_prediction.append(stop_token_prediction)
                self.tower_mel_outputs.append(mel_outputs)
                tower_embedded_inputs.append(embedded_inputs)
                tower_enc_conv_output_shape.append(enc_conv_output_shape)
                tower_encoder_outputs.append(encoder_outputs)
                tower_residual.append(residual)
                tower_projected_residual.append(projected_residual)

                if post_condition:
                    self.tower_linear_outputs.append(linear_outputs)
                    self.tower_linear_targets.append(linear_targets)
                log('initialisation done {}'.format(gpus[i]))

    if is_training:
        self.ratio = self.helper._ratio
    self.tower_inputs = tower_inputs
    self.tower_input_lengths = tower_input_lengths
    self.tower_mel_targets = tower_mel_targets
    self.tower_targets_lengths = tower_targets_lengths
    self.tower_stop_token_targets = tower_stop_token_targets

    log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
    log('  Train mode: {}'.format(is_training))
    log('  Eval mode: {}'.format(is_evaluating))
    log('  GTA mode: {}'.format(gta))
    log('  Synthesis mode: {}'.format(not (is_training or is_evaluating)))
    log('  Input: {}'.format(inputs.shape))
    for i in range(hp.num_gpus + hp.gpu_start_idx):
        log('  device: {}'.format(i))
        log('  embedding: {}'.format(tower_embedded_inputs[i].shape))
        log('  enc conv out: {}'.format(tower_enc_conv_output_shape[i]))
        log('  encoder out: {}'.format(tower_encoder_outputs[i].shape))
        log('  decoder out: {}'.format(self.tower_decoder_output[i].shape))
        log('  residual out: {}'.format(tower_residual[i].shape))
        log('  projected residual out: {}'.format(tower_projected_residual[i].shape))
        log('  mel out: {}'.format(self.tower_mel_outputs[i].shape))
        if post_condition:
            log('  linear out: {}'.format(self.tower_linear_outputs[i].shape))
        log('  <stop_token> out: {}'.format(self.tower_stop_token_prediction[i].shape))
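# The logging block above stops before the trainable-parameter count that the other
# variants print. A small self-contained sketch of that computation (same formula the
# sibling implementations use; assumes the graph has already been built):
import numpy as np
import tensorflow as tf

def count_parameters_millions():
    all_vars = tf.trainable_variables()
    return np.sum([np.prod(v.get_shape().as_list()) for v in all_vars]) / 1000000

# e.g. log('  Tacotron Parameters {:.3f} Million.'.format(count_parameters_millions()))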
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False, global_step=None, is_training=False, is_evaluating=False): hp = self._hparams batch_size = tf.shape(inputs)[0] gta = False T2_output_range = (-hp.max_abs_value, hp.max_abs_value) if hp.symmetric_mels else ( 0, hp.max_abs_value) with tf.variable_scope('inference') as scope: assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled') if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training: assert global_step is not None # Embeddings ==> [batch_size, sequence_length, embedding_dim] self.embedding_table = tf.get_variable( 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, inputs) #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) self.encoder_outputs = encoder_cell(embedded_inputs, input_lengths) #For shape visualization purpose self.enc_conv_output_shape = encoder_cell.conv_output_shape #Decoder Parts #Attention Decoder Prenet prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet') #Attention Mechanism attention_mechanism = GMMAttention(self.encoder_outputs, input_lengths, is_training) #Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_LSTM') #Frames Projection layer frame_projection = FrameProjection( hp.num_mels * hp.outputs_per_step, scope='linear_transform_projection') #<stop_token> projection layer stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection') #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection, num_attn_mixture=5) #Define the helper for our decoder if is_training or is_evaluating or gta: self.helper = TacoTrainingHelper(batch_size, mel_targets, hp, gta, is_evaluating, global_step) else: self.helper = TacoTestHelper(batch_size, hp) #initial decoder state decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) #Only use max iterations at synthesis time max_iters = hp.max_iters if not (is_training or is_evaluating) else None #Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=False, maximum_iterations=max_iters, swap_memory=hp.tacotron_swap_with_cpu) # Reshape outputs to be one output per entry #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] self.decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) self.stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) if hp.clip_outputs: self.decoder_output = tf.minimum( tf.maximum(self.decoder_output, T2_output_range[0] - hp.lower_bound_decay), T2_output_range[1]) #Postnet postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions') #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(self.decoder_output) #Project 
residual to same dimension as mel spectrogram #==> [batch_size, decoder_steps * r, num_mels] residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection') self.projected_residual = residual_projection(residual) #Compute the mel spectrogram self.mel_outputs = self.decoder_output + self.projected_residual if hp.clip_outputs: self.mel_outputs = tf.minimum( tf.maximum(self.mel_outputs, T2_output_range[0] - hp.lower_bound_decay), T2_output_range[1]) # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs. post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size, [hp.cbhg_projection, hp.num_mels], hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers, hp.cbhg_highway_units, hp.cbhg_rnn_units, hp.batch_norm_position, is_training, name='CBHG_postnet') #[batch_size, decoder_steps(mel_frames), cbhg_channels] self.post_outputs = post_cbhg(self.mel_outputs, None) #Linear projection of extracted features to make linear spectrogram linear_specs_projection = FrameProjection( hp.num_freq, scope='cbhg_linear_specs_projection') #[batch_size, decoder_steps(linear_frames), num_freq] self.linear_outputs = linear_specs_projection(self.post_outputs) if hp.clip_outputs: self.linear_outputs = tf.minimum( tf.maximum(self.linear_outputs, T2_output_range[0] - hp.lower_bound_decay), T2_output_range[1]) #Grab alignments from the final decoder state self.alignments = tf.transpose( final_decoder_state.alignment_history.stack(), [1, 2, 0]) log('initialisation done.') if is_training: self.ratio = self.helper._ratio self.inputs = inputs self.input_lengths = input_lengths self.mel_targets = mel_targets self.linear_targets = linear_targets self.targets_lengths = targets_lengths self.stop_token_targets = stop_token_targets self.gta = gta self.all_vars = tf.trainable_variables() self.is_training = is_training self.is_evaluating = is_evaluating log('Initialized Tacotron model. Dimensions (? = dynamic shape): ') log(' Train mode: {}'.format(is_training)) log(' Eval mode: {}'.format(is_evaluating)) log(' GTA mode: {}'.format(gta)) log(' Synthesis mode: {}'.format(not ( is_training or is_evaluating))) log(' Input: {}'.format(inputs.shape)) log(' embedding: {}'.format(embedded_inputs.shape)) log(' enc conv out: {}'.format( self.enc_conv_output_shape)) log(' encoder out: {}'.format( self.encoder_outputs.shape)) log(' decoder out: {}'.format(self.decoder_output.shape)) log(' residual out: {}'.format(residual.shape)) log(' projected residual out: {}'.format( self.projected_residual.shape)) log(' mel out: {}'.format(self.mel_outputs.shape)) log(' linear out: {}'.format(self.linear_outputs.shape)) log(' <stop_token> out: {}'.format( self.stop_token_prediction.shape)) #1_000_000 is causing syntax problems for some people?! Python please :) log(' Tacotron Parameters {:.3f} Million.'.format( np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
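# The clip_outputs branches above keep decoder, mel, and linear outputs inside the
# spectrogram normalisation range. The same bookkeeping in NumPy, as a sketch with
# illustrative hparam values (max_abs_value, symmetric_mels, lower_bound_decay mirror
# the hparams referenced above):
import numpy as np

max_abs_value, symmetric_mels, lower_bound_decay = 4.0, True, 0.1
T2_output_range = (-max_abs_value, max_abs_value) if symmetric_mels else (0.0, max_abs_value)

def clip_outputs(x):
    # Equivalent to tf.minimum(tf.maximum(x, lower - decay), upper) used in the graph code.
    return np.clip(x, T2_output_range[0] - lower_bound_decay, T2_output_range[1])

print(clip_outputs(np.array([-10.0, 0.0, 10.0])))  # -> [-4.1  0.   4. ]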
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, gta=False): """ Initializes the model for inference sets "mel_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. """ if mel_targets is None and stop_token_targets is not None: raise ValueError( 'no mel targets were provided but token_targets were given') if mel_targets is not None and stop_token_targets is None and not gta: raise ValueError( 'Mel targets are provided without corresponding token_targets') with tf.variable_scope('inference') as scope: is_training = mel_targets is not None and not gta batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings ==> [batch_size, sequence_length, embedding_dim] embedding_table = tf.get_variable('inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, kernel_size=hp.enc_conv_kernel_size, channels=hp.enc_conv_channels, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) encoder_outputs = encoder_cell(embedded_inputs, input_lengths) #For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape #Decoder Parts #Attention Decoder Prenet prenet = Prenet(is_training, layer_sizes=hp.prenet_layers, scope='decoder_prenet') #Attention Mechanism attention_mechanism = LocationSensitiveAttention( hp.attention_dim, encoder_outputs, mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing) #Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm') #Frames Projection layer frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform') #<stop_token> projection layer stop_projection = StopProjection(is_training, scope='stop_token_projection') #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection, mask_finished=hp.mask_finished) #Define the helper for our decoder if (is_training or gta) == True: self.helper = TacoTrainingHelper( batch_size, mel_targets, stop_token_targets, hp.num_mels, hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio, gta) else: self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) #initial decoder state decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) #Only use max iterations at synthesis time max_iters = hp.max_iters if not is_training else None #Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=hp.impute_finished, maximum_iterations=max_iters) # Reshape outputs to be one output 
per entry #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) #Postnet postnet = Postnet(is_training, kernel_size=hp.postnet_kernel_size, channels=hp.postnet_channels, scope='postnet_convolutions') #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(decoder_output) #Project residual to same dimension as mel spectrogram #==> [batch_size, decoder_steps * r, num_mels] residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection') projected_residual = residual_projection(residual) #Compute the mel spectrogram mel_outputs = decoder_output + projected_residual #Grab alignments from the final decoder state alignments = tf.transpose( final_decoder_state.alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.decoder_output = decoder_output self.alignments = alignments self.stop_token_prediction = stop_token_prediction self.stop_token_targets = stop_token_targets self.mel_outputs = mel_outputs self.mel_targets = mel_targets log('Initialized Tacotron model. Dimensions: ') log(' embedding: {}'.format(embedded_inputs.shape)) log(' enc conv out: {}'.format(enc_conv_output_shape)) log(' encoder out: {}'.format(encoder_outputs.shape)) log(' decoder out: {}'.format(decoder_output.shape)) log(' residual out: {}'.format(residual.shape)) log(' projected residual out: {}'.format( projected_residual.shape)) log(' mel out: {}'.format(mel_outputs.shape)) log(' <stop_token> out: {}'.format( stop_token_prediction.shape))
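# A hedged sketch of building the variant above in GTA (ground-truth-aligned) mode:
# mel and stop-token targets are fed so TacoTrainingHelper teacher-forces the decoder,
# and mel_outputs then line up frame-for-frame with the targets (useful for generating
# vocoder training data). The `Tacotron` wrapper, `hparams`, and placeholder names are
# assumptions for illustration.
import tensorflow as tf

inputs_ph = tf.placeholder(tf.int32, [None, None], name='inputs')
input_lengths_ph = tf.placeholder(tf.int32, [None], name='input_lengths')
mel_targets_ph = tf.placeholder(tf.float32, [None, None, hparams.num_mels], name='mel_targets')
stop_token_targets_ph = tf.placeholder(tf.float32, [None, None], name='stop_token_targets')

model = Tacotron(hparams)  # assumed model class exposing initialize()
model.initialize(inputs_ph, input_lengths_ph,
                 mel_targets=mel_targets_ph,
                 stop_token_targets=stop_token_targets_ph,
                 gta=True)
# model.mel_outputs / model.alignments can now be evaluated under a restored checkpoint.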
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False, global_step=None, is_training=False, is_evaluating=False): """ Initializes the model for inference sets "mel_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. """ if mel_targets is None and stop_token_targets is not None: raise ValueError('no mel targets were provided but token_targets were given') if mel_targets is not None and stop_token_targets is None and not gta: raise ValueError('Mel targets are provided without corresponding token_targets') if not gta and self._hparams.predict_linear==True and linear_targets is None and is_training: raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!') if gta and linear_targets is not None: raise ValueError('Linear spectrogram prediction is not supported in GTA mode!') if is_training and self._hparams.mask_decoder and targets_lengths is None: raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!') if is_training and is_evaluating: raise RuntimeError('Model can not be in training and evaluation modes at the same time!') with tf.variable_scope('inference') as scope: batch_size = tf.shape(inputs)[0] hp = self._hparams assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled') if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training: assert global_step is not None #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis post_condition = hp.predict_linear and not gta # Embeddings ==> [batch_size, sequence_length, embedding_dim] embedding_table = tf.get_variable( 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) encoder_outputs = encoder_cell(embedded_inputs, input_lengths) #For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape #Decoder Parts #Attention Decoder Prenet prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet') #Attention Mechanism attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp, mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights) #Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm') #Frames Projection layer frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform') #<stop_token> projection layer stop_projection 
= StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection') #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell( prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection) #Define the helper for our decoder if is_training or is_evaluating or gta: self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets, hp, gta, is_evaluating, global_step) else: self.helper = TacoTestHelper(batch_size, hp) #initial decoder state decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) #Only use max iterations at synthesis time max_iters = hp.max_iters if not (is_training or is_evaluating) else None #Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=False, maximum_iterations=max_iters, swap_memory=hp.tacotron_swap_with_cpu) # Reshape outputs to be one output per entry #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) #Postnet postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions') #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(decoder_output) #Project residual to same dimension as mel spectrogram #==> [batch_size, decoder_steps * r, num_mels] residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection') projected_residual = residual_projection(residual) #Compute the mel spectrogram mel_outputs = decoder_output + projected_residual if post_condition: #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder post_processing_cell = TacotronEncoderCell( EncoderConvolutions(is_training, hparams=hp, scope='post_processing_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='post_processing_LSTM')) expand_outputs = post_processing_cell(mel_outputs) linear_outputs = FrameProjection(hp.num_freq, scope='post_processing_projection')(expand_outputs) #Grab alignments from the final decoder state alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0]) if is_training: self.ratio = self.helper._ratio self.inputs = inputs self.input_lengths = input_lengths self.decoder_output = decoder_output self.alignments = alignments self.stop_token_prediction = stop_token_prediction self.stop_token_targets = stop_token_targets self.mel_outputs = mel_outputs if post_condition: self.linear_outputs = linear_outputs self.linear_targets = linear_targets self.mel_targets = mel_targets self.targets_lengths = targets_lengths log('Initialized Tacotron model. Dimensions (? 
= dynamic shape): ')
        log(' Train mode: {}'.format(is_training))
        log(' Eval mode: {}'.format(is_evaluating))
        log(' GTA mode: {}'.format(gta))
        log(' Synthesis mode: {}'.format(not (is_training or is_evaluating)))
        log(' embedding: {}'.format(embedded_inputs.shape))
        log(' enc conv out: {}'.format(enc_conv_output_shape))
        log(' encoder out: {}'.format(encoder_outputs.shape))
        log(' decoder out: {}'.format(decoder_output.shape))
        log(' residual out: {}'.format(residual.shape))
        log(' projected residual out: {}'.format(projected_residual.shape))
        log(' mel out: {}'.format(mel_outputs.shape))
        if post_condition:
            log(' linear out: {}'.format(linear_outputs.shape))
        log(' <stop_token> out: {}'.format(stop_token_prediction.shape))
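
# The 'scheduled' teacher-forcing mode in the variant above only asserts that a
# global_step is available; the decay curve itself lives inside the training helper
# and is not shown in this section. A minimal sketch of one plausible schedule
# (the names tfr_init/tfr_final/tfr_decay_start/tfr_decay_steps are illustrative
# assumptions, not this repository's hyper-parameters): hold the ratio at its
# initial value for a warm-up period, then cosine-decay it toward a floor.
import tensorflow as tf

def scheduled_teacher_forcing_ratio(global_step, tfr_init=1.0, tfr_final=0.0,
                                    tfr_decay_start=10000, tfr_decay_steps=40000):
    # Cosine decay from tfr_init down to tfr_final once decay starts.
    decayed = tf.train.cosine_decay(
        tfr_init,
        tf.maximum(global_step - tfr_decay_start, 0),
        tfr_decay_steps,
        alpha=tfr_final / tfr_init if tfr_init > 0 else 0.0)
    # Before tfr_decay_start the ratio stays at its initial value.
    return tf.where(global_step < tfr_decay_start, tf.constant(tfr_init), decayed)
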
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, targets_lengths=None, global_step=None, is_training=False, split_infos=None): """ Initializes the model for inference sets "mel_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. """ if mel_targets is None and stop_token_targets is not None: raise ValueError( 'no multi targets were provided but token_targets were given') if mel_targets is not None and stop_token_targets is None: raise ValueError( 'Mel targets are provided without corresponding token_targets') if is_training and self._hparams.mask_decoder and targets_lengths is None: raise RuntimeError( 'Model set to mask paddings but no targets lengths provided for the mask!' ) split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:0' with tf.device(split_device): hp = self._hparams lout_int = [tf.int32] * hp.tacotron_num_gpus lout_float = [tf.float32] * hp.tacotron_num_gpus tower_input_lengths = tf.split( input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) tower_targets_lengths = tf.split( targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if targets_lengths is not None else targets_lengths p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int) p_mel_targets = tf.py_func( split_func, [mel_targets, split_infos[:, 1]], lout_float) if mel_targets is not None else mel_targets p_stop_token_targets = tf.py_func( split_func, [stop_token_targets, split_infos[:, 2]], lout_float ) if stop_token_targets is not None else stop_token_targets tower_inputs = [] tower_mel_targets = [] tower_stop_token_targets = [] batch_size = tf.shape(inputs)[0] mel_channels = hp.num_mels for i in range(hp.tacotron_num_gpus): tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1])) if p_mel_targets is not None: tower_mel_targets.append( tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels])) if p_stop_token_targets is not None: tower_stop_token_targets.append( tf.reshape(p_stop_token_targets[i], [batch_size, -1])) T2_output_range = (-hp.max_abs_value, hp.max_abs_value) if hp.symmetric_mels else ( 0, hp.max_abs_value) self.tower_decoder_output = [] self.tower_alignments = [] self.tower_stop_token_prediction = [] self.tower_mel_outputs = [] tower_embedded_inputs = [] tower_enc_conv_output_shape = [] tower_encoder_outputs = [] tower_residual = [] tower_projected_residual = [] # 1. 
Declare GPU Devices gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_num_gpus)] for i in range(hp.tacotron_num_gpus): with tf.device( tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", worker_device=gpus[i])): with tf.variable_scope('inference') as scope: assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled') if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training: assert global_step is not None # Embeddings ==> [batch_size, sequence_length, embedding_dim] self.embedding_table = tf.get_variable( 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup( self.embedding_table, tower_inputs[i]) #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i]) #For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape #Decoder Parts #Attention Decoder Prenet prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet') #Attention Mechanism attention_mechanism = LocationSensitiveAttention( hp.attention_dim, encoder_outputs, hparams=hp, is_training=is_training, mask_encoder=hp.mask_encoder, memory_sequence_length=tf.reshape( tower_input_lengths[i], [-1]), smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights) #Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_LSTM') #Frames Projection layer frame_projection = FrameProjection( hp.num_mels * hp.outputs_per_step, scope='linear_transform_projection') #<stop_token> projection layer stop_projection = StopProjection( is_training, shape=hp.outputs_per_step, scope='stop_token_projection') #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection) #Define the helper for our decoder if is_training: self.helper = TacoTrainingHelper( batch_size, tower_mel_targets[i], hp, global_step) else: self.helper = TacoTestHelper(batch_size, hp) #initial decoder state decoder_init_state = decoder_cell.zero_state( batch_size=batch_size, dtype=tf.float32) #Only use max iterations at synthesis time max_iters = hp.max_iters if not (is_training) else None #Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=False, maximum_iterations=max_iters, swap_memory=hp.tacotron_swap_with_cpu) # Reshape outputs to be one output per entry #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) if hp.clip_outputs: decoder_output = tf.minimum( tf.maximum( decoder_output, T2_output_range[0] - hp.lower_bound_decay), T2_output_range[1]) #Postnet postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions') #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(decoder_output) #Project residual to same dimension as mel 
spectrogram #==> [batch_size, decoder_steps * r, num_mels] residual_projection = FrameProjection( hp.num_mels, scope='postnet_projection') projected_residual = residual_projection(residual) #Compute the mel spectrogram mel_outputs = decoder_output + projected_residual if hp.clip_outputs: mel_outputs = tf.minimum( tf.maximum( mel_outputs, T2_output_range[0] - hp.lower_bound_decay), T2_output_range[1]) #Grab alignments from the final decoder state alignments = tf.transpose( final_decoder_state.alignment_history.stack(), [1, 2, 0]) self.tower_decoder_output.append(decoder_output) self.tower_alignments.append(alignments) self.tower_stop_token_prediction.append( stop_token_prediction) self.tower_mel_outputs.append(mel_outputs) tower_embedded_inputs.append(embedded_inputs) tower_enc_conv_output_shape.append(enc_conv_output_shape) tower_encoder_outputs.append(encoder_outputs) tower_residual.append(residual) tower_projected_residual.append(projected_residual) log('initialisation done {}'.format(gpus[i])) if is_training: self.ratio = self.helper._ratio self.tower_inputs = tower_inputs self.tower_input_lengths = tower_input_lengths self.tower_mel_targets = tower_mel_targets self.tower_targets_lengths = tower_targets_lengths self.tower_stop_token_targets = tower_stop_token_targets self.all_vars = tf.trainable_variables() log('Initialized Tacotron model. Dimensions (? = dynamic shape): ') log(' Train mode: {}'.format(is_training)) log(' Input: {}'.format(inputs.shape)) for i in range(hp.tacotron_num_gpus): log(' device: {}'.format(i)) log(' embedding: {}'.format( tower_embedded_inputs[i].shape)) log(' enc conv out: {}'.format( tower_enc_conv_output_shape[i])) log(' encoder out: {}'.format( tower_encoder_outputs[i].shape)) log(' decoder out: {}'.format( self.tower_decoder_output[i].shape)) log(' residual out: {}'.format( tower_residual[i].shape)) log(' projected residual out: {}'.format( tower_projected_residual[i].shape)) log(' mel out: {}'.format( self.tower_mel_outputs[i].shape)) log(' <stop_token> out: {}'.format( self.tower_stop_token_prediction[i].shape)) #1_000_000 is causing syntax problems for some people?! Python please :) log(' Tacotron Parameters {:.3f} Million.'.format( np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
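
# The multi-GPU variants call split_func through tf.py_func with (tensor, split_infos[:, k]),
# but its body is not shown in this section. Consistent with how each tower output is
# reshaped back to [batch_size, -1, ...], one plausible implementation (an assumption,
# not the repository's verbatim code) slices the padded time axis at the per-tower
# lengths recorded in split_infos:
import numpy as np

def split_func(x, split_lengths):
    # x: [batch_size, sum(split_lengths), ...]; split_lengths: one max length per GPU.
    chunks, start = [], 0
    for length in split_lengths:
        chunks.append(x[:, start:start + length])
        start += length
    return chunks

# Example: two towers whose sub-batches were padded to 5 and 3 time steps respectively.
padded = np.zeros((4, 8), dtype=np.int32)
towers = split_func(padded, np.array([5, 3]))
assert [t.shape for t in towers] == [(4, 5), (4, 3)]
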
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False, global_step=None, is_training=False, is_evaluating=False, split_infos=None, mel_references=None, speaker_id_target=None, nb_speaker=None, nb_sample=None, synthesize=False): """ Initializes the model for inference sets "mel_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs ### With tacotron_phoneme_transcription=True : ### inputs: int32 Tensor with shape [N, T_in, T_cat] where N is batch size, T_in is number of ### steps in the input time series, T_cat is the number of categories used to describe phones/special ### characters and values are phones/special character encoding - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. """ if mel_targets is None and stop_token_targets is not None: raise ValueError( 'no multi targets were provided but token_targets were given') if mel_targets is not None and stop_token_targets is None and not gta: raise ValueError( 'Mel targets are provided without corresponding token_targets') if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training: raise ValueError( 'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!' ) if gta and linear_targets is not None: raise ValueError( 'Linear spectrogram prediction is not supported in GTA mode!') if is_training and self._hparams.mask_decoder and targets_lengths is None: raise RuntimeError( 'Model set to mask paddings but no targets lengths provided for the mask!' ) if is_training and is_evaluating: raise RuntimeError( 'Model can not be in training and evaluation modes at the same time!' 
) split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:{}'.format( self._hparams.tacotron_gpu_start_idx) with tf.device(split_device): hp = self._hparams lout_int = [tf.int32] * hp.tacotron_num_gpus lout_float = [tf.float32] * hp.tacotron_num_gpus tower_input_lengths = tf.split( input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) tower_targets_lengths = tf.split( targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if targets_lengths is not None else targets_lengths p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int) p_mel_targets = tf.py_func( split_func, [mel_targets, split_infos[:, 1]], lout_float) if mel_targets is not None else mel_targets p_stop_token_targets = tf.py_func( split_func, [stop_token_targets, split_infos[:, 2]], lout_float ) if stop_token_targets is not None else stop_token_targets p_linear_targets = tf.py_func( split_func, [linear_targets, split_infos[:, 3]], lout_float) if linear_targets is not None else linear_targets if self._hparams.tacotron_multi_speaker: tower_speaker_id_target = tf.split( speaker_id_target, num_or_size_splits=hp.tacotron_num_gpus, axis=0 ) if speaker_id_target is not None else speaker_id_target tower_inputs = [] tower_mel_targets = [] tower_stop_token_targets = [] tower_linear_targets = [] batch_size = tf.shape(inputs)[0] mel_channels = hp.num_mels linear_channels = hp.num_freq for i in range(hp.tacotron_num_gpus): if not self._hparams.tacotron_phoneme_transcription: tower_inputs.append( tf.reshape(p_inputs[i], [batch_size, -1])) else: tower_inputs.append( tf.reshape(p_inputs[i], [batch_size, -1, nb_categories])) if p_mel_targets is not None: tower_mel_targets.append( tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels])) if p_stop_token_targets is not None: tower_stop_token_targets.append( tf.reshape(p_stop_token_targets[i], [batch_size, -1])) if p_linear_targets is not None: tower_linear_targets.append( tf.reshape(p_linear_targets[i], [batch_size, -1, linear_channels])) self.tower_decoder_output = [] self.tower_alignments = [] self.tower_stop_token_prediction = [] self.tower_mel_outputs = [] self.tower_linear_outputs = [] tower_embedded_inputs = [] tower_enc_conv_output_shape = [] tower_encoder_outputs = [] tower_residual = [] tower_projected_residual = [] if self._hparams.tacotron_multi_speaker: tower_speaker_id_pred = [] # 1. 
Declare GPU Devices gpus = [ "/gpu:{}".format(i) for i in range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus) ] for i in range(hp.tacotron_num_gpus): with tf.device( tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", worker_device=gpus[i])): with tf.variable_scope('inference') as scope: assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled') if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training: assert global_step is not None # GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis post_condition = hp.predict_linear and not gta if not self._hparams.tacotron_phoneme_transcription: # Embeddings ==> [batch_size, sequence_length, embedding_dim] self.embedding_table = tf.get_variable( 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup( self.embedding_table, tower_inputs[i]) else: # Embeddings ==> [batch_size, sequence_length, embedding_dim] # print("inputs : {}".format(inputs)) # print("p_inputs : {}".format(p_inputs)) # print("tower_inputs : {}".format(tower_inputs)) embedded_inputs = tf.layers.dense( inputs=tf.cast(tower_inputs[i], dtype=tf.float32), units=hp.embedding_dim) # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i]) # For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape enc_tensor_shape = tf.shape(encoder_outputs) # Speaker encoder if self._hparams.tacotron_multi_speaker: with tf.variable_scope('speaker_encoder'): conv = tf.layers.conv1d(inputs=mel_references, filters=512, kernel_size=(3, ), activation=None, padding='same') conv = tf.layers.conv1d(inputs=conv, filters=512, kernel_size=(3, ), activation=None, padding='same') with tf.variable_scope('biLSTM1'): fw_LSTM = tf.nn.rnn_cell.BasicLSTMCell( num_units=256) bw_LSTM = tf.nn.rnn_cell.BasicLSTMCell( num_units=256) outputs_biLSTM, states_biLSTM = tf.nn.bidirectional_dynamic_rnn( cell_fw=fw_LSTM, cell_bw=bw_LSTM, inputs=conv, dtype=tf.float32) outputs_biLSTM = tf.concat(outputs_biLSTM, axis=2) with tf.variable_scope('biLSTM2'): fw_LSTM = tf.nn.rnn_cell.BasicLSTMCell( num_units=256) bw_LSTM = tf.nn.rnn_cell.BasicLSTMCell( num_units=256) outputs_biLSTM, states_biLSTM = tf.nn.bidirectional_dynamic_rnn( cell_fw=fw_LSTM, cell_bw=bw_LSTM, inputs=outputs_biLSTM, dtype=tf.float32) outputs_biLSTM = tf.concat(outputs_biLSTM, axis=2) mean_over_time = tf.reduce_mean( input_tensor=outputs_biLSTM, axis=1) # shape (batch_size, speaker_emb_dim) speaker_embedding = tf.layers.dense( inputs=mean_over_time, units=self._hparams. 
tacotron_speaker_embedding_dim, name="speaker_embedding") repeat = tf.tile(speaker_embedding, [ 1, enc_tensor_shape[1] ]) # shape (batch_size, speaker_emb_dim*text_len) repeat = tf.reshape( repeat, (enc_tensor_shape[0], enc_tensor_shape[1], self._hparams.tacotron_speaker_embedding_dim) ) # shape (batch_size, text_len, speaker_emb_dim) # self.embedding_speaker = tf.get_variable('speaker_embedding', [nb_sample, self._hparams.tacotron_speaker_embedding_dim], dtype=tf.float32) self.embedding_speaker = speaker_embedding attention_inputs = tf.concat([encoder_outputs, repeat], axis=-1) if not synthesize: with tf.variable_scope('speaker_classifier'): speaker_prediction = tf.layers.dense( inputs=speaker_embedding, units=256) speaker_prediction = tf.layers.dense( inputs=speaker_prediction, units=nb_speaker ) # Softmax inside the loss else: attention_inputs = encoder_outputs # Decoder Parts # Attention Decoder Prenet prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet') # Attention Mechanism attention_mechanism = LocationSensitiveAttention( hp.attention_dim, attention_inputs, hparams=hp, mask_encoder=hp.mask_encoder, memory_sequence_length=tf.reshape( tower_input_lengths[i], [-1]), smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights) # Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_LSTM') # Frames Projection layer frame_projection = FrameProjection( hp.num_mels * hp.outputs_per_step, scope='linear_transform_projection') # <stop_token> projection layer stop_projection = StopProjection( is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection') # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection) # Define the helper for our decoder if is_training or is_evaluating or gta: self.helper = TacoTrainingHelper( batch_size, tower_mel_targets[i], hp, gta, is_evaluating, global_step) else: self.helper = TacoTestHelper(batch_size, hp) # initial decoder state decoder_init_state = decoder_cell.zero_state( batch_size=batch_size, dtype=tf.float32) # Only use max iterations at synthesis time max_iters = hp.max_iters if not ( is_training or is_evaluating) else None # Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=False, maximum_iterations=max_iters, swap_memory=hp.tacotron_swap_with_cpu) # Reshape outputs to be one output per entry # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) # Postnet postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions') # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(decoder_output) # Project residual to same dimension as mel spectrogram # ==> [batch_size, decoder_steps * r, num_mels] residual_projection = FrameProjection( hp.num_mels, scope='postnet_projection') projected_residual = residual_projection(residual) # Compute the mel spectrogram mel_outputs = decoder_output + projected_residual if post_condition: # Add post-processing CBHG. 
This does a great job at extracting features from mels before projection to Linear specs. post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size, [hp.cbhg_projection, hp.num_mels], hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers, hp.cbhg_highway_units, hp.cbhg_rnn_units, is_training, name='CBHG_postnet') # [batch_size, decoder_steps(mel_frames), cbhg_channels] post_outputs = post_cbhg(mel_outputs, None) # Linear projection of extracted features to make linear spectrogram linear_specs_projection = FrameProjection( hp.num_freq, scope='cbhg_linear_specs_projection') # [batch_size, decoder_steps(linear_frames), num_freq] linear_outputs = linear_specs_projection(post_outputs) # Grab alignments from the final decoder state alignments = tf.transpose( final_decoder_state.alignment_history.stack(), [1, 2, 0]) self.tower_decoder_output.append(decoder_output) self.tower_alignments.append(alignments) self.tower_stop_token_prediction.append( stop_token_prediction) self.tower_mel_outputs.append(mel_outputs) tower_embedded_inputs.append(embedded_inputs) tower_enc_conv_output_shape.append(enc_conv_output_shape) tower_encoder_outputs.append(encoder_outputs) tower_residual.append(residual) tower_projected_residual.append(projected_residual) if not synthesize: if self._hparams.tacotron_multi_speaker: tower_speaker_id_pred.append(speaker_prediction) if post_condition: self.tower_linear_outputs.append(linear_outputs) log('initialisation done {}'.format(gpus[i])) if is_training: self.ratio = self.helper._ratio self.tower_inputs = tower_inputs self.tower_input_lengths = tower_input_lengths self.tower_mel_targets = tower_mel_targets self.tower_linear_targets = tower_linear_targets self.tower_targets_lengths = tower_targets_lengths self.tower_stop_token_targets = tower_stop_token_targets if self._hparams.tacotron_multi_speaker: self.tower_speaker_id_pred = tower_speaker_id_pred self.tower_speaker_id_target = tower_speaker_id_target self.all_vars = tf.trainable_variables() log('Initialized Tacotron model. Dimensions (? = dynamic shape): ') log(' Train mode: {}'.format(is_training)) log(' Eval mode: {}'.format(is_evaluating)) log(' GTA mode: {}'.format(gta)) log(' Synthesis mode: {}'.format(not ( is_training or is_evaluating))) log(' Input: {}'.format(inputs.shape)) for i in range(hp.tacotron_num_gpus + hp.tacotron_gpu_start_idx): log(' device: {}'.format(i)) log(' embedding: {}'.format( tower_embedded_inputs[i].shape)) log(' enc conv out: {}'.format( tower_enc_conv_output_shape[i])) log(' encoder out: {}'.format( tower_encoder_outputs[i].shape)) log(' decoder out: {}'.format( self.tower_decoder_output[i].shape)) log(' residual out: {}'.format( tower_residual[i].shape)) log(' projected residual out: {}'.format( tower_projected_residual[i].shape)) log(' mel out: {}'.format( self.tower_mel_outputs[i].shape)) if post_condition: log(' linear out: {}'.format( self.tower_linear_outputs[i].shape)) log(' <stop_token> out: {}'.format( self.tower_stop_token_prediction[i].shape)) # 1_000_000 is causing syntax problems for some people?! Python please :) log(' Tacotron Parameters {:.3f} Million.'.format( np.sum( [np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
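
# The multi-speaker variant above broadcasts one speaker embedding over every encoder
# time step (tf.tile on the flattened vector, then a reshape back to
# [batch, text_len, speaker_emb_dim]) before concatenating it with the encoder outputs
# to form the attention memory. A numpy sketch of that broadcast (illustrative sizes,
# not the model's actual hyper-parameters):
import numpy as np

n_batch, text_len, enc_dim, spk_dim = 2, 5, 8, 3
encoder_out = np.random.randn(n_batch, text_len, enc_dim)
speaker_emb = np.random.randn(n_batch, spk_dim)

repeat = np.tile(speaker_emb, (1, text_len))        # [batch, text_len * spk_dim]
repeat = repeat.reshape(n_batch, text_len, spk_dim) # [batch, text_len, spk_dim]
attention_inputs = np.concatenate([encoder_out, repeat], axis=-1)

assert attention_inputs.shape == (n_batch, text_len, enc_dim + spk_dim)
# Every time step of a given utterance carries the same speaker vector.
assert np.allclose(repeat[0, 0], repeat[0, -1])
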
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False, global_step=None, is_training=False, is_evaluating=False, reference_mel=None): """ Initializes the model for inference sets "mel_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. """ if mel_targets is None and stop_token_targets is not None: raise ValueError('no mel targets were provided but token_targets were given') if mel_targets is not None and stop_token_targets is None and not gta: raise ValueError('Mel targets are provided without corresponding token_targets') if not gta and self._hparams.predict_linear==True and linear_targets is None and is_training: raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!') if gta and linear_targets is not None: raise ValueError('Linear spectrogram prediction is not supported in GTA mode!') if is_training and self._hparams.mask_decoder and targets_lengths is None: raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!') if is_training and is_evaluating: raise RuntimeError('Model can not be in training and evaluation modes at the same time!') with tf.variable_scope('inference') as scope: batch_size = tf.shape(inputs)[0] hp = self._hparams assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled') if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training: assert global_step is not None #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis post_condition = hp.predict_linear and not gta # Embeddings ==> [batch_size, sequence_length, embedding_dim] embedding_table = tf.get_variable( 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) if hp.use_gst: #Global style tokens (GST) gst_tokens = tf.get_variable('style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) self.gst_tokens = gst_tokens #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) encoder_outputs = encoder_cell(embedded_inputs, input_lengths) #For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape if is_training: reference_mel = mel_targets if reference_mel is not None: # Reference encoder refnet_outputs = reference_encoder( reference_mel, filters=hp.reference_filters, kernel_size=(3,3), strides=(2,2), encoder_cell=GRUCell(hp.reference_depth), is_training=is_training) # [N, 128] self.refnet_outputs = refnet_outputs if hp.use_gst: # Style attention style_attention = MultiheadAttention( tf.expand_dims(refnet_outputs, axis=1), # [N, 1, 128] 
tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0), [batch_size,1,1])), # [N, hp.num_gst, 256/hp.num_heads] num_heads=hp.num_heads, num_units=hp.style_att_dim, attention_type=hp.style_att_type) style_embeddings = style_attention.multi_head_attention() else: style_embeddings = tf.expand_dims(refnet_outputs, axis=1) # [N, 1, 128] else: print("Use random weight for GST.") random_weights = tf.random_uniform([hp.num_heads, hp.num_gst], maxval=1.0, dtype=tf.float32) random_weights = tf.nn.softmax(random_weights, name="random_weights") style_embeddings = tf.matmul(random_weights, tf.nn.tanh(gst_tokens)) style_embeddings = tf.reshape(style_embeddings, [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]]) #Extend style embeddings to be compatible with encoder_outputs. #Make encoder_output's dimensions by concatenating style embeddings with a vector of all zeroes. #Preserves effect of both style and encoder_outputs. neg = tf.add(style_embeddings, tf.negative(style_embeddings)) style_embeddings = tf.concat([style_embeddings, neg], axis=-1) # Add style embedding to every text encoder state style_embeddings = tf.tile(style_embeddings, [1, shape_list(encoder_outputs)[1], 1]) # [N, T_in, 128] encoder_outputs = tf.add(encoder_outputs, style_embeddings) #Decoder Parts #Attention Decoder Prenet prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet') #Attention Mechanism attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp, mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights) #Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm') #Frames Projection layer frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform') #<stop_token> projection layer stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection') #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell( prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection) #Define the helper for our decoder if is_training or is_evaluating or gta: self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets, hp, gta, is_evaluating, global_step) else: self.helper = TacoTestHelper(batch_size, hp) #initial decoder state decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) #Only use max iterations at synthesis time max_iters = hp.max_iters if not (is_training or is_evaluating) else None #Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=False, maximum_iterations=max_iters, swap_memory=hp.tacotron_swap_with_cpu) # Reshape outputs to be one output per entry #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) #Postnet postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions') #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(decoder_output) #Project residual to same dimension as mel 
spectrogram #==> [batch_size, decoder_steps * r, num_mels] residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection') projected_residual = residual_projection(residual) #Compute the mel spectrogram mel_outputs = decoder_output + projected_residual if post_condition: #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder post_processing_cell = TacotronEncoderCell( EncoderConvolutions(is_training, hparams=hp, scope='post_processing_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='post_processing_LSTM')) expand_outputs = post_processing_cell(mel_outputs) linear_outputs = FrameProjection(hp.num_freq, scope='post_processing_projection')(expand_outputs) #Grab alignments from the final decoder state alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0]) if is_training: self.ratio = self.helper._ratio self.inputs = inputs self.input_lengths = input_lengths self.decoder_output = decoder_output self.alignments = alignments self.style_embeddings = style_embeddings self.stop_token_prediction = stop_token_prediction self.stop_token_targets = stop_token_targets self.mel_outputs = mel_outputs if post_condition: self.linear_outputs = linear_outputs self.linear_targets = linear_targets self.mel_targets = mel_targets self.targets_lengths = targets_lengths log('Initialized Tacotron model. Dimensions (? = dynamic shape): ') log(' Train mode: {}'.format(is_training)) log(' Eval mode: {}'.format(is_evaluating)) log(' GTA mode: {}'.format(gta)) log(' Synthesis mode: {}'.format(not (is_training or is_evaluating))) log(' embedding: {}'.format(embedded_inputs.shape)) log(' enc conv out: {}'.format(enc_conv_output_shape)) log(' encoder out: {}'.format(encoder_outputs.shape)) log(' decoder out: {}'.format(decoder_output.shape)) log(' residual out: {}'.format(residual.shape)) log(' projected residual out: {}'.format(projected_residual.shape)) log(' style embedding: %d' % style_embeddings.shape[-1]) log(' mel out: {}'.format(mel_outputs.shape)) if post_condition: log(' linear out: {}'.format(linear_outputs.shape)) log(' <stop_token> out: {}'.format(stop_token_prediction.shape))
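
# At synthesis time with no reference mel, the GST variant above draws random head
# weights, softmaxes them, mixes the tanh-squashed style tokens into a single style
# vector, then concatenates it with (style - style), i.e. zeros, so its width matches
# the encoder outputs in this configuration before being tiled and added to every
# encoder state. A numpy sketch of that mixing step (illustrative sizes only):
import numpy as np

num_heads, num_gst, token_dim = 4, 10, 64            # token_dim = style_embed_depth // num_heads
gst_tokens = np.random.randn(num_gst, token_dim)

random_weights = np.random.rand(num_heads, num_gst)
random_weights = np.exp(random_weights) / np.exp(random_weights).sum(axis=-1, keepdims=True)  # softmax

style = random_weights @ np.tanh(gst_tokens)          # [num_heads, token_dim]
style = style.reshape(1, 1, num_heads * token_dim)    # [1, 1, num_heads * token_dim]

# Zero-pad to twice the width (concat with an all-zero vector, as in the code above).
style = np.concatenate([style, np.zeros_like(style)], axis=-1)
assert style.shape == (1, 1, 2 * num_heads * token_dim)
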
def initialize(self, inputs, input_speaker_id, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False, global_step=None, is_training=False, is_evaluating=False): """ Initializes the model for inference sets "mel_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. """ if mel_targets is None and stop_token_targets is not None: raise ValueError( 'no mel targets were provided but token_targets were given') if mel_targets is not None and stop_token_targets is None and not gta: raise ValueError( 'Mel targets are provided without corresponding token_targets') if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training: raise ValueError( 'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!' ) if gta and linear_targets is not None: raise ValueError( 'Linear spectrogram prediction is not supported in GTA mode!') if is_training and self._hparams.mask_decoder and targets_lengths is None: raise RuntimeError( 'Model set to mask paddings but no targets lengths provided for the mask!' ) if is_training and is_evaluating: raise RuntimeError( 'Model can not be in training and evaluation modes at the same time!' ) with tf.variable_scope('inference') as scope: batch_size = tf.shape(inputs)[0] hp = self._hparams assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled') if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training: assert global_step is not None #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis post_condition = hp.predict_linear and not gta # Embeddings ==> [batch_size, sequence_length, embedding_dim] # embedding_table = tf.get_variable( # 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) # embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) # Speaker Embeddings ==> [batch_size, embedding_dim] self.speaker_id_embedding_table = tf.get_variable( 'input_speaker_id_embedding', [hp.speaker_num, hp.speaker_dim], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_speaker_id = tf.nn.embedding_lookup( self.speaker_id_embedding_table, input_speaker_id) #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) print('inputs:', inputs) # inputs = tf.Print(inputs, [inputs], "inputs: ",summarize=9) encoder_outputs = encoder_cell(inputs, input_lengths) #first change encoder_outputs to concated version. #second add. need same dims #encoder_outputs = encoder_outputs + embedded_speaker_id ''' #first concat. 
input_seq_len = tf.shape(encoder_outputs)[1] print('!!!!!!!!!!before tile') embedded_speaker_id = tf.expand_dims(embedded_speaker_id, 1) embedded_speaker_id = tf.tile(embedded_speaker_id, multiples=[1, input_seq_len, 1]) print('!!!!!!!!!!after tile') id_encoder_outputs = tf.concat([encoder_outputs, embedded_speaker_id], axis=-1) ''' id_encoder_outputs = encoder_outputs #still use encoder_outputs #use keras version, but not run. ''' print('hhhhhhhhhhhhhhhhhhhhhhhhhhhh') hp_lambda = 1.0 Flip = GradientReversal(hp_lambda) Flip_encoder_outputs = Flip(encoder_outputs) ''' ''' #use tensorflow version, but star's is only 5 and i don't understand. Flip_encoder_outputs = flip_gradient(encoder_outputs, l=1.0) print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!', Flip_encoder_outputs, type(Flip_encoder_outputs)) densed_256_encoder_outputs = tf.layers.dense(Flip_encoder_outputs, 256, tf.nn.relu) softmax_encoder_outputs = tf.layers.dense(densed_256_encoder_outputs, hp.speaker_num, tf.nn.softmax) long_speaker_id = tf.reshape(input_speaker_id, shape = [tf.shape(inputs)[0], 1]) tiled_speaker_id = tf.tile(long_speaker_id, multiples=[1, tf.shape(softmax_encoder_outputs)[1]]) print('tiled_speaker_id', tiled_speaker_id) one_hot_speaker_id = tf.one_hot(tiled_speaker_id, depth=hp.speaker_num) print('one_hot_speaker_id', one_hot_speaker_id) #self.one_hot_speaker_id and self.softmax_encoder_outputs is at below #long_speaker_id = tf.expand_dims(long_speaker_id, axis=2) #dann_out = Dense(2)(dann_in) #Flip_encoder_outputs = ''' #For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape #Decoder Parts #Attention Decoder Prenet prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet') #Attention Mechanism attention_mechanism = LocationSensitiveAttention( hp.attention_dim, id_encoder_outputs, hparams=hp, is_training=is_training, mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights) #Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm') #Frames Projection layer frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform') #<stop_token> projection layer stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection') #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection) #Define the helper for our decoder if is_training or is_evaluating or gta: self.helper = TacoTrainingHelper(batch_size, mel_targets, hp, gta, is_evaluating, global_step) else: self.helper = TacoTestHelper(batch_size, hp) #initial decoder state decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) #Only use max iterations at synthesis time max_iters = hp.max_iters if not (is_training or is_evaluating) else None #Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=False, maximum_iterations=max_iters, swap_memory=hp.tacotron_swap_with_cpu) # Reshape outputs to be one output per entry #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] decoder_output = tf.reshape(frames_prediction, [batch_size, -1, 
hp.num_mels]) stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) #Postnet postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions') #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(decoder_output) #Project residual to same dimension as mel spectrogram #==> [batch_size, decoder_steps * r, num_mels] residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection') projected_residual = residual_projection(residual) #Compute the mel spectrogram mel_outputs = decoder_output + projected_residual if post_condition: # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs. post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size, [hp.cbhg_projection, hp.num_mels], hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers, hp.cbhg_highway_units, hp.cbhg_rnn_units, hp.batch_norm_position, is_training, name='CBHG_postnet') #[batch_size, decoder_steps(mel_frames), cbhg_channels] post_outputs = post_cbhg(mel_outputs, None) #Linear projection of extracted features to make linear spectrogram linear_specs_projection = FrameProjection( hp.num_freq, scope='cbhg_linear_specs_projection') #[batch_size, decoder_steps(linear_frames), num_freq] linear_outputs = linear_specs_projection(post_outputs) #Grab alignments from the final decoder state alignments = tf.transpose( final_decoder_state.alignment_history.stack(), [1, 2, 0]) if is_training: self.ratio = self.helper._ratio self.inputs = inputs self.input_speaker_id = input_speaker_id #self.one_hot_speaker_id and self.softmax_encoder_outputs #self.softmax_encoder_outputs = softmax_encoder_outputs #self.one_hot_speaker_id = one_hot_speaker_id self.input_lengths = input_lengths self.decoder_output = decoder_output self.alignments = alignments self.stop_token_prediction = stop_token_prediction self.stop_token_targets = stop_token_targets self.mel_outputs = mel_outputs if post_condition: self.linear_outputs = linear_outputs self.linear_targets = linear_targets self.mel_targets = mel_targets self.targets_lengths = targets_lengths log('Initialized Tacotron model. Dimensions (? = dynamic shape): ') log(' Train mode: {}'.format(is_training)) log(' Eval mode: {}'.format(is_evaluating)) log(' GTA mode: {}'.format(gta)) log(' Synthesis mode: {}'.format(not ( is_training or is_evaluating))) log(' embedding: {}'.format(inputs.shape)) log(' enc conv out: {}'.format(enc_conv_output_shape)) log(' encoder out: {}'.format(encoder_outputs.shape)) log(' id encoder out: {}'.format( id_encoder_outputs.shape)) log(' decoder out: {}'.format(decoder_output.shape)) log(' residual out: {}'.format(residual.shape)) log(' projected residual out: {}'.format( projected_residual.shape)) log(' mel out: {}'.format(mel_outputs.shape)) if post_condition: log(' linear out: {}'.format( linear_outputs.shape)) log(' <stop_token> out: {}'.format( stop_token_prediction.shape))
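
# The commented-out speaker-adversarial branch in the variant above relies on a
# gradient reversal layer (GradientReversal / flip_gradient) that is not defined in
# this file. A minimal TF1-style sketch of that standard operator, under the usual
# assumption about its behaviour (identity in the forward pass, negated and optionally
# scaled gradient in the backward pass):
import tensorflow as tf

def flip_gradient(x, l=1.0):
    @tf.custom_gradient
    def _flip(inputs):
        def grad(dy):
            # Reverse (and scale) the gradient flowing back into the encoder.
            return -l * dy
        # Forward pass is the identity.
        return tf.identity(inputs), grad
    return _flip(x)

# Usage in the spirit of the commented-out block: reverse gradients before the speaker
# classifier so the encoder is pushed towards speaker-invariant representations, e.g.
#   flipped = flip_gradient(encoder_outputs, l=1.0)
#   logits = tf.layers.dense(tf.layers.dense(flipped, 256, tf.nn.relu), hp.speaker_num)
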
def initialize(self, inputs, input_lengths, feature_targets=None, stop_token_targets=None, targets_lengths=None, gta=False, global_step=None, is_training=False, is_evaluating=False): """ Initializes the model for inference sets "feature_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - feature_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mgc + num_lf0 + num_vuv + num_bap, and values are entries in the spectrogram. Only needed for training. """ if feature_targets is None and stop_token_targets is not None: raise ValueError('no feature targets were provided but token_targets were given') if feature_targets is not None and stop_token_targets is None and not gta: raise ValueError('Mel targets are provided without corresponding token_targets') if is_training and self._hparams.mask_decoder and targets_lengths is None: raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!') if is_training and is_evaluating: raise RuntimeError('Model can not be in training and evaluation modes at the same time!') with tf.variable_scope('inference') as scope: batch_size = tf.shape(inputs)[0] hp = self._hparams target_depth = hp.num_mgc + hp.num_lf0 + hp.num_vuv + hp.num_bap assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled') if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training: assert global_step is not None # Embeddings ==> [batch_size, sequence_length, embedding_dim] embedding_table = tf.get_variable( 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) encoder_outputs = encoder_cell(embedded_inputs, input_lengths) #For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape #Decoder Parts #Attention Decoder Prenet prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet') #Attention Mechanism attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp, is_training=is_training, mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights) #Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm') #Frames Projection layer frame_projection = FrameProjection(target_depth * hp.outputs_per_step, scope='mgc_transform') #<stop_token> projection layer stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection') #Decoder Cell ==> [batch_size, decoder_steps, target_depth * r] (after decoding) decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection) #Define the helper for our decoder if is_training or is_evaluating or gta: self.helper = 
TacoTrainingHelper(batch_size, feature_targets, target_depth, hp, gta, is_evaluating, global_step) else: self.helper = TacoTestHelper(batch_size, target_depth, hp) #initial decoder state decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) #Only use max iterations at synthesis time max_iters = hp.max_iters if not (is_training or is_evaluating) else None #Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=False, maximum_iterations=max_iters, swap_memory=hp.tacotron_swap_with_cpu) # Reshape outputs to be one output per entry #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), target_depth] decoder_outputs = tf.reshape(frames_prediction, [batch_size, -1, target_depth]) stop_token_outputs = tf.reshape(stop_token_prediction, [batch_size, -1]) #Postnet postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions') #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(decoder_outputs) #Project residual to same dimension as target depth #==> [batch_size, decoder_steps * r, target_depth] residual_projection = FrameProjection(target_depth, scope='postnet_projection') projected_residual = residual_projection(residual) #Compute the final outputs final_outputs = decoder_outputs + projected_residual #Compute each feature outputs mgc_idx = 0 lf0_idx = mgc_idx + hp.num_mgc vuv_idx = lf0_idx + hp.num_lf0 bap_idx = vuv_idx + hp.num_vuv mgc_outputs = tf.slice(final_outputs, [0, 0, mgc_idx], [-1, -1, hp.num_mgc], name='mgc_outputs') lf0_outputs = tf.slice(final_outputs, [0, 0, lf0_idx], [-1, -1, hp.num_lf0]) lf0_outputs = tf.squeeze(lf0_outputs, axis=-1, name='lf0_outputs') vuv_outputs = tf.slice(final_outputs, [0, 0, vuv_idx], [-1, -1, hp.num_vuv], name='vuv_outputs') bap_outputs = tf.slice(final_outputs, [0, 0, bap_idx], [-1, -1, hp.num_bap], name='bap_outputs') #Grab alignments from the final decoder state alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0], name='alignments') if is_training: self.ratio = self.helper._ratio self.inputs = inputs self.input_lengths = input_lengths self.decoder_outputs = decoder_outputs self.final_outputs = final_outputs self.feature_targets = feature_targets self.alignments = alignments self.stop_token_outputs = stop_token_outputs self.stop_token_targets = stop_token_targets self.lf0_outputs = lf0_outputs self.mgc_outputs = mgc_outputs self.vuv_outputs = vuv_outputs self.bap_outputs = bap_outputs self.targets_lengths = targets_lengths log('Initialized Tacotron model. Dimensions (? 
= dynamic shape): ')
        log(' Train mode: {}'.format(is_training))
        log(' Eval mode: {}'.format(is_evaluating))
        log(' GTA mode: {}'.format(gta))
        log(' Synthesis mode: {}'.format(not (is_training or is_evaluating)))
        log(' embedding: {}'.format(embedded_inputs.shape))
        log(' enc conv out: {}'.format(enc_conv_output_shape))
        log(' encoder out: {}'.format(encoder_outputs.shape))
        log(' decoder out: {}'.format(decoder_outputs.shape))
        log(' residual out: {}'.format(residual.shape))
        log(' projected residual out: {}'.format(projected_residual.shape))
        log(' final out: {}'.format(final_outputs.shape))
        log(' lf0 out: {}'.format(tf.expand_dims(lf0_outputs, axis=-1).shape))
        log(' mgc out: {}'.format(mgc_outputs.shape))
        log(' vuv out: {}'.format(vuv_outputs.shape))
        log(' bap out: {}'.format(bap_outputs.shape))
        log(' <stop_token> out: {}'.format(stop_token_outputs.shape))
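
# The vocoder-feature variant above packs MGC, lf0, V/UV and BAP streams into a single
# [batch, frames, target_depth] tensor and slices them back out by offset, exactly as
# the tf.slice calls do. A numpy sketch of the same bookkeeping (the stream widths
# below are made-up examples, not this repository's hparams):
import numpy as np

num_mgc, num_lf0, num_vuv, num_bap = 60, 1, 1, 5
target_depth = num_mgc + num_lf0 + num_vuv + num_bap
final_out = np.random.randn(2, 7, target_depth)       # [batch, frames, target_depth]

mgc_idx = 0
lf0_idx = mgc_idx + num_mgc
vuv_idx = lf0_idx + num_lf0
bap_idx = vuv_idx + num_vuv

mgc = final_out[:, :, mgc_idx:lf0_idx]
lf0 = final_out[:, :, lf0_idx:vuv_idx].squeeze(-1)     # same as tf.squeeze(..., axis=-1)
vuv = final_out[:, :, vuv_idx:bap_idx]
bap = final_out[:, :, bap_idx:bap_idx + num_bap]

assert mgc.shape[-1] == num_mgc and bap.shape[-1] == num_bap and lf0.shape == (2, 7)
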
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False, global_step=None, is_training=False, is_evaluating=False): """ Initializes the model for inference sets "mel_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. """ if mel_targets is None and stop_token_targets is not None: raise ValueError('no mel targets were provided but token_targets were given') if mel_targets is not None and stop_token_targets is None and not gta: raise ValueError('Mel targets are provided without corresponding token_targets') if not gta and self._hparams.predict_linear==True and linear_targets is None and is_training: raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!') if gta and linear_targets is not None: raise ValueError('Linear spectrogram prediction is not supported in GTA mode!') if is_training and self._hparams.mask_decoder and targets_lengths is None: raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!') if is_training and is_evaluating: raise RuntimeError('Model can not be in training and evaluation modes at the same time!') with tf.variable_scope('inference') as scope: batch_size = tf.shape(inputs)[0] hp = self._hparams if hp.tacotron_curriculum_dropout_rate: assert global_step is not None self.dropout_rate = self._curriculum_dropout( hp.tacotron_dropout_rate, hp.tacotron_curriculum_dropout_gamma, global_step) else: self.dropout_rate = tf.convert_to_tensor( hp.tacotron_dropout_rate) if hp.tacotron_curriculum_zoneout_rate: assert global_step is not None self.zoneout_rate = self._curriculum_dropout( hp.tacotron_zoneout_rate, hp.tacotron_curriculum_zoneout_gamma, global_step) else: self.zoneout_rate = tf.convert_to_tensor( hp.tacotron_zoneout_rate) assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled') if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training: assert global_step is not None #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis post_condition = hp.predict_linear and not gta # Embeddings ==> [batch_size, sequence_length, embedding_dim] embedding_table = tf.get_variable( 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, hp.enc_conv_kernel_size, hp.enc_conv_channels, hp.enc_conv_num_layers, self.dropout_rate, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=self.zoneout_rate, scope='encoder_LSTM')) encoder_outputs = encoder_cell(embedded_inputs, input_lengths) #For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape #Decoder Parts #Attention Decoder Prenet prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=self.dropout_rate, 
scope='decoder_prenet') #Attention Mechanism attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp, mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights) #Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=self.zoneout_rate, scope='decoder_lstm') #Frames Projection layer frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform') #<stop_token> projection layer stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection') #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell( prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection) #Define the helper for our decoder if is_training or is_evaluating or gta: if mel_targets is not None and stop_token_targets is not None: self.helper = TacoTrainingHelper( batch_size, mel_targets, stop_token_targets, hp, gta, is_evaluating, global_step) else: if gta: log('Warning: gta set to True but mel_targets or ' + 'stop_token_targets not provided' + ', falling back to natural inference') self.helper = TacoTestHelper(batch_size, hp) else: self.helper = TacoTestHelper(batch_size, hp) #initial decoder state decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) #Only use max iterations at synthesis time max_iters = hp.max_iters if not (is_training or is_evaluating) else None #Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=False, maximum_iterations=max_iters, swap_memory=hp.tacotron_swap_with_cpu) # Reshape outputs to be one output per entry #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) #Postnet postnet = Postnet(is_training, hp.postnet_kernel_size, hp.postnet_channels, hp.postnet_num_layers, self.dropout_rate, scope='postnet_convolutions') #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(decoder_output) #Project residual to same dimension as mel spectrogram #==> [batch_size, decoder_steps * r, num_mels] residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection') projected_residual = residual_projection(residual) #Compute the mel spectrogram mel_outputs = tf.add(decoder_output, projected_residual, name='mel_outputs') if post_condition: #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder post_processing_cell = TacotronEncoderCell( EncoderConvolutions(is_training, hp.enc_conv_kernel_size, hp.enc_conv_channels, hp.enc_conv_num_layers, self.dropout_rate, scope='post_processing_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=self.zoneout_rate, scope='post_processing_LSTM')) expand_outputs = post_processing_cell(mel_outputs) linear_outputs = FrameProjection(hp.num_freq, scope='post_processing_projection')(expand_outputs) #Grab alignments from the final decoder state alignments = tf.transpose(
final_decoder_state.alignment_history.stack(), [1, 2, 0], name='alignments') self.optimize = None self.loss = None if is_training: self.ratio = self.helper._ratio self.inputs = inputs self.input_lengths = input_lengths self.decoder_output = decoder_output self.alignments = alignments self.stop_token_prediction = stop_token_prediction self.stop_token_targets = stop_token_targets self.mel_outputs = mel_outputs if post_condition: self.linear_outputs = linear_outputs self.linear_targets = linear_targets self.mel_targets = mel_targets self.targets_lengths = targets_lengths log('Initialized Tacotron model. Dimensions (? = dynamic shape): ') log(' Train mode: {}'.format(is_training)) log(' Eval mode: {}'.format(is_evaluating)) log(' GTA mode: {}'.format(gta)) log(' Synthesis mode: {}'.format(not (is_training or is_evaluating))) log(' embedding: {}'.format(embedded_inputs.shape)) log(' enc conv out: {}'.format(enc_conv_output_shape)) log(' encoder out: {}'.format(encoder_outputs.shape)) log(' decoder out: {}'.format(decoder_output.shape)) log(' residual out: {}'.format(residual.shape)) log(' projected residual out: {}'.format(projected_residual.shape)) log(' mel out: {}'.format(mel_outputs.shape)) if post_condition: log(' linear out: {}'.format(linear_outputs.shape)) log(' <stop_token> out: {}'.format(stop_token_prediction.shape))
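# --- Editor's illustrative sketch (not part of the original model code). ---
# The variant above reshapes the decoder's stacked frame predictions from
# [batch_size, decoder_steps, num_mels * r] to [batch_size, decoder_steps * r, num_mels].
# A minimal numpy check (names B, T_dec, num_mels, r are illustrative assumptions)
# showing that the row-major reshape regroups each decoder step's r stacked frames
# into r consecutive mel frames:
import numpy as np

B, T_dec, num_mels, r = 2, 3, 4, 2
frames_prediction = np.arange(B * T_dec * num_mels * r, dtype=np.float32).reshape(
    B, T_dec, num_mels * r)
decoder_output = frames_prediction.reshape(B, -1, num_mels)  # same as tf.reshape(..., [batch_size, -1, num_mels])
assert decoder_output.shape == (B, T_dec * r, num_mels)
# The first r rows of decoder_output[0] are exactly the r frames that were packed
# side by side into frames_prediction[0, 0]:
assert np.array_equal(decoder_output[0, :r].ravel(), frames_prediction[0, 0])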
def initialize(self, inputs, input_lengths, mel_targets=None, mel_lengths=None, stop_token_targets=None, linear_targets=None, gta=False, reference_mel=None): """ Initializes the model for inference sets "mel_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. """ if mel_targets is None and stop_token_targets is not None: raise ValueError( 'no mel targets were provided but token_targets were given') if mel_targets is not None and stop_token_targets is None and not gta: raise ValueError( 'Mel targets are provided without corresponding token_targets') if gta == False and self._hparams.predict_linear == True and linear_targets is None: raise ValueError( 'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!' ) if gta and linear_targets is not None: raise ValueError( 'Linear spectrogram prediction is not supported in GTA mode!') with tf.variable_scope('inference') as scope: is_training = mel_targets is not None and not gta batch_size = tf.shape(inputs)[0] hp = self._hparams #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis post_condition = hp.predict_linear and not gta # Embeddings ==> [batch_size, sequence_length, embedding_dim] embedding_table = tf.get_variable('inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, kernel_size=hp.enc_conv_kernel_size, channels=hp.enc_conv_channels, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) encoder_outputs = encoder_cell(embedded_inputs, input_lengths) if hp.use_vae: if is_training: reference_mel = mel_targets style_embeddings, mu, log_var = VAE(inputs=reference_mel, input_lengths=mel_lengths, filters=hp.filters, kernel_size=(3, 3), strides=(2, 2), num_units=hp.vae_dim, is_training=is_training, scope='vae') self.mu = mu self.log_var = log_var style_embeddings = tf.layers.dense(style_embeddings, hp.encoder_depth) style_embeddings = tf.expand_dims(style_embeddings, axis=1) style_embeddings = tf.tile( style_embeddings, [1, shape_list(encoder_outputs)[1], 1]) # [N, T_in, 256] encoder_outputs = encoder_outputs + style_embeddings #For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape #Decoder Parts #Attention Decoder Prenet prenet = Prenet(is_training, layer_sizes=hp.prenet_layers, scope='decoder_prenet') #Attention Mechanism attention_mechanism = LocationSensitiveAttention( hp.attention_dim, encoder_outputs, mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights) #Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm') #Frames Projection layer 
frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform') #<stop_token> projection layer stop_projection = StopProjection(is_training, scope='stop_token_projection') #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell(prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection, mask_finished=hp.mask_finished) #Define the helper for our decoder if (is_training or gta) == True: self.helper = TacoTrainingHelper( batch_size, mel_targets, stop_token_targets, hp.num_mels, hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio, gta) else: self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) #initial decoder state decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) #Only use max iterations at synthesis time max_iters = hp.max_iters if not is_training else None #Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=hp.impute_finished, maximum_iterations=max_iters) # Reshape outputs to be one output per entry #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) #Postnet postnet = Postnet(is_training, kernel_size=hp.postnet_kernel_size, channels=hp.postnet_channels, scope='postnet_convolutions') #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(decoder_output) #Project residual to same dimension as mel spectrogram #==> [batch_size, decoder_steps * r, num_mels] residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection') projected_residual = residual_projection(residual) #Compute the mel spectrogram mel_outputs = decoder_output + projected_residual if post_condition: #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder post_processing_cell = TacotronEncoderCell( EncoderConvolutions(is_training, kernel_size=hp.enc_conv_kernel_size, channels=hp.enc_conv_channels, scope='post_processing_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='post_processing_LSTM')) expand_outputs = post_processing_cell(mel_outputs) linear_outputs = FrameProjection( hp.num_freq, scope='post_processing_projection')(expand_outputs) #Grab alignments from the final decoder state alignments = tf.transpose( final_decoder_state.alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.decoder_output = decoder_output self.alignments = alignments self.stop_token_prediction = stop_token_prediction self.stop_token_targets = stop_token_targets self.mel_outputs = mel_outputs self.reference_mel = reference_mel if post_condition: self.linear_outputs = linear_outputs self.linear_targets = linear_targets self.mel_targets = mel_targets self.mel_lengths = mel_lengths log('Initialized Tacotron model. Dimensions (? 
= dynamic shape): ') log(' embedding: {}'.format(embedded_inputs.shape)) log(' enc conv out: {}'.format(enc_conv_output_shape)) log(' encoder out: {}'.format(encoder_outputs.shape)) log(' decoder out: {}'.format(decoder_output.shape)) log(' residual out: {}'.format(residual.shape)) log(' projected residual out: {}'.format( projected_residual.shape)) log(' mel out: {}'.format(mel_outputs.shape)) if post_condition: log(' linear out: {}'.format( linear_outputs.shape)) log(' <stop_token> out: {}'.format( stop_token_prediction.shape))
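# --- Editor's illustrative sketch (assumption, not the repo's VAE implementation). ---
# The VAE helper above returns (style_embeddings, mu, log_var) and the model keeps
# self.mu / self.log_var, presumably for a KL loss term. A self-contained numpy
# sketch of the standard Gaussian reparameterization and KL divergence such a
# module typically relies on:
import numpy as np

def reparameterize(mu, log_var, rng=np.random):
    """Sample z ~ N(mu, exp(log_var)) via z = mu + sigma * eps."""
    eps = rng.standard_normal(mu.shape).astype(mu.dtype)
    return mu + np.exp(0.5 * log_var) * eps

def kl_to_standard_normal(mu, log_var):
    """KL(N(mu, exp(log_var)) || N(0, I)), summed over latent dims, averaged over the batch."""
    return np.mean(-0.5 * np.sum(1.0 + log_var - mu ** 2 - np.exp(log_var), axis=-1))

mu = np.zeros((4, 16), dtype=np.float32)       # 16 stands in for hp.vae_dim (illustrative)
log_var = np.zeros((4, 16), dtype=np.float32)
z = reparameterize(mu, log_var)
print(z.shape, kl_to_standard_normal(mu, log_var))  # (4, 16) 0.0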
def initialize(self, inputs=None, input_lengths=None, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, mel_references=None, references_lengths=None, vae_codes=None, gta=False, global_step=None, use_vae=False, is_training=False, is_evaluating=False, split_infos=None): """ Initializes the model for inference sets "mel_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. """ self.use_vae = use_vae if mel_targets is None and stop_token_targets is not None: raise ValueError( 'no mel targets were provided but token_targets were given') if mel_targets is not None and stop_token_targets is None and not gta: raise ValueError( 'Mel targets are provided without corresponding token_targets') if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training: raise ValueError( 'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!' ) if gta and linear_targets is not None: raise ValueError( 'Linear spectrogram prediction is not supported in GTA mode!') if is_training and self._hparams.mask_decoder and targets_lengths is None: raise RuntimeError( 'Model set to mask paddings but no targets lengths provided for the mask!' ) if is_training and is_evaluating: raise RuntimeError( 'Model cannot be in training and evaluation modes at the same time!' ) if use_vae and mel_references is None and vae_codes is None: if mel_targets is None: raise ValueError( 'Mel targets must be provided if neither mel references nor reference codes are given!' ) else: mel_references = mel_targets if use_vae and references_lengths is None and vae_codes is None: if targets_lengths is None: raise ValueError( 'Targets lengths must be provided if neither reference lengths nor reference codes are given!'
) else: references_lengths = targets_lengths split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:0' with tf.device(split_device): hp = self._hparams lout_int = [tf.int32] * hp.tacotron_num_gpus lout_float = [tf.float32] * hp.tacotron_num_gpus tower_input_lengths = tf.split( input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if input_lengths is not None else input_lengths tower_targets_lengths = tf.split( targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if targets_lengths is not None else targets_lengths tower_references_lengths = tf.split( references_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0 ) if references_lengths is not None else references_lengths tower_vae_codes = tf.split( vae_codes, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if vae_codes is not None else vae_codes p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int) if inputs is not None else inputs p_mel_targets = tf.py_func( split_func, [mel_targets, split_infos[:, 1]], lout_float) if mel_targets is not None else mel_targets p_mel_references = tf.py_func( split_func, [mel_references, split_infos[:, 1]], lout_float) if mel_references is not None else mel_references p_stop_token_targets = tf.py_func( split_func, [stop_token_targets, split_infos[:, 2]], lout_float ) if stop_token_targets is not None else stop_token_targets p_linear_targets = tf.py_func( split_func, [linear_targets, split_infos[:, 3]], lout_float) if linear_targets is not None else linear_targets tower_inputs = [] tower_mel_targets = [] tower_mel_references = [] tower_stop_token_targets = [] tower_linear_targets = [] batch_size = tf.shape( inputs)[0] if inputs is not None else tf.shape( mel_references)[0] mel_channels = hp.num_mels linear_channels = hp.num_freq for i in range(hp.tacotron_num_gpus): if p_inputs is not None: tower_inputs.append( tf.reshape(p_inputs[i], [batch_size, -1])) if p_mel_targets is not None: tower_mel_targets.append( tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels])) if p_mel_references is not None: tower_mel_references.append( tf.reshape(p_mel_references[i], [batch_size, -1, mel_channels])) if p_stop_token_targets is not None: tower_stop_token_targets.append( tf.reshape(p_stop_token_targets[i], [batch_size, -1])) if p_linear_targets is not None: tower_linear_targets.append( tf.reshape(p_linear_targets[i], [batch_size, -1, linear_channels])) T2_output_range = (-hp.max_abs_value, hp.max_abs_value) if hp.symmetric_mels else ( 0, hp.max_abs_value) self.tower_decoder_output = [] self.tower_alignments = [] self.tower_stop_token_prediction = [] self.tower_mel_outputs = [] self.tower_linear_outputs = [] self.tower_vae_codes = [] self.tower_mu = [] self.tower_log_var = [] tower_embedded_inputs = [] tower_enc_conv_output_shape = [] tower_encoder_outputs = [] tower_residual = [] tower_projected_residual = [] # 1. 
Declare GPU Devices gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_num_gpus)] for i in range(hp.tacotron_num_gpus): with tf.device( tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", worker_device=gpus[i])): with tf.variable_scope('inference') as scope: #VAE Parts if use_vae and inputs is None: vae_code = None if mel_references is not None: vae_code, mu, log_var = VAE( inputs=tower_mel_references[i], input_lengths=tower_references_lengths[i], filters=hp.vae_filters, kernel_size=hp.vae_kernel, stride=hp.vae_stride, num_units=hp.vae_dim, rnn_units=hp.vae_rnn_units, bnorm=hp.batch_norm_position, log_var_minimum=hp.vae_log_var_minimum, is_training=is_training, scope='vae') #Use vae_codes first if provided if vae_codes is not None: vae_code = tower_vae_codes[i] self.tower_vae_codes.append(vae_code) if mel_references is not None: self.tower_mu.append(mu) self.tower_log_var.append(log_var) if inputs is not None: assert hp.tacotron_teacher_forcing_mode in ( 'constant', 'scheduled') if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training: assert global_step is not None #When predicting mels to train Wavenet vocoder, post processing should be ommited post_condition = hp.predict_linear # Embeddings ==> [batch_size, sequence_length, embedding_dim] self.embedding_table = tf.get_variable( 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup( self.embedding_table, tower_inputs[i]) #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i]) #For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape vae_code = None if use_vae: if mel_references is not None: vae_code, mu, log_var = VAE( inputs=tower_mel_references[i], input_lengths=tower_references_lengths[i], filters=hp.vae_filters, kernel_size=hp.vae_kernel, stride=hp.vae_stride, num_units=hp.vae_dim, rnn_units=hp.vae_rnn_units, bnorm=hp.batch_norm_position, log_var_minimum=hp.vae_log_var_minimum, is_training=is_training, scope='vae') #Use vae_codes first if provided if vae_codes is not None: vae_code = tower_vae_codes[i] if hp.encoder_outputs_revision_type == 'add': encoder_outputs = encoder_outputs + tf.tile( tf.expand_dims(tf.layers.dense( vae_code, hp.encoder_lstm_units * 2, name='vae_code_projection', activation='tanh'), axis=1), [1, tf.shape(encoder_outputs)[1], 1]) elif hp.encoder_outputs_revision_type == 'multiply': encoder_outputs = tf.math.multiply( encoder_outputs, tf.tile( tf.expand_dims(tf.layers.dense( vae_code, hp.encoder_lstm_units * 2, name='vae_code_projection', activation='tanh'), axis=1), [1, tf.shape(encoder_outputs)[1], 1])) #Attention Decoder Prenet prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet') #Attention Mechanism attention_mechanism = LocationSensitiveAttention( hp.attention_dim, encoder_outputs, hparams=hp, is_training=is_training, mask_encoder=hp.mask_encoder, memory_sequence_length=tf.reshape( tower_input_lengths[i], [-1]), smoothing=hp.smoothing, cumulate_weights=hp.cumulative_weights) #Decoder LSTM Cells decoder_lstm = DecoderRNN( is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, 
scope='decoder_LSTM') #Frames Projection layer frame_projection = FrameProjection( hp.num_mels * hp.outputs_per_step, scope='linear_transform_projection') #<stop_token> projection layer stop_projection = StopProjection( is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection') #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell( prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection, vae_code, hp) #Define the helper for our decoder if is_training or is_evaluating or gta: self.helper = TacoTrainingHelper( batch_size, tower_mel_targets[i], hp, gta, is_evaluating, global_step) else: self.helper = TacoTestHelper(batch_size, hp) #initial decoder state decoder_init_state = decoder_cell.zero_state( batch_size=batch_size, dtype=tf.float32) #Only use max iterations at synthesis time max_iters = hp.max_iters if not ( is_training or is_evaluating) else None #Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=False, maximum_iterations=max_iters, swap_memory=hp.tacotron_swap_with_cpu) # Reshape outputs to be one output per entry #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] decoder_output = tf.reshape( frames_prediction, [batch_size, -1, hp.num_mels]) stop_token_prediction = tf.reshape( stop_token_prediction, [batch_size, -1]) if hp.clip_outputs: decoder_output = tf.minimum( tf.maximum( decoder_output, T2_output_range[0] - hp.lower_bound_decay), T2_output_range[1]) #Postnet postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions') #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] residual = postnet(decoder_output) #Project residual to same dimension as mel spectrogram #==> [batch_size, decoder_steps * r, num_mels] residual_projection = FrameProjection( hp.num_mels, scope='postnet_projection') projected_residual = residual_projection(residual) #Compute the mel spectrogram mel_outputs = decoder_output + projected_residual if hp.clip_outputs: mel_outputs = tf.minimum( tf.maximum( mel_outputs, T2_output_range[0] - hp.lower_bound_decay), T2_output_range[1]) if post_condition: # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs. 
post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size, [hp.cbhg_projection, hp.num_mels], hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers, hp.cbhg_highway_units, hp.cbhg_rnn_units, hp.batch_norm_position, is_training, name='CBHG_postnet') #[batch_size, decoder_steps(mel_frames), cbhg_channels] post_outputs = post_cbhg(mel_outputs, None) #Linear projection of extracted features to make linear spectrogram linear_specs_projection = FrameProjection( hp.num_freq, scope='cbhg_linear_specs_projection') #[batch_size, decoder_steps(linear_frames), num_freq] linear_outputs = linear_specs_projection( post_outputs) if hp.clip_outputs: linear_outputs = tf.minimum( tf.maximum( linear_outputs, T2_output_range[0] - hp.lower_bound_decay), T2_output_range[1]) #Grab alignments from the final decoder state alignments = tf.transpose( final_decoder_state.alignment_history.stack(), [1, 2, 0]) self.tower_decoder_output.append(decoder_output) self.tower_alignments.append(alignments) self.tower_stop_token_prediction.append( stop_token_prediction) self.tower_mel_outputs.append(mel_outputs) tower_embedded_inputs.append(embedded_inputs) tower_enc_conv_output_shape.append( enc_conv_output_shape) tower_encoder_outputs.append(encoder_outputs) tower_residual.append(residual) tower_projected_residual.append(projected_residual) if post_condition: self.tower_linear_outputs.append(linear_outputs) if use_vae: self.tower_vae_codes.append(vae_code) if mel_references is not None: self.tower_mu.append(mu) self.tower_log_var.append(log_var) log('initialisation done {}'.format(gpus[i])) if is_training: self.ratio = self.helper._ratio self.tower_inputs = tower_inputs self.tower_input_lengths = tower_input_lengths self.tower_mel_targets = tower_mel_targets self.tower_linear_targets = tower_linear_targets self.tower_targets_lengths = tower_targets_lengths self.tower_stop_token_targets = tower_stop_token_targets self.all_vars = tf.trainable_variables() log('Initialized Tacotron model. Dimensions (? = dynamic shape): ') log(' Train mode: {}'.format(is_training)) log(' Eval mode: {}'.format(is_evaluating)) log(' GTA mode: {}'.format(gta)) log(' Synthesis mode: {}'.format(not ( is_training or is_evaluating))) if inputs is not None: log(' Input: {}'.format(inputs.shape)) for i in range(hp.tacotron_num_gpus): log(' device: {}'.format(i)) log(' embedding: {}'.format( tower_embedded_inputs[i].shape)) log(' enc conv out: {}'.format( tower_enc_conv_output_shape[i])) log(' encoder out: {}'.format( tower_encoder_outputs[i].shape)) if use_vae: log(' vae code: {}'.format( self.tower_vae_codes[i].shape)) log(' decoder out: {}'.format( self.tower_decoder_output[i].shape)) log(' residual out: {}'.format( tower_residual[i].shape)) log(' projected residual out: {}'.format( tower_projected_residual[i].shape)) log(' mel out: {}'.format( self.tower_mel_outputs[i].shape)) if post_condition: log(' linear out: {}'.format( self.tower_linear_outputs[i].shape)) log(' <stop_token> out: {}'.format( self.tower_stop_token_prediction[i].shape)) #1_000_000 is causing syntax problems for some people?! Python please :) log(' Tacotron Parameters {:.3f} Million.'.format( np.sum( [np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
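# --- Editor's illustrative sketch (hparam values below are assumptions, not the repo's defaults). ---
# The clip_outputs branches above clamp decoder and postnet outputs into
# T2_output_range, allowing a small lower_bound_decay margin below the floor.
# The same clamping on plain numpy arrays:
import numpy as np

max_abs_value = 4.0
symmetric_mels = True
lower_bound_decay = 0.1

T2_output_range = (-max_abs_value, max_abs_value) if symmetric_mels else (0.0, max_abs_value)

def clip_to_range(x, output_range, decay):
    """Mirror of the tf.minimum(tf.maximum(...)) pair: clamp into [lower - decay, upper]."""
    lower, upper = output_range
    return np.minimum(np.maximum(x, lower - decay), upper)

mel = np.array([-7.0, -4.05, 0.0, 3.9, 5.0], dtype=np.float32)
print(clip_to_range(mel, T2_output_range, lower_bound_decay))  # [-4.1  -4.05  0.    3.9   4.  ]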
def initialize(self, inputs, speaker, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False, reference_mels=None, \ reference_lengths=None, global_step=None, is_training=False, is_evaluating=False, split_infos=None, Lf0=None): """ Initializes the model for inference sets "mel_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs - speaker: int32 tensor with shape [N] - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. """ if mel_targets is None and stop_token_targets is not None: raise ValueError( 'no mel targets were provided but token_targets were given') if mel_targets is not None and stop_token_targets is None and not gta: raise ValueError( 'Mel targets are provided without corresponding token_targets') if not gta and self._hparams.predict_linear == True and linear_targets is None and is_training: raise ValueError( 'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!' ) if gta and linear_targets is not None: raise ValueError( 'Linear spectrogram prediction is not supported in GTA mode!') if is_training and self._hparams.mask_decoder and targets_lengths is None: raise RuntimeError( 'Model set to mask paddings but no targets lengths provided for the mask!' ) if is_training and is_evaluating: raise RuntimeError( 'Model cannot be in training and evaluation modes at the same time!'
) split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:{}'.format( self._hparams.tacotron_gpu_start_idx) with tf.device(split_device): hp = self._hparams lout_int = [tf.int32] * hp.tacotron_num_gpus lout_float = [tf.float32] * hp.tacotron_num_gpus tower_speaker = tf.split(speaker, num_or_size_splits=hp.tacotron_num_gpus, axis=0) tower_input_lengths = tf.split( input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) tower_targets_lengths = tf.split( targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if targets_lengths is not None else targets_lengths tower_reference_lengths = tf.split( reference_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if reference_lengths is not None else reference_lengths p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_float) p_Lf0 = tf.py_func(split_func, [Lf0, split_infos[:, 5]], lout_float) p_mel_targets = tf.py_func( split_func, [mel_targets, split_infos[:, 1]], lout_float) if mel_targets is not None else mel_targets p_stop_token_targets = tf.py_func( split_func, [stop_token_targets, split_infos[:, 2]], lout_float ) if stop_token_targets is not None else stop_token_targets p_linear_targets = tf.py_func( split_func, [linear_targets, split_infos[:, 3]], lout_float) if linear_targets is not None else linear_targets p_reference_mels = tf.py_func( split_func, [reference_mels, split_infos[:, 4]], lout_float) if reference_mels is not None else reference_mels tower_inputs = [] tower_Lf0 = [] tower_mel_targets = [] tower_stop_token_targets = [] tower_linear_targets = [] tower_reference_mels = [] batch_size = tf.shape(inputs)[0] mel_channels = hp.num_mels linear_channels = hp.num_freq for i in range(hp.tacotron_num_gpus): tower_inputs.append( tf.reshape(p_inputs[i], [batch_size, -1, 345])) tower_Lf0.append(tf.reshape( p_Lf0[i], [batch_size, -1, 2 ])) #2020.7.9.16:15 Notice! 2dims!!!here! if p_mel_targets is not None: tower_mel_targets.append( tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels])) if p_stop_token_targets is not None: tower_stop_token_targets.append( tf.reshape(p_stop_token_targets[i], [batch_size, -1])) if p_linear_targets is not None: tower_linear_targets.append( tf.reshape(p_linear_targets[i], [batch_size, -1, linear_channels])) if p_reference_mels is not None: tower_reference_mels.append( tf.reshape(p_reference_mels[i], [batch_size, -1, mel_channels])) self.tower_decoder_output = [] self.tower_alignments = [] self.tower_stop_token_prediction = [] self.tower_mel_outputs = [] self.tower_linear_outputs = [] self.styleembedding = None tower_embedded_inputs = [] tower_enc_conv_output_shape = [] tower_encoder_outputs = [] tower_residual = [] tower_projected_residual = [] # 1. 
Declare GPU Devices gpus = [ "/gpu:{}".format(i) for i in range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus) ] for i in range(hp.tacotron_num_gpus): with tf.device( tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", worker_device=gpus[i])): with tf.variable_scope('inference') as scope: assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled') if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training: assert global_step is not None #GTA is only used for predicting mels to train Wavenet vocoder, so we omit post processing when doing GTA synthesis post_condition = hp.predict_linear and not gta # Embeddings ==> [batch_size, sequence_length, embedding_dim] #self.embedding_Dense = tf.layers.Dense(units=hp.embedding_dim, activation=tf.nn.relu, name='emb_Dense') # self.embedding_table = tf.get_variable( # 'inputs_embedding', [hp.PPGs_length, hp.embedding_dim], dtype=tf.float32) # embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, tower_inputs[i]) #embedded_inputs = self.embedding_Dense(tower_inputs[i]) embedded_inputs = tower_inputs[i] Lf0s = tower_Lf0[i] if hp.use_multispeaker: self.speaker_embedding_table = tf.get_variable( 'speaker_embedding', [hp.speaker_num, hp.speaker_dim], dtype=tf.float32) speaker_embedding = tf.nn.embedding_lookup( self.speaker_embedding_table, tower_speaker[i]) self.speaker_embedding = speaker_embedding #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder = TacotronEncoder( EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) encoder_outputs = encoder(embedded_inputs, tower_input_lengths[i]) #Concatenate frame-level log-F0 features with the encoder outputs (F0 conditioning) encoder_outputs = tf.concat([encoder_outputs, Lf0s], axis=-1) self.z_embedding = None if hp.use_style_encoder: # Reference input encoder.
reference_embeddings = None if reference_mels is not None: reference_encoder = ReferenceEncoder( hp, is_training, scope='reference_encoder') reference_embeddings = reference_encoder( tower_reference_mels[i]) if hp.style_encoder_type == 'gst': style_encoder = GstEncoder(hp, is_training, scope='gst_encoder') style_outputs = style_encoder(reference_embeddings) elif hp.style_encoder_type == 'vae': style_encoder = VaeEncoder(hp, is_training, scope='vae_encoder') style_outputs = style_encoder( reference_embeddings, batch_size) self.z_mu = style_outputs['z_mu'] self.z_log_var = style_outputs['z_log_var'] self.z_embedding = style_outputs['z_embedding'] else: raise ValueError( "Only 'gst' and 'vae' style encoder types are supported!") style_embeddings = style_outputs[ 'style_embedding'] #[N,1,style_embed_depth] self.styleembedding = style_embeddings if hp.concat_style: style_embeddings = tf.tile( style_embeddings, [1, tf.shape(tower_inputs[i])[1], 1]) encoder_outputs = tf.concat( [encoder_outputs, style_embeddings], axis=-1) else: # encoder_outputs += tf.nn.tanh(style_embeddings)# #changed to concat broadcast_style = tf.tile( style_embeddings, [1, tf.shape(encoder_outputs)[1], 1]) encoder_outputs = tf.concat( [encoder_outputs, broadcast_style], axis=-1) if hp.use_multispeaker: speaker_embedding = tf.expand_dims(speaker_embedding, axis=1) speaker_embedding = tf.tile( speaker_embedding, [1, tf.shape(tower_inputs[i])[1], 1]) encoder_outputs = tf.concat( [encoder_outputs, speaker_embedding], axis=-1) #For shape visualization purpose enc_conv_output_shape = encoder.conv_output_shape #Decoder Parts #Attention Decoder Prenet prenet = Prenet(hp, is_training, scope='decoder_prenet') #Attention Mechanism attention_mechanism = LocationSensitiveSoftAttention( hp.attention_dim, encoder_outputs, tf.reshape(tower_input_lengths[i], [-1])) #Decoder LSTM Cells decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_LSTM') #Frames Projection layer frame_projection = FrameProjection( hp.num_mels * hp.outputs_per_step, scope='linear_transform_projection') #<stop_token> projection layer stop_projection = StopProjection( is_training, shape=hp.outputs_per_step, scope='stop_token_projection') #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell( prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection, self.z_embedding) #Define the helper for our decoder if is_training or is_evaluating or gta: self.helper = TacoTrainingHelper( batch_size, tower_mel_targets[i], hp, gta, is_evaluating, global_step) else: self.helper = TacoTestHelper(batch_size, hp) #initial decoder state decoder_init_state = decoder_cell.zero_state( batch_size=batch_size, dtype=tf.float32) #Only use max iterations at synthesis time max_iters = hp.max_iters if not ( is_training or is_evaluating) else None #Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( CustomDecoder(decoder_cell, self.helper, decoder_init_state), impute_finished=False, maximum_iterations=max_iters, swap_memory=hp.tacotron_swap_with_cpu) # Reshape outputs to be one output per entry #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) #Postnet postnet = Postnet(hparams=hp, training=is_training, output_size=hp.num_mels,
scope='postnet_convolutions') #Compute residual using post-net ==> [batch_size, decoder_steps * r, num_mels] residual = postnet(decoder_output) #Compute the mel spectrogram mel_outputs = decoder_output + residual if post_condition: # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs. post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size, [hp.cbhg_projection, hp.num_mels], hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers, hp.cbhg_highway_units, hp.cbhg_rnn_units, is_training, name='CBHG_postnet') #[batch_size, decoder_steps(mel_frames), cbhg_channels] post_outputs = post_cbhg(mel_outputs, None) #Linear projection of extracted features to make linear spectrogram linear_specs_projection = FrameProjection( hp.num_freq, scope='cbhg_linear_specs_projection') #[batch_size, decoder_steps(linear_frames), num_freq] linear_outputs = linear_specs_projection(post_outputs) #Grab alignments from the final decoder state alignments = tf.transpose( final_decoder_state.alignment_history.stack(), [1, 2, 0]) self.tower_decoder_output.append(decoder_output) self.tower_alignments.append(alignments) self.tower_stop_token_prediction.append( stop_token_prediction) self.tower_mel_outputs.append(mel_outputs) tower_embedded_inputs.append(embedded_inputs) tower_enc_conv_output_shape.append(enc_conv_output_shape) tower_encoder_outputs.append(encoder_outputs) tower_residual.append(residual) if post_condition: self.tower_linear_outputs.append(linear_outputs) log('initialisation done {}'.format(gpus[i])) if is_training: self.ratio = self.helper._ratio self.tower_inputs = tower_inputs self.tower_speaker = tower_speaker self.tower_input_lengths = tower_input_lengths self.tower_mel_targets = tower_mel_targets self.tower_linear_targets = tower_linear_targets self.tower_targets_lengths = tower_targets_lengths self.tower_stop_token_targets = tower_stop_token_targets self.tower_reference_mels = tower_reference_mels self.tower_reference_lengths = tower_reference_lengths self.all_vars = tf.trainable_variables() log('Initialized Tacotron model. Dimensions (? = dynamic shape): ') log(' Train mode: {}'.format(is_training)) log(' Eval mode: {}'.format(is_evaluating)) log(' GTA mode: {}'.format(gta)) log(' Synthesis mode: {}'.format(not ( is_training or is_evaluating))) log(' Input: {}'.format(inputs.shape)) for i in range(hp.tacotron_num_gpus + hp.tacotron_gpu_start_idx): log(' device: {}'.format(i)) log(' embedding: {}'.format( tower_embedded_inputs[i].shape)) log(' enc conv out: {}'.format( tower_enc_conv_output_shape[i])) log(' encoder out: {}'.format( tower_encoder_outputs[i].shape)) log(' decoder out: {}'.format( self.tower_decoder_output[i].shape)) log(' residual out: {}'.format( tower_residual[i].shape)) log(' mel out: {}'.format( self.tower_mel_outputs[i].shape)) if post_condition: log(' linear out: {}'.format( self.tower_linear_outputs[i].shape)) log(' <stop_token> out: {}'.format( self.tower_stop_token_prediction[i].shape)) #1_000_000 is causing syntax problems for some people?! Python please :) log(' Tacotron Parameters {:.3f} Million.'.format( np.sum( [np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
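# --- Editor's illustrative sketch (shapes below are assumptions, not actual model shapes). ---
# Both multi-GPU variants end by logging the trainable-parameter count as
# np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000.
# The same arithmetic on plain shape lists:
import numpy as np

shapes = [[148, 512], [5, 512, 512], [1024, 80]]   # e.g. embedding table, conv kernel, projection
num_params = np.sum([np.prod(s) for s in shapes])  # 75776 + 1310720 + 81920 = 1468416
print('Tacotron Parameters {:.3f} Million.'.format(num_params / 1000000))  # 1.468 Million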