def __init__(self, out_units, attention_cell: AttentionRNN, is_training,
             zoneout_factor_cell=0.0, zoneout_factor_output=0.0,
             lstm_impl=LSTMImpl.LSTMCell, trainable=True, name=None, **kwargs):
    super(DecoderRNNV2, self).__init__(name=name, trainable=trainable, **kwargs)
    self._cell = MultiRNNCell([
        OutputProjectionWrapper(attention_cell, out_units),
        ZoneoutLSTMCell(out_units, is_training, zoneout_factor_cell,
                        zoneout_factor_output, lstm_impl=lstm_impl),
        ZoneoutLSTMCell(out_units, is_training, zoneout_factor_cell,
                        zoneout_factor_output, lstm_impl=lstm_impl),
    ], state_is_tuple=True)
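Every snippet in this collection uses OutputProjectionWrapper the same way: it appends a learned linear projection to a wrapped cell's output at every time step. A minimal self-contained sketch (TF 1.x; the sizes 64/128/80 are illustrative assumptions, not values from the snippets):

import tensorflow as tf
from tensorflow.contrib.rnn import GRUCell, OutputProjectionWrapper

# Project a 128-unit GRU's per-step output down to 80 dimensions.
cell = OutputProjectionWrapper(GRUCell(128), output_size=80)
inputs = tf.placeholder(tf.float32, [None, None, 64])  # [batch, time, features]
outputs, state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
# outputs has shape [batch, time, 80]; the projection weights are shared across steps.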
def inference_decode(enc_outputs, seq_len, embeddings, out_dim):
    tiled_enc_outputs = tf.contrib.seq2seq.tile_batch(enc_outputs, hp.beam_width)
    tiled_seq_len = tf.contrib.seq2seq.tile_batch(seq_len, hp.beam_width)
    beam_batch_size = tf.shape(tiled_enc_outputs)[0]

    # Start tokens, end token.
    start_tokens = tf.tile([hp.START_TOKEN], [beam_batch_size // hp.beam_width])
    end_token = hp.END_TOKEN

    dec_prenet_outputs = DecoderPrenetWrapper(GRUCell(hp.embed_size),
                                              is_training=False,
                                              prenet_sizes=hp.embed_size,
                                              dropout_prob=hp.dropout)
    attention_mechanism = BahdanauAttention(hp.embed_size,
                                            tiled_enc_outputs,
                                            normalize=True,
                                            memory_sequence_length=tiled_seq_len,
                                            probability_fn=tf.nn.softmax)
    attn_cell = AttentionWrapper(dec_prenet_outputs,
                                 attention_mechanism,
                                 alignment_history=True,
                                 output_attention=False)
    concat_cell = ConcatOutputAndAttentionWrapper(attn_cell)

    decoder_cell = MultiRNNCell([
        OutputProjectionWrapper(concat_cell, hp.embed_size),
        ResidualWrapper(GRUCell(hp.embed_size)),
        ResidualWrapper(GRUCell(hp.embed_size))
    ], state_is_tuple=True)
    output_cell = OutputProjectionWrapper(decoder_cell, out_dim)

    initial_state = output_cell.zero_state(batch_size=beam_batch_size, dtype=tf.float32)
    decoder = BeamSearchDecoder(cell=output_cell,
                                embedding=embeddings,
                                start_tokens=start_tokens,
                                end_token=end_token,
                                initial_state=initial_state,
                                beam_width=hp.beam_width)
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, maximum_iterations=hp.max_len)
    return outputs
def __init__(self, out_units, attention_cell: AttentionRNN, trainable=True, name=None, **kwargs):
    super(DecoderRNNV1, self).__init__(name=name, trainable=trainable, **kwargs)
    self._cell = MultiRNNCell([
        OutputProjectionWrapper(attention_cell, out_units),
        ResidualWrapper(GRUCell(out_units)),
        ResidualWrapper(GRUCell(out_units)),
    ], state_is_tuple=True)
def __init__(self, num_units, state_cell, vocab_size, max_utt_len, config,
             num_zt=10, use_peepholes=False, cell_clip=None, initializer=None,
             num_proj=None, proj_clip=None, num_unit_shards=None,
             num_proj_shards=None, forget_bias=1.0, state_is_tuple=True,
             activation=None, reuse=None, name=None, dtype=None):
    self._state_is_tuple = state_is_tuple
    self.num_zt = num_zt
    self.tau = tf.Variable(5.0, name="temperature")
    self.vocab_size = vocab_size
    self.max_utt_len = max_utt_len
    self.config = config

    if self.config.word_weights:
        self.weights = tf.constant(self.config.word_weights)
    else:
        self.weights = self.config.word_weights

    self.decoder_cell_1 = self.get_rnncell('lstm', 200 + num_zt,
                                           keep_prob=self.config.keep_prob)
    self.decoder_cell_1 = OutputProjectionWrapper(self.decoder_cell_1, self.vocab_size)
    self.decoder_cell_2 = self.get_rnncell('lstm', 2 * (200 + num_zt),
                                           keep_prob=self.config.keep_prob)
    self.decoder_cell_2 = OutputProjectionWrapper(self.decoder_cell_2, self.vocab_size)
    self.state_cell = state_cell
def training_decode(enc_outputs, seq_len, helper, out_dim):
    dec_prenet_outputs = DecoderPrenetWrapper(GRUCell(hp.embed_size),
                                              is_training=True,
                                              prenet_sizes=hp.embed_size,
                                              dropout_prob=hp.dropout)
    attention_mechanism = BahdanauAttention(hp.embed_size,
                                            enc_outputs,
                                            normalize=True,
                                            memory_sequence_length=seq_len,
                                            probability_fn=tf.nn.softmax)
    attn_cell = AttentionWrapper(dec_prenet_outputs,
                                 attention_mechanism,
                                 alignment_history=True,
                                 output_attention=False)
    concat_cell = ConcatOutputAndAttentionWrapper(attn_cell)

    decoder_cell = MultiRNNCell([
        OutputProjectionWrapper(concat_cell, hp.embed_size),
        ResidualWrapper(GRUCell(hp.embed_size)),
        ResidualWrapper(GRUCell(hp.embed_size))
    ], state_is_tuple=True)
    output_cell = OutputProjectionWrapper(decoder_cell, out_dim)

    initial_state = output_cell.zero_state(batch_size=tf.shape(enc_outputs)[0],
                                           dtype=tf.float32)
    decoder = BasicDecoder(cell=output_cell, helper=helper, initial_state=initial_state)
    (outputs, _), last_state, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder=decoder, maximum_iterations=hp.max_len)

    # For the attention plot.
    alignments = tf.transpose(last_state[0].alignment_history.stack(), [1, 2, 0])
    return outputs, alignments
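training_decode() accepts any tf.contrib.seq2seq Helper. A hedged wiring sketch for teacher forcing; dec_inputs, dec_lengths, and out_dim are assumed names, not ones defined in the snippet:

# Sketch: teacher-forced training with the function above (assumed tensors).
helper = tf.contrib.seq2seq.TrainingHelper(
    inputs=dec_inputs,               # [batch, time, out_dim] ground-truth frames (assumed)
    sequence_length=dec_lengths)     # [batch] decoder lengths (assumed)
outputs, alignments = training_decode(enc_outputs, seq_len, helper, out_dim)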
def __init__(self, out_units, attention_cell: AttentionRNN, gru_impl=GRUImpl.GRUCell,
             trainable=True, name=None, **kwargs):
    super(DecoderRNNV1, self).__init__(name=name, trainable=trainable, **kwargs)
    self._cell = tf.nn.rnn_cell.MultiRNNCell([
        OutputProjectionWrapper(attention_cell, out_units),
        tf.nn.rnn_cell.ResidualWrapper(gru_cell_factory(gru_impl, out_units)),
        tf.nn.rnn_cell.ResidualWrapper(gru_cell_factory(gru_impl, out_units)),
    ], state_is_tuple=True)
def _rnn(self, conv_inputs, output_size, training_placeholder, sequence_lengths,
         hidden_size, num_layers=4):
    """Takes the output of the convolutional tower and passes it through a multilayer LSTM.

    :param sequence_lengths: vector containing the lengths of the sequences in the batch
    :param num_layers: number of layers in the multilayer RNN (n-1 plain layers and one
        output layer with a projection function)
    :param conv_inputs: NOT reshaped output of the convolutional tower
        (shaped [batch, l, h, w, channels])
    :param output_size: number of neurons in the last layer, equal to the number of
        possible class_ids
    :param training_placeholder: a placeholder indicating training mode
    :return: a tensor [batch_size, timesteps, num_classes] holding the LSTM output at
        each timestep
    """
    # During training keep_prob=0.85; during validation keep_prob=1.0.
    keep_prob = tf.maximum(1 - tf.to_float(training_placeholder), 0.85)

    with tf.variable_scope('sequential', initializer=xavier_initializer()):
        # Flatten the h, w, c dimensions.
        flattened = tf.squeeze(tf.squeeze(conv_inputs, -2), -2)

        # Only forward LSTMs are used.
        fw_cells = []
        for i in range(num_layers):
            fw_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
            if i < num_layers - 1:
                fw_cell = tf.nn.rnn_cell.DropoutWrapper(fw_cell, output_keep_prob=keep_prob)
            if i == num_layers - 1:
                fw_cell = OutputProjectionWrapper(fw_cell, output_size, tf.nn.tanh)
            fw_cells.append(fw_cell)
        fw_cells = tf.nn.rnn_cell.MultiRNNCell(fw_cells)

        outputs, _ = tf.nn.dynamic_rnn(cell=fw_cells,
                                       inputs=flattened,
                                       sequence_length=sequence_lengths,
                                       dtype=tf.float32)
    return outputs
def single_cell(num_units, is_train, cell_type, dropout=0.0, forget_bias=0.0, dim_project=None):
    """Create an instance of a single RNN cell."""
    # dropout (= 1 - keep_prob) is set to 0 during eval and infer.
    dropout = dropout if is_train else 0.0

    # Cell type.
    if cell_type == "lstm":
        single_cell = tf.contrib.rnn.LSTMCell(num_units,
                                              use_peepholes=True,
                                              num_proj=dim_project,
                                              cell_clip=50.0,
                                              forget_bias=1.0)
    elif cell_type == "cudnn_lstm":
        single_cell = tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(num_units)
    elif cell_type == "gru":
        single_cell = GRUCell(num_units)
    elif cell_type == "LSTMBlockCell":
        single_cell = tf.contrib.rnn.LSTMBlockCell(num_units, forget_bias=forget_bias)
    elif cell_type == "layer_norm_lstm":
        single_cell = LayerNormBasicLSTMCell(num_units,
                                             forget_bias=forget_bias,
                                             layer_norm=True)
    else:
        raise ValueError("Unknown unit type %s!" % cell_type)

    # Note: the "lstm" branch already applies num_proj, so for that cell type this
    # wraps a second projection on top of the LSTM's internal one.
    if dim_project:
        single_cell = OutputProjectionWrapper(cell=single_cell, output_size=dim_project)

    if dropout > 0.0:
        single_cell = DropoutWrapper(cell=single_cell, input_keep_prob=(1.0 - dropout))

    return single_cell
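A hedged usage sketch for the factory above: stacking several single_cell instances into a MultiRNNCell, projecting only the top layer. build_stack and its defaults are assumptions, not part of the original code:

# Sketch only: assumes single_cell() as defined above is in scope.
import tensorflow as tf

def build_stack(num_layers, num_units, is_train, cell_type="lstm",
                dropout=0.2, dim_project=None):
    cells = []
    for i in range(num_layers):
        # Only project on the last layer, if a projection size is given.
        proj = dim_project if i == num_layers - 1 else None
        cells.append(single_cell(num_units, is_train, cell_type,
                                 dropout=dropout, dim_project=proj))
    return tf.contrib.rnn.MultiRNNCell(cells)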
def decode(helper, scope, reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        rnn_layers = []
        for i in range(n_decoder_layers):
            # Create a GRUCell with dropout. Do not forget to set the reuse flag properly.
            cell = tf.nn.rnn_cell.GRUCell(hidden_size, reuse=reuse)
            cell = tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=self.dropout_ph)
            rnn_layers.append(cell)
        decoder_cell = MultiRNNCell(rnn_layers)

        # Create a projection wrapper.
        decoder_cell = OutputProjectionWrapper(decoder_cell, vocab_size, reuse=reuse)

        # Create a BasicDecoder, passing the defined cell, a helper, and the initial state.
        # The initial state should be equal to the final state of the encoder!
        initial_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)
        decoder = BasicDecoder(decoder_cell, helper, initial_state=initial_state)

        # The first returned argument of dynamic_decode contains two fields:
        #   * rnn_output (predicted logits)
        #   * sample_id (predictions)
        max_iters = tf.reduce_max(self.ground_truth_lengths)
        outputs, _, _ = dynamic_decode(decoder=decoder,
                                       maximum_iterations=max_iters,
                                       output_time_major=False,
                                       impute_finished=True)
    return outputs
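decode() expects a seq2seq Helper. A hedged sketch of the two helpers typically paired with it; ground_truth_embedded, embeddings, batch_size, start_id, and end_id are assumed names:

import tensorflow as tf
from tensorflow.contrib.seq2seq import TrainingHelper, GreedyEmbeddingHelper

# Training: feed the ground-truth sequence (teacher forcing).
train_helper = TrainingHelper(inputs=ground_truth_embedded,          # [batch, time, emb] (assumed)
                              sequence_length=ground_truth_lengths)  # [batch]

# Inference: feed back the embedding of the previous prediction.
infer_helper = GreedyEmbeddingHelper(embedding=embeddings,           # [vocab, emb] table (assumed)
                                     start_tokens=tf.fill([batch_size], start_id),
                                     end_token=end_id)

train_outputs = decode(train_helper, 'decode')
infer_outputs = decode(infer_helper, 'decode', reuse=True)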
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries
        in the mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out
        is number of steps in the output time series, F is num_freq, and values are
        entries in the linear spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        # embedding_table = tf.get_variable(
        #     'embedding', [len(symbols), 256], dtype=tf.float32,
        #     initializer=tf.truncated_normal_initializer(stddev=0.5))
        # embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 256]
        # embedded_inputs = inputs

        # Encoder
        # n_fft = (self._hparams.num_src_freq - 1) * 2
        # in_layer_size = n_fft
        in_layer_size = self._hparams.num_src_freq
        prenet_outputs = prenet(inputs, is_training,
                                layer_sizes=[in_layer_size, 128])  # [N, T_in, 128]
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)  # [N, T_in, 256]

        # Attention
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(GRUCell(256), is_training),
            BahdanauAttention(256, encoder_outputs),
            alignment_history=True,
            output_attention=False)  # [N, T_in, 256]

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, 256),
            ResidualWrapper(GRUCell(256)),
            ResidualWrapper(GRUCell(256))
        ], state_is_tuple=True)  # [N, T_in, 256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry.
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)  # [N, T_out, 256]
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets

        log('Initialized Tacotron model. Dimensions:')
        log(' input: %d' % inputs.shape[-1])
        log(' prenet out: %d' % prenet_outputs.shape[-1])
        log(' encoder out: %d' % encoder_outputs.shape[-1])
        log(' attention out: %d' % attention_cell.output_size)
        log(' concat attn & out: %d' % concat_cell.output_size)
        log(' decoder cell out: %d' % decoder_cell.output_size)
        log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
        log(' decoder out (1 frame): %d' % mel_outputs.shape[-1])
        log(' postnet out: %d' % post_outputs.shape[-1])
        log(' linear out: %d' % linear_outputs.shape[-1])
def initialize(self, inputs, input_lengths, inputs_jp=None, mel_targets=None, linear_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries
        in the mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out
        is number of steps in the output time series, F is num_freq, and values are
        entries in the linear spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        is_teacher_force_generating = mel_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        # embedding_table = tf.get_variable(
        #     'text_embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
        #     initializer=tf.truncated_normal_initializer(stddev=0.5))
        # embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 256]

        if hp.use_gst:
            # Global style tokens (GST)
            gst_tokens = tf.get_variable(
                'style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            self.gst_tokens = gst_tokens

        # Encoder
        # prenet_outputs = prenet(embedded_inputs, is_training)
        prenet_outputs = prenet(inputs, is_training)  # [N, T_in, 128]
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)  # [N, T_in, 256]

        if inputs_jp is not None:
            # Reference encoder
            refnet_outputs = reference_encoder(
                inputs_jp,
                filters=hp.reference_filters,
                kernel_size=(3, 3),
                strides=(2, 2),
                encoder_cell=GRUCell(hp.reference_depth),
                is_training=is_training)  # [N, 128]
            self.refnet_outputs = refnet_outputs

            if hp.use_gst:
                # Style attention
                style_attention = MultiheadAttention(
                    tf.expand_dims(refnet_outputs, axis=1),  # [N, 1, 128]
                    tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                    [batch_size, 1, 1])),  # [N, hp.num_gst, 256/hp.num_heads]
                    num_heads=hp.num_heads,
                    num_units=hp.style_att_dim,
                    attention_type=hp.style_att_type)
                style_embeddings = style_attention.multi_head_attention()  # [N, 1, 256]
            else:
                style_embeddings = tf.expand_dims(refnet_outputs, axis=1)  # [N, 1, 128]
        else:
            print("Use random weights for GST.")
            random_weights = tf.random_uniform([hp.num_heads, hp.num_gst],
                                               maxval=1.0, dtype=tf.float32)
            random_weights = tf.nn.softmax(random_weights, name="random_weights")
            style_embeddings = tf.matmul(random_weights, tf.nn.tanh(gst_tokens))
            style_embeddings = tf.reshape(
                style_embeddings,
                [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]])

        # Add the style embedding to every text encoder state.
        style_embeddings = tf.tile(style_embeddings,
                                   [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]
        encoder_outputs = tf.concat([encoder_outputs, style_embeddings], axis=-1)

        # Attention
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_depth),
            BahdanauAttention(hp.attention_depth, encoder_outputs,
                              memory_sequence_length=input_lengths),
            alignment_history=True,
            output_attention=False)  # [N, T_in, 256]

        # Concatenate attention context vector and RNN cell output.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, hp.rnn_depth),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1))
        ], state_is_tuple=True)  # [N, T_in, 256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        if is_training or is_teacher_force_generating:
            helper = TacoTrainingHelper(inputs, mel_targets, hp)
        else:
            helper = TacoTestHelper(batch_size, hp)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry.
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)  # [N, T_out, 256]
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.encoder_outputs = encoder_outputs
        self.style_embeddings = style_embeddings
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.inputs_jp = inputs_jp

        log('Initialized Tacotron model. Dimensions:')
        log(' style embedding: %d' % style_embeddings.shape[-1])
        log(' prenet out: %d' % prenet_outputs.shape[-1])
        log(' encoder out: %d' % encoder_outputs.shape[-1])
        log(' attention out: %d' % attention_cell.output_size)
        log(' concat attn & out: %d' % concat_cell.output_size)
        log(' decoder cell out: %d' % decoder_cell.output_size)
        log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
        log(' decoder out (1 frame): %d' % mel_outputs.shape[-1])
        log(' postnet out: %d' % post_outputs.shape[-1])
        log(' linear out: %d' % linear_outputs.shape[-1])
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, reference_mel=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries
        in the mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out
        is number of steps in the output time series, F is num_freq, and values are
        entries in the linear spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'text_embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 256]

        # Global style tokens (GST)
        gst_tokens = tf.get_variable(
            'style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        self.gst_tokens = gst_tokens

        # Encoder
        encoder_outputs = encoder(embedded_inputs, input_lengths, is_training,
                                  512, 5, 256)  # [N, T_in, 256]

        if is_training:
            reference_mel = mel_targets

        if reference_mel is not None:
            # Reference encoder
            refnet_outputs = reference_encoder(
                reference_mel,
                filters=hp.ref_filters,
                kernel_size=(3, 3),
                strides=(2, 2),
                encoder_cell=GRUCell(hp.ref_depth),
                is_training=is_training)  # [N, 128]
            self.refnet_outputs = refnet_outputs

            # Style attention
            style_attention = MultiheadAttention(
                tf.expand_dims(refnet_outputs, axis=1),  # [N, 1, 128]
                tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                [batch_size, 1, 1])),  # [N, hp.num_gst, 256/hp.num_heads]
                num_heads=hp.num_heads,
                num_units=hp.style_att_dim,
                attention_type=hp.style_att_type)
            embedded_tokens = style_attention.multi_head_attention()  # [N, 1, 256]
        else:
            random_weights = tf.constant(
                hp.num_heads * [[0] * (hp.gst_index - 1) + [1] +
                                [0] * (hp.num_gst - hp.gst_index)],
                dtype=tf.float32)
            random_weights = tf.nn.softmax(random_weights, name="random_weights")
            # gst_tokens = tf.tile(gst_tokens, [1, hp.num_heads])
            embedded_tokens = tf.matmul(random_weights, tf.nn.tanh(gst_tokens))
            embedded_tokens = hp.gst_scale * embedded_tokens
            embedded_tokens = tf.reshape(
                embedded_tokens,
                [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]])

        # Add the style embedding to every text encoder state.
        style_embeddings = tf.tile(embedded_tokens,
                                   [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]
        encoder_outputs = tf.concat([encoder_outputs, style_embeddings], axis=-1)

        # Attention
        attention_mechanism = LocationSensitiveAttention(
            128, encoder_outputs,
            hparams=hp,
            is_training=is_training,
            mask_encoder=True,
            memory_sequence_length=input_lengths,
            smoothing=False,
            cumulate_weights=True)
        decoder_lstm = [
            ZoneoutLSTMCell(1024, is_training,
                            zoneout_factor_cell=0.1,
                            zoneout_factor_output=0.1,
                            name='decoder_LSTM_{}'.format(i + 1))
            for i in range(2)
        ]
        decoder_lstm = MultiRNNCell(decoder_lstm, state_is_tuple=True)
        decoder_init_state = decoder_lstm.zero_state(
            batch_size=batch_size, dtype=tf.float32)  # not present in tensorflow1

        attention_cell = AttentionWrapper(
            decoder_lstm,
            attention_mechanism,
            initial_cell_state=decoder_init_state,
            alignment_history=True,
            output_attention=False)

        # attention_state_size = 256
        # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
        # dec_outputs = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths)
        dec_outputs_cell = OutputProjectionWrapper(attention_cell,
                                                   hp.num_mels * hp.outputs_per_step)

        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp)
        else:
            helper = TacoTestHelper(batch_size, hp)

        decoder_init_state = dec_outputs_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry.
        decoder_mel_outputs = tf.reshape(
            decoder_outputs[:, :, :hp.num_mels * hp.outputs_per_step],
            [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Postnet: five conv1d layers predicting a residual.
        x = decoder_mel_outputs
        for i in range(5):
            activation = tf.nn.tanh if i != 4 else None
            x = tf.layers.conv1d(x, filters=512, kernel_size=5, padding='same',
                                 activation=activation, name='Postnet_{}'.format(i))
            x = tf.layers.batch_normalization(x, training=is_training)
            x = tf.layers.dropout(x, rate=0.5, training=is_training,
                                  name='Postnet_dropout_{}'.format(i))
        residual = tf.layers.dense(x, hp.num_mels, name='residual_projection')
        mel_outputs = decoder_mel_outputs + residual

        # Add post-processing CBHG:
        # mel_outputs: (N, T, num_mels)
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_mel_outputs = decoder_mel_outputs
        self.mel_outputs = mel_outputs
        self.encoder_outputs = encoder_outputs
        self.style_embeddings = style_embeddings
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.reference_mel = reference_mel
        self.all_vars = tf.trainable_variables()

        log('Initialized Tacotron model. Dimensions:')
        log(' text embedding: %d' % embedded_inputs.shape[-1])
        log(' style embedding: %d' % style_embeddings.shape[-1])
        # log(' prenet out: %d' % prenet_outputs.shape[-1])
        log(' encoder out: %d' % encoder_outputs.shape[-1])
        log(' attention out: %d' % attention_cell.output_size)
        # log(' concat attn & out: %d' % concat_cell.output_size)
        log(' decoder cell out: %d' % dec_outputs_cell.output_size)
        log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
        log(' decoder out (1 frame): %d' % mel_outputs.shape[-1])
        log(' postnet out: %d' % post_outputs.shape[-1])
        log(' linear out: %d' % linear_outputs.shape[-1])
def initialize(self, inputs, input_lengths, num_speakers, speaker_id,
               mel_targets=None, linear_targets=None, loss_coeff=None,
               rnn_decoder_test_mode=False, is_randomly_initialized=False):
    is_training2 = linear_targets is not None  # this becomes True at test time too; is that intended???
    is_training = not rnn_decoder_test_mode
    self.is_randomly_initialized = is_randomly_initialized

    with tf.variable_scope('inference') as scope:
        hp = self._hparams
        batch_size = tf.shape(inputs)[0]

        # Embeddings(256)
        char_embed_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_size], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))

        zero_pad = True
        if zero_pad:
            # Logic borrowed from the transformer implementation:
            # <PAD> (index 0) gets an embedding fixed at 0 that training never updates,
            # i.e. the first row (<PAD>) of the variable created by get_variable above
            # is never used.
            char_embed_table = tf.concat(
                (tf.zeros(shape=[1, hp.embedding_size]), char_embed_table[1:, :]), 0)

        # [N, T_in, embedding_size]
        char_embedded_inputs = tf.nn.embedding_lookup(char_embed_table, inputs)

        self.num_speakers = num_speakers
        if self.num_speakers > 1:
            if hp.speaker_embedding_size != 1:  # speaker_embedding_size = f(16)
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                # [N, T_in, speaker_embedding_size]
                speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id)

            if hp.model_type == 'deepvoice':
                if hp.speaker_embedding_size == 1:
                    before_highway = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_prenet_sizes[-1],
                        "before_highway")  # 'enc_prenet_sizes': [f(256), f(128)]
                    encoder_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_rnn_size * 2, "encoder_rnn_init_state")
                    attention_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.attention_state_size, "attention_rnn_init_state")
                    decoder_rnn_init_states = [
                        get_embed(speaker_id, self.num_speakers, hp.dec_rnn_size,
                                  "decoder_rnn_init_states{}".format(idx + 1))
                        for idx in range(hp.dec_layer_num)
                    ]
                else:
                    deep_dense = lambda x, dim: tf.layers.dense(
                        x, dim, activation=tf.nn.softsign)  # softsign: x / (abs(x) + 1)
                    before_highway = deep_dense(speaker_embed, hp.enc_prenet_sizes[-1])
                    encoder_rnn_init_state = deep_dense(speaker_embed, hp.enc_rnn_size * 2)
                    attention_rnn_init_state = deep_dense(speaker_embed, hp.attention_state_size)
                    decoder_rnn_init_states = [
                        deep_dense(speaker_embed, hp.dec_rnn_size)
                        for _ in range(hp.dec_layer_num)
                    ]
                speaker_embed = None  # deepvoice does not use speaker_embed directly
            elif hp.model_type == 'simple':
                # The simple model feeds speaker_embed into DecoderPrenetWrapper and
                # ConcatOutputAndAttentionWrapper, where it is concatenated.
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None
            else:
                raise Exception(" [!] Unknown multi-speaker model type: {}".format(hp.model_type))
        else:
            # Case self.num_speakers == 1
            speaker_embed = None
            before_highway = None
            encoder_rnn_init_state = None  # init state of the bidirectional GRU
            attention_rnn_init_state = None
            decoder_rnn_init_states = None

        ##############
        # Encoder
        ##############

        # [N, T_in, enc_prenet_sizes[-1]]
        prenet_outputs = prenet(
            char_embedded_inputs, is_training,
            hp.enc_prenet_sizes, hp.dropout_prob,
            scope='prenet')  # 'enc_prenet_sizes': [f(256), f(128)], dropout_prob = 0.5

        # ==> (N, T_in, 128); enc_rnn_size = 128
        encoder_outputs = cbhg(
            prenet_outputs, input_lengths, is_training,
            hp.enc_bank_size, hp.enc_bank_channel_size,
            hp.enc_maxpool_width, hp.enc_highway_depth, hp.enc_rnn_size,
            hp.enc_proj_sizes, hp.enc_proj_width,
            scope="encoder_cbhg",
            before_highway=before_highway,
            encoder_rnn_init_state=encoder_rnn_init_state)

        ##############
        # Attention
        ##############

        # For manual control of attention.
        self.is_manual_attention = tf.placeholder(tf.bool, shape=(), name='is_manual_attention')
        self.manual_alignments = tf.placeholder(tf.float32, shape=[None, None, None],
                                                name="manual_alignments")

        # single: attention_size = 128
        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=False)
        elif hp.attention_type == 'bah_mon_norm':  # added by hccho
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=True)
        elif hp.attention_type == 'loc_sen':  # Location Sensitive Attention
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'gmm':  # GMM Attention
            attention_mechanism = GmmAttention(
                hp.attention_size, memory=encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah_mon_norm_hccho':
            attention_mechanism = BahdanauMonotonicAttention_hccho(
                hp.attention_size, encoder_outputs, normalize=True)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        else:
            raise Exception(" [!] Unknown attention type: {}".format(hp.attention_type))

        # Combine DecoderPrenetWrapper and attention_mechanism into an AttentionWrapper.
        # carpedm20 re-implemented AttentionWrapper from the tensorflow source, while
        # Keith Ito simply used tensorflow's AttentionWrapper.
        # Note output_attention=False, and attention_layer_size is not given, so
        # attention = context vector.
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_state_size),
            attention_mechanism,
            self.is_manual_attention,
            self.manual_alignments,
            initial_cell_state=attention_rnn_init_state,
            alignment_history=True,
            output_attention=False)

        # attention_state_size = 256
        dec_prenet_outputs = DecoderPrenetWrapper(
            attention_cell, speaker_embed, is_training,
            hp.dec_prenet_sizes, hp.dropout_prob)  # dec_prenet_sizes = [f(256), f(128)]

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        # [N, T_in, attention_size + attention_state_size]
        # From the AttentionWrapperState members (attention, cell_state, ...) that
        # dec_prenet_outputs passes to the next cell, attention and output are concatenated
        # into the new output. Since output equals cell_state here:
        # concat [ output(=cell_state) | attention ]
        concat_cell = ConcatOutputAndAttentionWrapper(
            dec_prenet_outputs,
            embed_to_concat=speaker_embed)  # builds the new output as concat(output, attention, speaker_embed)

        # Decoder (layers specified bottom to top): dec_rnn_size = 256
        cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)]
        # OutputProjectionWrapper does not seem to be mentioned in the paper...
        for _ in range(hp.dec_layer_num):  # hp.dec_layer_num = 2
            cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

        # [N, T_in, 256]
        decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        # Could be modified so that a stop token is also emitted:
        # (hp.num_mels + 1) * hp.reduction_factor ???
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.reduction_factor)

        # Calling zero_state here also includes the value already supplied to the
        # AttentionWrapper above.
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        if hp.model_type == "deepvoice":
            # decoder_init_state[0] : AttentionWrapperState
            #   = cell_state + attention + time + alignments + alignment_history
            # decoder_init_state[0][0] = attention_rnn_init_state
            #   (already applied via the AttentionWrapper's initial_cell_state)
            decoder_init_state = list(decoder_init_state)
            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                shape2 = cell.get_shape().as_list()
                if shape1 != shape2:
                    raise Exception(" [!] Shape {} and {} should be equal".format(shape1, shape2))
                decoder_init_state[idx + 1] = cell
            decoder_init_state = tuple(decoder_init_state)

        if is_training2:
            # rnn_decoder_test_mode = True in test mode, False in train mode.
            helper = TacoTrainingHelper(
                inputs, mel_targets, hp.num_mels, hp.reduction_factor,
                rnn_decoder_test_mode)  # inputs is only used to compute batch_size
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # max_iters = 200

        # [N, T_out, M]
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])

        # Add post-processing CBHG:
        # [N, T_out, 256]
        # post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
        post_outputs = cbhg(
            mel_outputs, None, is_training,
            hp.post_bank_size, hp.post_bank_channel_size,
            hp.post_maxpool_width, hp.post_highway_depth, hp.post_rnn_size,
            hp.post_proj_sizes, hp.post_proj_width,
            scope='post_cbhg')

        if speaker_embed is not None and hp.model_type == 'simple':
            expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
            tiled_speaker_embedding = tf.tile(
                expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])
            # [N, T_out, 256 + alpha]
            post_outputs = tf.concat([tiled_speaker_embedding, post_outputs], axis=-1)

        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

        # Grab alignments from the final decoder state:
        # Since the MultiRNNCell has 3 layers, final_decoder_state is a tuple of
        # length 3 ==> final_decoder_state[0].
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(),
            [1, 2, 0])  # batch_size, text length (encoder), target length (decoder)

        self.inputs = inputs
        self.speaker_id = speaker_id
        self.input_lengths = input_lengths
        self.loss_coeff = loss_coeff
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.final_decoder_state = final_decoder_state

        log('=' * 40)
        log(' model_type: %s' % hp.model_type)
        log('=' * 40)
        log('Initialized Tacotron model. Dimensions:')
        log(' embedding: %d' % char_embedded_inputs.shape[-1])
        if speaker_embed is not None:
            log(' speaker embedding: %d' % speaker_embed.shape[-1])
        else:
            log(' speaker embedding: None')
        log(' prenet out: %d' % prenet_outputs.shape[-1])
        log(' encoder out: %d' % encoder_outputs.shape[-1])
        log(' attention out: %d' % attention_cell.output_size)
        log(' concat attn & out: %d' % concat_cell.output_size)
        log(' decoder cell out: %d' % decoder_cell.output_size)
        log(' decoder out (%d frames): %d' % (hp.reduction_factor, decoder_outputs.shape[-1]))
        log(' decoder out (1 frame): %d' % mel_outputs.shape[-1])
        log(' postnet out: %d' % post_outputs.shape[-1])
        log(' linear out: %d' % linear_outputs.shape[-1])
def get_batch(batch, size=5):
    low = (batch * size) % (40 - size)
    high = low + size
    return t_vals[low:high], series[low:high]

n_steps = 20
n_inputs = 1
n_neurons = 100
n_outputs = 1

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_steps, n_outputs])

cell = OutputProjectionWrapper(BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu),
                               output_size=n_outputs)
outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

loss = tf.reduce_mean(tf.square(outputs - y), name='loss')
loss_summary = tf.summary.scalar('loss', loss)
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()

batch_size = 100
n_iterations = 20000

with tf.Session() as sess:
    sess.run(init)
    # (The training loop is truncated in the source.)
def embedding_attention_seq2seq_context(encoder_inputs, decoder_inputs, cell,
                                        num_encoder_symbols, num_decoder_symbols,
                                        embedding_size, num_heads=1,
                                        output_projection=None, feed_previous=False,
                                        dtype=dtypes.float32, scope=None):
    """A seq2seq architecture with two encoders, one for context, one for input DA.
    The decoder uses twice the cell size. Code adapted from TensorFlow examples."""
    with vs.variable_scope(scope or "embedding_attention_seq2seq_context"):
        # Split context and real inputs into separate vectors.
        # (Integer division is required here under Python 3.)
        context_inputs = encoder_inputs[0:len(encoder_inputs) // 2]
        encoder_inputs = encoder_inputs[len(encoder_inputs) // 2:]

        # Build separate encoders.
        encoder_cell = EmbeddingWrapper(cell, num_encoder_symbols, embedding_size)
        with vs.variable_scope("context_rnn") as scope:
            context_outputs, context_states = tf06s2s.rnn(
                encoder_cell, context_inputs, dtype=dtype, scope=scope)
        with vs.variable_scope("input_rnn") as scope:
            encoder_outputs, encoder_states = tf06s2s.rnn(
                encoder_cell, encoder_inputs, dtype=dtype, scope=scope)

        # Concatenate outputs & states,
        # adding positional arguments and concatenating output, cell and hidden states.
        encoder_outputs = [
            array_ops.concat([co, eo], axis=1, name="context-and-encoder-output")
            for co, eo in zip(context_outputs, encoder_outputs)
        ]
        encoder_states = [
            (array_ops.concat([c1, c2], axis=1), array_ops.concat([h1, h2], axis=1))
            for (c1, h1), (c2, h2) in zip(context_states, encoder_states)
        ]

        # Calculate a concatenation of encoder outputs to put attention on.
        top_states = [
            array_ops.reshape(e, [-1, 1, cell.output_size * 2])
            for e in encoder_outputs
        ]
        # Added positional arguments, as it was taking axis to be the values.
        attention_states = array_ops.concat(axis=1, values=top_states)

        # Change the decoder cell to accommodate the wider input.
        # TODO this will work for BasicLSTMCell and GRUCell, but not for others.
        cell = type(cell)(num_units=(cell.output_size * 2))

        # Decoder.
        output_size = None
        if output_projection is None:
            cell = OutputProjectionWrapper(cell, num_decoder_symbols)
            output_size = num_decoder_symbols

        if isinstance(feed_previous, bool):
            return tf06s2s.embedding_attention_decoder(
                decoder_inputs, encoder_states[-1], attention_states, cell,
                num_decoder_symbols, embedding_size, num_heads, output_size,
                output_projection, feed_previous)
        else:
            # If feed_previous is a Tensor, we construct 2 graphs and use cond.
            outputs1, states1 = tf06s2s.embedding_attention_decoder(
                decoder_inputs, encoder_states[-1], attention_states, cell,
                num_decoder_symbols, embedding_size, num_heads, output_size,
                output_projection, True)
            vs.get_variable_scope().reuse_variables()
            outputs2, states2 = tf06s2s.embedding_attention_decoder(
                decoder_inputs, encoder_states[-1], attention_states, cell,
                num_decoder_symbols, embedding_size, num_heads, output_size,
                output_projection, False)
            outputs = control_flow_ops.cond(feed_previous, lambda: outputs1, lambda: outputs2)
            states = control_flow_ops.cond(feed_previous, lambda: states1, lambda: states2)
            return outputs, states
def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell,
                                num_encoder_symbols, num_decoder_symbols,
                                embedding_size, num_heads=1,
                                output_projection=None, feed_previous=False,
                                dtype=dtypes.float32, scope=None):
    """Embedding sequence-to-sequence model with attention.

    This model first embeds encoder_inputs by a newly created embedding (of shape
    [num_encoder_symbols x cell.input_size]). Then it runs an RNN to encode embedded
    encoder_inputs into a state vector. It keeps the outputs of this RNN at every step
    to use for attention later. Next, it embeds decoder_inputs by another newly created
    embedding (of shape [num_decoder_symbols x cell.input_size]). Then it runs attention
    decoder, initialized with the last encoder state, on embedded decoder_inputs and
    attending to encoder outputs.

    Args:
      encoder_inputs: a list of 1D int32 Tensors of shape [batch_size].
      decoder_inputs: a list of 1D int32 Tensors of shape [batch_size].
      cell: RNNCell defining the cell function and size.
      num_encoder_symbols: integer; number of symbols on the encoder side.
      num_decoder_symbols: integer; number of symbols on the decoder side.
      num_heads: number of attention heads that read from attention_states.
      output_projection: None or a pair (W, B) of output projection weights and biases;
        W has shape [cell.output_size x num_decoder_symbols] and B has shape
        [num_decoder_symbols]; if provided and feed_previous=True, each fed previous
        output will first be multiplied by W and added B.
      feed_previous: Boolean or scalar Boolean Tensor; if True, only the first of
        decoder_inputs will be used (the "GO" symbol), and all other decoder inputs
        will be taken from previous outputs (as in embedding_rnn_decoder). If False,
        decoder_inputs are used as given (the standard decoder case).
      dtype: The dtype of the initial RNN state (default: tf.float32).
      scope: VariableScope for the created subgraph; defaults to
        "embedding_attention_seq2seq".

    Returns:
      outputs: A list of the same length as decoder_inputs of 2D Tensors with shape
        [batch_size x num_decoder_symbols] containing the generated outputs.
      states: The state of each decoder cell in each time-step. This is a list with
        length len(decoder_inputs) -- one item for each time-step. Each item is a 2D
        Tensor of shape [batch_size x cell.state_size].
    """
    with vs.variable_scope(scope or "embedding_attention_seq2seq"):
        # Encoder.
        encoder_cell = EmbeddingWrapper(cell, num_encoder_symbols, embedding_size)
        encoder_outputs, encoder_states = rnn(encoder_cell, encoder_inputs, dtype=dtype)

        # First calculate a concatenation of encoder outputs to put attention on.
        top_states = [array_ops.reshape(e, [-1, 1, cell.output_size])
                      for e in encoder_outputs]
        attention_states = array_ops.concat(top_states, 1)

        # Decoder.
        output_size = None
        if output_projection is None:
            cell = OutputProjectionWrapper(cell, num_decoder_symbols)
            output_size = num_decoder_symbols

        if isinstance(feed_previous, bool):
            return embedding_attention_decoder(
                decoder_inputs, encoder_states[-1], attention_states, cell,
                num_decoder_symbols, embedding_size, num_heads, output_size,
                output_projection, feed_previous)
        else:
            # If feed_previous is a Tensor, we construct 2 graphs and use cond.
            outputs1, states1 = embedding_attention_decoder(
                decoder_inputs, encoder_states[-1], attention_states, cell,
                num_decoder_symbols, embedding_size, num_heads, output_size,
                output_projection, True)
            vs.get_variable_scope().reuse_variables()
            outputs2, states2 = embedding_attention_decoder(
                decoder_inputs, encoder_states[-1], attention_states, cell,
                num_decoder_symbols, embedding_size, num_heads, output_size,
                output_projection, False)
            outputs = control_flow_ops.cond(feed_previous, lambda: outputs1, lambda: outputs2)
            states = control_flow_ops.cond(feed_previous, lambda: states1, lambda: states2)
            return outputs, states
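A hedged usage sketch for embedding_attention_seq2seq(); the vocabulary sizes, sequence length, and placeholder names are assumptions, not values from the source:

import tensorflow as tf
from tensorflow.contrib.rnn import GRUCell

seq_len = 10
# Legacy-style seq2seq: one int32 placeholder per time step.
encoder_inputs = [tf.placeholder(tf.int32, [None], name='enc%d' % i) for i in range(seq_len)]
decoder_inputs = [tf.placeholder(tf.int32, [None], name='dec%d' % i) for i in range(seq_len)]

outputs, states = embedding_attention_seq2seq(
    encoder_inputs, decoder_inputs, GRUCell(256),
    num_encoder_symbols=5000, num_decoder_symbols=5000,
    embedding_size=128, feed_previous=False)
# outputs: list of [batch, num_decoder_symbols] logits, one per decoder step.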
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected
from tensorflow.contrib.rnn import BasicRNNCell, OutputProjectionWrapper

n_steps = 20
n_inputs = 1
n_neurons = 100
n_outputs = 1

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.int32, [None])

basic_cell = BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu)
cell = OutputProjectionWrapper(basic_cell, output_size=n_outputs)
outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

logits = fully_connected(states, n_outputs, activation_fn=None)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)

learning_rate = 0.001
loss = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate)
train_op = optimizer.minimize(loss)

correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()
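A hedged sketch of running the graph above; X_batch and y_batch are assumed NumPy arrays shaped like the placeholders, not data from the source:

# Sketch: one training step plus an accuracy evaluation.
with tf.Session() as sess:
    sess.run(init)
    _, acc = sess.run([train_op, accuracy], feed_dict={X: X_batch, y: y_batch})
    print("batch accuracy:", acc)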
def initialize(self, inputs, input_lengths, num_speakers, speaker_id=None,
               mel_targets=None, linear_targets=None, is_training=False,
               loss_coeff=None, stop_token_targets=None):
    with tf.variable_scope('Eembedding') as scope:
        hp = self._hparams
        batch_size = tf.shape(inputs)[0]

        # Embeddings(256)
        char_embed_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_size], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))

        zero_pad = True
        if zero_pad:
            # Logic borrowed from the transformer implementation:
            # <PAD> (index 0) gets an embedding fixed at 0 that training never updates,
            # i.e. the first row (<PAD>) of the variable created by get_variable above
            # is never used.
            char_embed_table = tf.concat(
                (tf.zeros(shape=[1, hp.embedding_size]), char_embed_table[1:, :]), 0)

        # [N, T_in, embedding_size]
        char_embedded_inputs = tf.nn.embedding_lookup(char_embed_table, inputs)

        self.num_speakers = num_speakers
        if self.num_speakers > 1:
            speaker_embed_table = tf.get_variable(
                'speaker_embedding',
                [self.num_speakers, hp.speaker_embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            # [N, T_in, speaker_embedding_size]
            speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id)

            deep_dense = lambda x, dim, name: tf.layers.dense(
                x, dim, activation=tf.nn.softsign, name=name)  # softsign: x / (abs(x) + 1)
            encoder_rnn_init_state = deep_dense(
                speaker_embed, hp.encoder_lstm_units * 4,
                'encoder_init_dense')  # hp.encoder_lstm_units = 256
            decoder_rnn_init_states = [
                deep_dense(speaker_embed, hp.decoder_lstm_units * 2,
                           'decoder_init_dense_{}'.format(i))
                for i in range(hp.decoder_layers)
            ]  # hp.decoder_lstm_units = 1024
            speaker_embed = None
        else:
            # Case self.num_speakers == 1
            speaker_embed = None
            encoder_rnn_init_state = None  # init state of the bidirectional GRU
            attention_rnn_init_state = None
            decoder_rnn_init_states = None

    with tf.variable_scope('Encoder') as scope:
        ##############
        # Encoder
        ##############
        x = char_embedded_inputs
        for i in range(hp.enc_conv_num_layers):
            x = tf.layers.conv1d(x,
                                 filters=hp.enc_conv_channels,
                                 kernel_size=hp.enc_conv_kernel_size,
                                 padding='same',
                                 activation=tf.nn.relu,
                                 name='Encoder_{}'.format(i))
            x = tf.layers.batch_normalization(x, training=is_training)
            x = tf.layers.dropout(x, rate=hp.dropout_prob, training=is_training,
                                  name='dropout_{}'.format(i))

        if encoder_rnn_init_state is not None:
            initial_state_fw_c, initial_state_fw_h, initial_state_bw_c, initial_state_bw_h = \
                tf.split(encoder_rnn_init_state, 4, 1)
            initial_state_fw = LSTMStateTuple(initial_state_fw_c, initial_state_fw_h)
            initial_state_bw = LSTMStateTuple(initial_state_bw_c, initial_state_bw_h)
        else:
            # Single mode
            initial_state_fw, initial_state_bw = None, None

        cell_fw = ZoneoutLSTMCell(hp.encoder_lstm_units, is_training,
                                  zoneout_factor_cell=hp.tacotron_zoneout_rate,
                                  zoneout_factor_output=hp.tacotron_zoneout_rate,
                                  name='encoder_fw_LSTM')
        cell_bw = ZoneoutLSTMCell(hp.encoder_lstm_units, is_training,
                                  zoneout_factor_cell=hp.tacotron_zoneout_rate,
                                  zoneout_factor_output=hp.tacotron_zoneout_rate,
                                  name='encoder_bw_LSTM')  # was 'encoder_fw_LSTM' in the source: copy-paste bug
        encoder_conv_output = x
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, encoder_conv_output,
            sequence_length=input_lengths,
            initial_state_fw=initial_state_fw,
            initial_state_bw=initial_state_bw,
            dtype=tf.float32)

        # encoder_outputs = [N, T, 2*encoder_lstm_units] = [N, T, 512]
        encoder_outputs = tf.concat(outputs, axis=2)  # Concat and return forward + backward outputs

    with tf.variable_scope('Decoder') as scope:
        ##############
        # Attention
        ##############
        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=False)
        elif hp.attention_type == 'bah_mon_norm':  # added by hccho
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=True)
        elif hp.attention_type == 'loc_sen':  # Location Sensitive Attention
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_size, encoder_outputs,
                hparams=hp, is_training=is_training,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing,
                cumulate_weights=hp.cumulative_weights)
        elif hp.attention_type == 'gmm':  # GMM Attention
            attention_mechanism = GmmAttention(
                hp.attention_size, memory=encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        else:
            raise Exception(" [!] Unknown attention type: {}".format(hp.attention_type))

        decoder_lstm = [
            ZoneoutLSTMCell(hp.decoder_lstm_units, is_training,
                            zoneout_factor_cell=hp.tacotron_zoneout_rate,
                            zoneout_factor_output=hp.tacotron_zoneout_rate,
                            name='decoder_LSTM_{}'.format(i + 1))
            for i in range(hp.decoder_layers)
        ]
        decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm, state_is_tuple=True)

        # Calling zero_state here also includes the value already supplied to the
        # AttentionWrapper above.
        decoder_init_state = decoder_lstm.zero_state(batch_size=batch_size, dtype=tf.float32)

        if hp.model_type == "multi-speaker":
            decoder_init_state = list(decoder_init_state)
            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx][0].get_shape().as_list()
                shape2 = cell.get_shape().as_list()
                if shape1[1] * 2 != shape2[1]:
                    raise Exception(" [!] Shape {} and {} should be equal".format(shape1, shape2))
                c, h = tf.split(cell, 2, 1)
                decoder_init_state[idx] = LSTMStateTuple(c, h)
            decoder_init_state = tuple(decoder_init_state)

        # Note output_attention=False, and attention_layer_size is not given, so
        # attention = context vector.
        attention_cell = AttentionWrapper(
            decoder_lstm,
            attention_mechanism,
            initial_cell_state=decoder_init_state,
            alignment_history=True,
            output_attention=False)

        # attention_state_size = 256
        # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
        dec_prenet_outputs = DecoderWrapper(attention_cell, is_training,
                                            hp.dec_prenet_sizes, hp.dropout_prob,
                                            hp.inference_prenet_dropout)
        dec_outputs_cell = OutputProjectionWrapper(
            dec_prenet_outputs, (hp.num_mels + 1) * hp.reduction_factor)

        if is_training:
            helper = TacoTrainingHelper(
                mel_targets, hp.num_mels,
                hp.reduction_factor)  # inputs is only used to compute batch_size
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor)

        decoder_init_state = dec_outputs_cell.zero_state(batch_size=batch_size, dtype=tf.float32)
        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
            maximum_iterations=int(hp.max_n_frame / hp.reduction_factor))  # max_iters = 200

        decoder_mel_outputs = tf.reshape(
            decoder_outputs[:, :, :hp.num_mels * hp.reduction_factor],
            [batch_size, -1, hp.num_mels])  # [N, iters, 400] -> [N, 5*iters, 80]
        stop_token_outputs = tf.reshape(
            decoder_outputs[:, :, hp.num_mels * hp.reduction_factor:],
            [batch_size, -1])  # [N, iters]

        # Postnet
        x = decoder_mel_outputs
        for i in range(hp.postnet_num_layers):
            activation = tf.nn.tanh if i != (hp.postnet_num_layers - 1) else None
            x = tf.layers.conv1d(x,
                                 filters=hp.postnet_channels,
                                 kernel_size=hp.postnet_kernel_size,
                                 padding='same',
                                 activation=activation,
                                 name='Postnet_{}'.format(i))
            x = tf.layers.batch_normalization(x, training=is_training)
            x = tf.layers.dropout(x, rate=hp.dropout_prob, training=is_training,
                                  name='Postnet_dropout_{}'.format(i))
        residual = tf.layers.dense(x, hp.num_mels, name='residual_projection')
        mel_outputs = decoder_mel_outputs + residual

        # Add post-processing CBHG:
        # mel_outputs: (N, T, num_mels)
        post_outputs = cbhg(mel_outputs, None, is_training,
                            hp.post_bank_size, hp.post_bank_channel_size,
                            hp.post_maxpool_width, hp.post_highway_depth,
                            hp.post_rnn_size, hp.post_proj_sizes, hp.post_proj_width,
                            scope='post_cbhg')
        linear_outputs = tf.layers.dense(
            post_outputs, hp.num_freq,
            name='linear_spectogram_projection')  # [N, T_out, F(1025)]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(
            final_decoder_state.alignment_history.stack(),
            [1, 2, 0])  # batch_size, text length (encoder), target length (decoder)

        self.inputs = inputs
        self.speaker_id = speaker_id
        self.input_lengths = input_lengths
        self.loss_coeff = loss_coeff
        self.decoder_mel_outputs = decoder_mel_outputs
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.final_decoder_state = final_decoder_state
        self.stop_token_targets = stop_token_targets
        self.stop_token_outputs = stop_token_outputs
        self.all_vars = tf.trainable_variables()

        log('=' * 40)
        log(' model_type: %s' % hp.model_type)
        log('=' * 40)
        log('Initialized Tacotron model. Dimensions:')
        log(' embedding: %d' % char_embedded_inputs.shape[-1])
        log(' encoder conv out: %d' % encoder_conv_output.shape[-1])
        log(' encoder out: %d' % encoder_outputs.shape[-1])
        log(' attention out: %d' % attention_cell.output_size)
        log(' decoder prenet lstm concat out : %d' % dec_prenet_outputs.output_size)
        log(' decoder cell out: %d' % dec_outputs_cell.output_size)
        log(' decoder out (%d frames): %d' % (hp.reduction_factor, decoder_outputs.shape[-1]))
        log(' decoder mel out: %d' % decoder_mel_outputs.shape[-1])
        log(' mel out: %d' % mel_outputs.shape[-1])
        log(' postnet out: %d' % post_outputs.shape[-1])
        log(' linear out: %d' % linear_outputs.shape[-1])
        log(' Tacotron Parameters {:.3f} Million.'.format(
            np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, pml_targets=None, gta=False, locked_alignments=None, logs_enabled=True): '''Initializes the model for inference. Sets "pml_outputs", and "alignments" fields. Args: inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder features. Only needed for training. gta: boolean flag that is set to True when ground truth alignment is required locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this parameter and the attention alignments are locked to these values logs_enabled: boolean flag that defaults to True, if False no construction logs output ''' with tf.variable_scope('inference') as scope: is_training = pml_targets is not None batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings embedding_table = tf.get_variable( 'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_inputs = tf.nn.embedding_lookup( embedding_table, inputs) # [N, T_in, embed_depth=256] # Encoder prenet_outputs = prenet( embedded_inputs, is_training, hp.prenet_depths) # [N, T_in, prenet_depths[-1]=128] encoder_outputs = encoder_cbhg( prenet_outputs, input_lengths, is_training, # [N, T_in, encoder_depth=256] hp.encoder_depth) # Attention attention_cell = AttentionWrapper( GRUCell(hp.attention_depth), BahdanauAttention(hp.attention_depth, encoder_outputs), alignment_history=True, output_attention=False) # [N, T_in, attention_depth=256] # Apply prenet before concatenation in AttentionWrapper. attention_cell = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths) # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector. 
concat_cell = ConcatOutputAndAttentionWrapper( attention_cell) # [N, T_in, 2*attention_depth=512] # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell( [ OutputProjectionWrapper(concat_cell, hp.decoder_depth), ResidualWrapper(GRUCell(hp.decoder_depth)), ResidualWrapper(GRUCell(hp.decoder_depth)) ], state_is_tuple=True) # [N, T_in, decoder_depth=256] # Project onto r PML feature vectors (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper( decoder_cell, hp.pml_dimension * hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) if is_training or gta: helper = TacoTrainingHelper(inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step) else: helper = TacoTestHelper(batch_size, hp.pml_dimension, hp.outputs_per_step) (multi_decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, P*r] # Reshape outputs to be one output per entry decoder_outputs = tf.reshape( multi_decoder_outputs, [batch_size, -1, hp.pml_dimension]) # [N, T_out, P] # Postnet: predicts a residual postnet_outputs = postnet(decoder_outputs, layers=hp.postnet_conv_layers, conv_width=hp.postnet_conv_width, channels=hp.postnet_conv_channels, is_training=is_training) pml_outputs = decoder_outputs + postnet_outputs # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.pml_outputs = pml_outputs self.alignments = alignments self.pml_targets = pml_targets log('Initialized Tacotron model. Dimensions: ') log(' embedding: %d' % embedded_inputs.shape[-1]) log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, multi_decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % pml_outputs.shape[-1])
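The alignment grab above relies on alignment_history being a time-major TensorArray: stack() yields [decoder_steps, batch, encoder_steps], and the [1, 2, 0] transpose rearranges it to [batch, encoder_steps, decoder_steps] for plotting. A numpy sketch of the transpose alone (dimensions are illustrative):

import numpy as np

dec_steps, batch, enc_steps = 7, 2, 11
history = np.random.rand(dec_steps, batch, enc_steps)   # stack() is time-major
alignments = np.transpose(history, [1, 2, 0])           # [batch, enc, dec]
assert alignments.shape == (batch, enc_steps, dec_steps)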
def initialize(self, text_inputs, input_lengths, speaker_ids, mel_targets=None, linear_targets=None): '''Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: text_inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. speaker_ids: int32 Tensor containing ids of specific speakers mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. ''' with tf.variable_scope('inference'): is_training = linear_targets is not None batch_size = tf.shape(text_inputs)[0] hp = self._hparams vocab_size = len(symbols) embedded_inputs = embedding(text_inputs, vocab_size, hp.embedding_dim) # [N, T_in, embd_size] # extract speaker embedding if multi-speaker with tf.variable_scope('speaker'): if hp.num_speakers > 1: speaker_embedding = tf.get_variable('speaker_embed', shape=(hp.num_speakers, hp.speaker_embed_dim), dtype=tf.float32) # TODO: what about special initializer=tf.truncated_normal_initializer(stddev=0.5)? speaker_embd = tf.nn.embedding_lookup(speaker_embedding, speaker_ids) else: speaker_embd = None # Encoder prenet_outputs = prenet(inputs=embedded_inputs, drop_rate=hp.drop_rate if is_training else 0.0, is_training=is_training, layer_sizes=hp.encoder_prenet, scope="prenet") # [N, T_in, 128] encoder_outputs = cbhg(prenet_outputs, input_lengths, speaker_embd=speaker_embd, is_training=is_training, K=hp.encoder_cbhg_banks, c=hp.encoder_cbhg_bank_sizes, # [N, T_in, 256] scope='encoder_cbhg') # Attention Mechanism attention_cell = attention_decoder(encoder_outputs, hp.attention_dim, input_lengths, is_training, speaker_embd=speaker_embd, attention_type=hp.attention_type) # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell([ OutputProjectionWrapper(attention_cell, hp.decoder_dim), # 256 ResidualWrapper(GRUCell(hp.decoder_dim)), # 256 ResidualWrapper(GRUCell(hp.decoder_dim)) # 256 ], state_is_tuple=True) # [N, T_in, 256] # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) if is_training: helper = TacoTrainingHelper(text_inputs, mel_targets, hp.num_mels, hp.outputs_per_step) else: helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M] # Add post-processing post_outputs = cbhg(mel_outputs, None, speaker_embd=None, is_training=is_training, K=hp.post_cbhg_banks, c=hp.post_cbhg_bank_sizes + [hp.num_mels], scope='post_cbhg') # [N, T_out, 256] linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final 
decoder state: alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = text_inputs self.input_lengths = input_lengths self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.audio = audio.inv_spectrogram_tensorflow(linear_outputs) self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets log('Initialized Tacotron model. Dimensions: ') log(' embedding: %d' % embedded_inputs.shape[-1]) log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) # TODO: later work around for getting info back? # log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % attention_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
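The multi-speaker branch above is an ordinary table lookup: each speaker id selects one row of a [num_speakers, speaker_embed_dim] matrix, exactly as the character embedding does for token ids. A hedged numpy sketch with illustrative sizes:

import numpy as np

num_speakers, speaker_embed_dim = 4, 16
table = np.random.randn(num_speakers, speaker_embed_dim)
speaker_ids = np.array([0, 3, 1])       # one id per batch element
speaker_embd = table[speaker_ids]       # [batch, speaker_embed_dim]
assert speaker_embd.shape == (3, speaker_embed_dim)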
def initialize(self, inputs, input_lengths, num_speakers, speaker_id,
               mel_targets=None, linear_targets=None, loss_coeff=None,
               rnn_decoder_test_mode=False, is_randomly_initialized=False):
    '''Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields.
    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries in
        the mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out
        is number of steps in the output time series, F is num_freq, and values are entries
        in the linear spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        self.batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 512]

        # Encoder
        encoder_outputs = conv_and_lstm(
            embedded_inputs, input_lengths,
            conv_layers=hp.encoder_conv_layers,
            conv_width=hp.encoder_conv_width,
            conv_channels=hp.encoder_conv_channels,
            lstm_units=hp.encoder_lstm_units,
            is_training=is_training,
            scope='encoder')  # [N, T_in, 512]

        # Attention
        # For manual control of attention
        self.is_manual_attention = tf.placeholder(tf.bool, shape=(), name='is_manual_attention')
        self.manual_alignments = tf.placeholder(tf.float32, shape=[None, None, None],
                                                name="manual_alignments")
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(LSTMBlockCell(hp.attention_depth), is_training),
            LocationSensitiveAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            output_attention=False)  # [N, T_in, 128]
        # Concatenate attention context vector and RNN cell output into a 512D vector.
concat_cell = ConcatOutputAndAttentionWrapper(attention_cell) # [N, T_in, 512] # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell([ concat_cell, LSTMBlockCell(hp.decoder_lstm_units), LSTMBlockCell(hp.decoder_lstm_units) ], state_is_tuple=True) # [N, T_in, 1024] # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step) if is_training: helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step) else: helper = TacoTestHelper(self.batch_size, hp.num_mels, hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=self.batch_size, dtype=tf.float32) (multi_decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry [N, T_out, M] decoder_outputs = tf.reshape(multi_decoder_outputs, [self.batch_size, -1, hp.num_mels]) # Postnet: predicts a residual postnet_outputs = postnet( decoder_outputs, layers=hp.postnet_conv_layers, conv_width=hp.postnet_conv_width, channels=hp.postnet_conv_channels, is_training=is_training) mel_outputs = decoder_outputs + postnet_outputs # Convert to linear using a similar architecture as the encoder: expand_outputs = conv_and_lstm( mel_outputs, None, conv_layers=hp.expand_conv_layers, conv_width=hp.expand_conv_width, conv_channels=hp.expand_conv_channels, lstm_units=hp.expand_lstm_units, is_training=is_training, scope='expand') # [N, T_in, 512] linear_outputs = tf.layers.dense(expand_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.decoder_outputs = decoder_outputs self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets log('Initialized Tacotron model. Dimensions: ') log(' embedding: %d' % embedded_inputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' expand out: %d' % expand_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
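The postnet(...) helper above is defined elsewhere in the repo, but the Postnet block written out verbatim earlier in this section suggests the pattern: a conv1d stack with tanh on all but the last layer, batch norm and dropout after each layer, then a dense projection back to num_mels used as an additive residual. A condensed sketch under that assumption, with hyperparameter names chosen to mirror the call site (TF 1.x, matching the tf.layers API used throughout this document):

import tensorflow as tf  # TF 1.x

def postnet_sketch(x, layers, conv_width, channels, num_mels, is_training, drop_rate=0.5):
    # Conv1D stack: tanh everywhere except the final layer
    for i in range(layers):
        activation = tf.nn.tanh if i != layers - 1 else None
        x = tf.layers.conv1d(x, filters=channels, kernel_size=conv_width,
                             padding='same', activation=activation,
                             name='postnet_conv_%d' % i)
        x = tf.layers.batch_normalization(x, training=is_training)
        x = tf.layers.dropout(x, rate=drop_rate, training=is_training)
    # Project back to num_mels; the caller adds this as a residual.
    return tf.layers.dense(x, num_mels, name='postnet_residual_projection')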
def __init__(self, sess, config, api, log_dir, forward, scope=None): self.vocab = api.vocab self.rev_vocab = api.rev_vocab self.vocab_size = len(self.vocab) self.topic_vocab = api.topic_vocab self.topic_vocab_size = len(self.topic_vocab) self.sess = sess self.scope = scope self.max_utt_len = config.max_utt_len self.go_id = self.rev_vocab["<s>"] self.eos_id = self.rev_vocab["</s>"] self.context_cell_size = config.cxt_cell_size self.sent_cell_size = config.sent_cell_size self.dec_cell_size = config.dec_cell_size self.bow_weights = config.bow_weights with tf.name_scope("io"): # all dialog context and known attributes self.input_contexts = tf.placeholder(dtype=tf.int32, shape=(None, None, self.max_utt_len), name="context") self.context_lens = tf.placeholder(dtype=tf.int32, shape=(None, ), name="context_lens") # target response given the dialog context self.output_tokens = tf.placeholder(dtype=tf.int32, shape=(None, None), name="output_token") self.output_lens = tf.placeholder(dtype=tf.int32, shape=(None, ), name="output_lens") self.output_topics = tf.placeholder(dtype=tf.int32, shape=(None, ), name="output_topic") # optimization related variables self.learning_rate = tf.Variable(float(config.init_lr), trainable=False, name="learning_rate") self.learning_rate_decay_op = self.learning_rate.assign( tf.multiply(self.learning_rate, config.lr_decay)) self.global_t = tf.placeholder(dtype=tf.int32, name="global_t") self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior") max_context_len = array_ops.shape(self.input_contexts)[1] max_out_len = array_ops.shape(self.output_tokens)[1] batch_size = array_ops.shape(self.input_contexts)[0] if config.use_hcf: with variable_scope.variable_scope("topicEmbedding"): t_embedding = tf.get_variable( "embedding", [self.topic_vocab_size, config.topic_embed_size], dtype=tf.float32) topic_embedding = embedding_ops.embedding_lookup( t_embedding, self.output_topics) with variable_scope.variable_scope("wordEmbedding"): self.embedding = tf.get_variable( "embedding", [self.vocab_size, config.embed_size], dtype=tf.float32) embedding_mask = tf.constant( [0 if i == 0 else 1 for i in range(self.vocab_size)], dtype=tf.float32, shape=[self.vocab_size, 1]) embedding = self.embedding * embedding_mask input_embedding = embedding_ops.embedding_lookup( embedding, tf.reshape(self.input_contexts, [-1])) input_embedding = tf.reshape( input_embedding, [-1, self.max_utt_len, config.embed_size]) output_embedding = embedding_ops.embedding_lookup( embedding, self.output_tokens) # context nn if config.sent_type == "bow": input_embedding, sent_size = get_bow(input_embedding) output_embedding, _ = get_bow(output_embedding) elif config.sent_type == "rnn": sent_cell = self.get_rnncell("gru", self.sent_cell_size, config.keep_prob, 1) input_embedding, sent_size = get_rnn_encode(input_embedding, sent_cell, scope="sent_rnn") output_embedding, _ = get_rnn_encode(output_embedding, sent_cell, self.output_lens, scope="sent_rnn", reuse=True) elif config.sent_type == "bi_rnn": fwd_sent_cell = self.get_rnncell("gru", self.sent_cell_size, keep_prob=1.0, num_layer=1) bwd_sent_cell = self.get_rnncell("gru", self.sent_cell_size, keep_prob=1.0, num_layer=1) input_embedding, sent_size = get_bi_rnn_encode( input_embedding, fwd_sent_cell, bwd_sent_cell, scope="sent_bi_rnn") output_embedding, _ = get_bi_rnn_encode(output_embedding, fwd_sent_cell, bwd_sent_cell, self.output_lens, scope="sent_bi_rnn", reuse=True) else: raise ValueError( "Unknown sent_type. 
Must be one of [bow, rnn, bi_rnn]") # reshape input into dialogs input_embedding = tf.reshape(input_embedding, [-1, max_context_len, sent_size]) if config.keep_prob < 1.0: input_embedding = tf.nn.dropout(input_embedding, config.keep_prob) with variable_scope.variable_scope("contextRNN"): enc_cell = self.get_rnncell(config.cell_type, self.context_cell_size, keep_prob=1.0, num_layer=config.num_layer) # and enc_last_state will be same as the true last state _, enc_last_state = tf.nn.dynamic_rnn( enc_cell, input_embedding, dtype=tf.float32, sequence_length=self.context_lens) if config.num_layer > 1: enc_last_state = tf.concat(enc_last_state, 1) # combine with other attributes if config.use_hcf: attribute_embedding = topic_embedding attribute_fc1 = layers.fully_connected(attribute_embedding, 30, activation_fn=tf.tanh, scope="attribute_fc1") cond_embedding = enc_last_state with variable_scope.variable_scope("recognitionNetwork"): if config.use_hcf: recog_input = tf.concat( [cond_embedding, output_embedding, attribute_fc1], 1) else: recog_input = tf.concat([cond_embedding, output_embedding], 1) self.recog_mulogvar = recog_mulogvar = layers.fully_connected( recog_input, config.latent_size * 2, activation_fn=None, scope="muvar") recog_mu, recog_logvar = tf.split(recog_mulogvar, 2, axis=1) with variable_scope.variable_scope("priorNetwork"): prior_fc1 = layers.fully_connected(cond_embedding, np.maximum( config.latent_size * 2, 100), activation_fn=tf.tanh, scope="fc1") prior_mulogvar = layers.fully_connected(prior_fc1, config.latent_size * 2, activation_fn=None, scope="muvar") prior_mu, prior_logvar = tf.split(prior_mulogvar, 2, axis=1) # use sampled Z or posterior Z latent_sample = tf.cond( self.use_prior, lambda: sample_gaussian(prior_mu, prior_logvar), lambda: sample_gaussian(recog_mu, recog_logvar)) with variable_scope.variable_scope("generationNetwork"): gen_inputs = tf.concat([cond_embedding, latent_sample], 1) # BOW loss bow_fc1 = layers.fully_connected(gen_inputs, 400, activation_fn=tf.tanh, scope="bow_fc1") if config.keep_prob < 1.0: bow_fc1 = tf.nn.dropout(bow_fc1, config.keep_prob) self.bow_logits = layers.fully_connected(bow_fc1, self.vocab_size, activation_fn=None, scope="bow_project") # Y loss if config.use_hcf: meta_fc1 = layers.fully_connected(latent_sample, 400, activation_fn=tf.tanh, scope="meta_fc1") if config.keep_prob < 1.0: meta_fc1 = tf.nn.dropout(meta_fc1, config.keep_prob) self.topic_logits = layers.fully_connected( meta_fc1, self.topic_vocab_size, scope="topic_project") topic_prob = tf.nn.softmax(self.topic_logits) #pred_attribute_embedding = tf.matmul(topic_prob, t_embedding) pred_topic = tf.argmax(topic_prob, 1) pred_attribute_embedding = embedding_ops.embedding_lookup( t_embedding, pred_topic) if forward: selected_attribute_embedding = pred_attribute_embedding else: selected_attribute_embedding = attribute_embedding dec_inputs = tf.concat( [gen_inputs, selected_attribute_embedding], 1) else: self.topic_logits = tf.zeros( (batch_size, self.topic_vocab_size)) selected_attribute_embedding = None dec_inputs = gen_inputs # Decoder if config.num_layer > 1: dec_init_state = [ layers.fully_connected(dec_inputs, self.dec_cell_size, activation_fn=None, scope="init_state-%d" % i) for i in range(config.num_layer) ] dec_init_state = tuple(dec_init_state) else: dec_init_state = layers.fully_connected(dec_inputs, self.dec_cell_size, activation_fn=None, scope="init_state") with variable_scope.variable_scope("decoder"): dec_cell = self.get_rnncell(config.cell_type, self.dec_cell_size, 
config.keep_prob, config.num_layer)
            dec_cell = OutputProjectionWrapper(dec_cell, self.vocab_size)
            if forward:
                loop_func = decoder_fn_lib.context_decoder_fn_inference(
                    None, dec_init_state, embedding,
                    start_of_sequence_id=self.go_id,
                    end_of_sequence_id=self.eos_id,
                    maximum_length=self.max_utt_len,
                    num_decoder_symbols=self.vocab_size,
                    context_vector=selected_attribute_embedding)
                dec_input_embedding = None
                dec_seq_lens = None
            else:
                loop_func = decoder_fn_lib.context_decoder_fn_train(
                    dec_init_state, selected_attribute_embedding)
                dec_input_embedding = embedding_ops.embedding_lookup(embedding, self.output_tokens)
                dec_input_embedding = dec_input_embedding[:, 0:-1, :]
                dec_seq_lens = self.output_lens - 1
                if config.keep_prob < 1.0:
                    dec_input_embedding = tf.nn.dropout(dec_input_embedding, config.keep_prob)
                # apply word dropping. Set dropped word to 0
                if config.dec_keep_prob < 1.0:
                    keep_mask = tf.less_equal(
                        tf.random_uniform((batch_size, max_out_len - 1), minval=0.0, maxval=1.0),
                        config.dec_keep_prob)
                    keep_mask = tf.expand_dims(tf.to_float(keep_mask), 2)
                    dec_input_embedding = dec_input_embedding * keep_mask
                    dec_input_embedding = tf.reshape(dec_input_embedding,
                                                     [-1, max_out_len - 1, config.embed_size])

            dec_outs, _, final_context_state = dynamic_rnn_decoder(
                dec_cell, loop_func, inputs=dec_input_embedding, sequence_length=dec_seq_lens)
            if final_context_state is not None:
                final_context_state = final_context_state[:, 0:array_ops.shape(dec_outs)[1]]
                mask = tf.to_int32(tf.sign(tf.reduce_max(dec_outs, axis=2)))
                self.dec_out_words = tf.multiply(tf.reverse(final_context_state, axis=[1]), mask)
            else:
                self.dec_out_words = tf.argmax(dec_outs, 2)

        if not forward:
            with variable_scope.variable_scope("loss"):
                labels = self.output_tokens[:, 1:]
                label_mask = tf.to_float(tf.sign(labels))
                rc_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=dec_outs, labels=labels)
                rc_loss = tf.reduce_sum(rc_loss * label_mask, reduction_indices=1)
                self.avg_rc_loss = tf.reduce_mean(rc_loss)
                # used only for perplexity calculation. Not used for optimization
                self.rc_ppl = tf.exp(tf.reduce_sum(rc_loss) / tf.reduce_sum(label_mask))
                """ as n-trial multimodal distribution.
""" tile_bow_logits = tf.tile(tf.expand_dims(self.bow_logits, 1), [1, max_out_len - 1, 1]) bow_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=tile_bow_logits, labels=labels) * label_mask bow_loss = tf.reduce_sum(bow_loss, reduction_indices=1) self.avg_bow_loss = tf.reduce_mean(bow_loss) bow_weights = tf.to_float(self.bow_weights) # reconstruct the meta info about X if config.use_hcf: topic_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=self.topic_logits, labels=self.output_topics) self.avg_topic_loss = tf.reduce_mean(topic_loss) else: self.avg_topic_loss = 0.0 kld = gaussian_kld(recog_mu, recog_logvar, prior_mu, prior_logvar) self.avg_kld = tf.reduce_mean(kld) if log_dir is not None: kl_weights = tf.minimum( tf.to_float(self.global_t) / config.full_kl_step, 1.0) else: kl_weights = tf.constant(1.0) self.kl_w = kl_weights self.elbo = self.avg_rc_loss + kl_weights * self.avg_kld aug_elbo = bow_weights * self.avg_bow_loss + self.avg_topic_loss + self.elbo tf.summary.scalar("topic_loss", self.avg_topic_loss) tf.summary.scalar("rc_loss", self.avg_rc_loss) tf.summary.scalar("elbo", self.elbo) tf.summary.scalar("kld", self.avg_kld) tf.summary.scalar("bow_loss", self.avg_bow_loss) self.summary_op = tf.summary.merge_all() self.log_p_z = norm_log_liklihood(latent_sample, prior_mu, prior_logvar) self.log_q_z_xy = norm_log_liklihood(latent_sample, recog_mu, recog_logvar) self.est_marginal = tf.reduce_mean(rc_loss + bow_loss - self.log_p_z + self.log_q_z_xy) self.optimize(sess, config, aug_elbo, log_dir) self.saver = tf.train.Saver(tf.global_variables(), write_version=tf.train.SaverDef.V2)
def __init__(self, sess, config, api, log_dir, forward, scope=None): self.vocab = api.vocab self.rev_vocab = api.rev_vocab self.vocab_size = len(self.vocab) self.sess = sess self.scope = scope self.max_utt_len = config.max_utt_len self.go_id = self.rev_vocab["<s>"] self.eos_id = self.rev_vocab["</s>"] self.context_cell_size = config.cxt_cell_size self.sent_cell_size = config.sent_cell_size self.dec_cell_size = config.dec_cell_size self.num_topics = config.num_topics with tf.name_scope("io"): # all dialog context and known attributes self.input_contexts = tf.placeholder(dtype=tf.int32, shape=(None, None, self.max_utt_len), name="dialog_context") self.floors = tf.placeholder(dtype=tf.float32, shape=(None, None), name="floor") # TODO float self.floor_labels = tf.placeholder(dtype=tf.float32, shape=(None, 1), name="floor_labels") self.context_lens = tf.placeholder(dtype=tf.int32, shape=(None, ), name="context_lens") self.paragraph_topics = tf.placeholder(dtype=tf.float32, shape=(None, self.num_topics), name="paragraph_topics") # target response given the dialog context self.output_tokens = tf.placeholder(dtype=tf.int32, shape=(None, None), name="output_token") self.output_lens = tf.placeholder(dtype=tf.int32, shape=(None, ), name="output_lens") self.output_das = tf.placeholder(dtype=tf.float32, shape=(None, self.num_topics), name="output_dialog_acts") # optimization related variables self.learning_rate = tf.Variable(float(config.init_lr), trainable=False, name="learning_rate") self.learning_rate_decay_op = self.learning_rate.assign( tf.multiply(self.learning_rate, config.lr_decay)) self.global_t = tf.placeholder(dtype=tf.int32, name="global_t") max_dialog_len = array_ops.shape(self.input_contexts)[1] max_out_len = array_ops.shape(self.output_tokens)[1] batch_size = array_ops.shape(self.input_contexts)[0] with variable_scope.variable_scope("wordEmbedding"): self.embedding = tf.get_variable( "embedding", [self.vocab_size, config.embed_size], dtype=tf.float32) embedding_mask = tf.constant( [0 if i == 0 else 1 for i in range(self.vocab_size)], dtype=tf.float32, shape=[self.vocab_size, 1]) embedding = self.embedding * embedding_mask # embed the input input_embedding = embedding_ops.embedding_lookup( embedding, tf.reshape(self.input_contexts, [-1])) input_embedding = tf.reshape( input_embedding, [-1, self.max_utt_len, config.embed_size]) # encode input using RNN w/GRU sent_cell = self.get_rnncell("gru", self.sent_cell_size, config.keep_prob, 1) input_embedding, sent_size = get_rnn_encode(input_embedding, sent_cell, scope="sent_rnn") # reshape input input_embedding = tf.reshape(input_embedding, [-1, max_dialog_len, sent_size]) if config.keep_prob < 1.0: input_embedding = tf.nn.dropout(input_embedding, config.keep_prob) # floor = probability that the next sentence is the last # TODO do we want this? 
floor = tf.reshape(self.floors, [-1, max_dialog_len, 1])
            joint_embedding = tf.concat([input_embedding, floor], 2, "joint_embedding")

        with variable_scope.variable_scope("contextRNN"):
            enc_cell = self.get_rnncell(config.cell_type, self.context_cell_size,
                                        keep_prob=1.0, num_layer=config.num_layer)
            # and enc_last_state will be same as the true last state
            _, enc_last_state = tf.nn.dynamic_rnn(
                enc_cell, joint_embedding, dtype=tf.float32,
                sequence_length=self.context_lens)
            if config.num_layer > 1:
                if config.cell_type == 'lstm':
                    enc_last_state = [temp.h for temp in enc_last_state]
                enc_last_state = tf.concat(enc_last_state, 1)
            else:
                if config.cell_type == 'lstm':
                    enc_last_state = enc_last_state.h

        # Final output from the encoder
        encoded_list = [self.paragraph_topics, enc_last_state]
        encoded_embedding = tf.concat(encoded_list, 1)

        with variable_scope.variable_scope("generationNetwork"):
            # predict whether the next sentence is the last one
            # TODO do we want this?
            self.paragraph_end_logits = layers.fully_connected(
                encoded_embedding, 1, activation_fn=tf.tanh, scope="paragraph_end_fc1")

            # Decoder
            if config.num_layer > 1:
                dec_init_state = []
                for i in range(config.num_layer):
                    temp_init = layers.fully_connected(encoded_embedding, self.dec_cell_size,
                                                       activation_fn=None,
                                                       scope="init_state-%d" % i)
                    if config.cell_type == 'lstm':
                        # initializer thing for lstm
                        temp_init = rnn_cell.LSTMStateTuple(temp_init, temp_init)
                    dec_init_state.append(temp_init)
                dec_init_state = tuple(dec_init_state)
            else:
                dec_init_state = layers.fully_connected(encoded_embedding, self.dec_cell_size,
                                                        activation_fn=None, scope="init_state")
                if config.cell_type == 'lstm':
                    dec_init_state = rnn_cell.LSTMStateTuple(dec_init_state, dec_init_state)

        with variable_scope.variable_scope("decoder"):
            dec_cell = self.get_rnncell(config.cell_type, self.dec_cell_size,
                                        config.keep_prob, config.num_layer)
            # projects into thing of vocab size. TODO no softmax?
            dec_cell = OutputProjectionWrapper(dec_cell, self.vocab_size)
            if forward:
                loop_func = decoder_fn_lib.context_decoder_fn_inference(
                    None, dec_init_state, embedding,
                    start_of_sequence_id=self.go_id,
                    end_of_sequence_id=self.eos_id,
                    maximum_length=self.max_utt_len,
                    num_decoder_symbols=self.vocab_size,
                    context_vector=None)
                dec_input_embedding = None
                dec_seq_lens = None
            else:
                loop_func = decoder_fn_lib.context_decoder_fn_train(dec_init_state, None)
                dec_input_embedding = embedding_ops.embedding_lookup(embedding, self.output_tokens)
                dec_input_embedding = dec_input_embedding[:, 0:-1, :]
                dec_seq_lens = self.output_lens - 1
                if config.keep_prob < 1.0:
                    dec_input_embedding = tf.nn.dropout(dec_input_embedding, config.keep_prob)
                # apply word dropping. Set dropped word to 0
                if config.dec_keep_prob < 1.0:
                    # get mask of keep/throw-away
                    keep_mask = tf.less_equal(
                        tf.random_uniform((batch_size, max_out_len - 1), minval=0.0, maxval=1.0),
                        config.dec_keep_prob)
                    keep_mask = tf.expand_dims(tf.to_float(keep_mask), 2)
                    dec_input_embedding = dec_input_embedding * keep_mask
                    dec_input_embedding = tf.reshape(dec_input_embedding,
                                                     [-1, max_out_len - 1, config.embed_size])

            dec_outs, _, final_context_state = dynamic_rnn_decoder(
                dec_cell, loop_func, inputs=dec_input_embedding,
                sequence_length=dec_seq_lens, name='output_node')
            if final_context_state is not None:
                final_context_state = final_context_state[:, 0:array_ops.shape(dec_outs)[1]]
                mask = tf.to_int32(tf.sign(tf.reduce_max(dec_outs, axis=2)))
                self.dec_out_words = tf.multiply(tf.reverse(final_context_state, axis=[1]), mask)
            else:
                self.dec_out_words = tf.argmax(dec_outs, 2)

        if not forward:
            with variable_scope.variable_scope("loss"):
                labels = self.output_tokens[:, 1:]  # correct word tokens
                label_mask = tf.to_float(tf.sign(labels))
                print("dec outs shape", dec_outs.get_shape())
                print("labels shape", labels.get_shape())
                # Loss between words
                rc_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=dec_outs, labels=labels)
                rc_loss = tf.reduce_sum(rc_loss * label_mask, reduction_indices=1)
                self.avg_rc_loss = tf.reduce_mean(rc_loss)
                # used only for perplexity calculation. Not used for optimization
                self.rc_ppl = tf.exp(tf.reduce_sum(rc_loss) / tf.reduce_sum(label_mask))
                # Predict 0/1 (1 = last sentence in paragraph)
                end_loss = tf.nn.softmax_cross_entropy_with_logits(
                    labels=self.floor_labels, logits=self.paragraph_end_logits)
                self.avg_end_loss = tf.reduce_mean(end_loss)
                print("size of end loss", self.avg_end_loss.get_shape())

                total_loss = self.avg_rc_loss + self.avg_end_loss

                tf.summary.scalar("rc_loss", self.avg_rc_loss)
                tf.summary.scalar("paragraph_end_loss", self.avg_end_loss)
                self.summary_op = tf.summary.merge_all()

                self.optimize(sess, config, total_loss, log_dir)

        self.saver = tf.train.Saver(tf.global_variables(), write_version=tf.train.SaverDef.V2)
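The dec_keep_prob block above is word dropout: entire input tokens are zeroed with probability 1 - dec_keep_prob, so the decoder cannot lean purely on teacher forcing. A numpy sketch of the same masking, with assumed sizes:

import numpy as np

batch, steps, embed_size = 2, 6, 8
dec_keep_prob = 0.7
dec_input_embedding = np.random.randn(batch, steps, embed_size)
keep_mask = (np.random.uniform(size=(batch, steps)) <= dec_keep_prob).astype(float)
dec_input_embedding *= keep_mask[:, :, None]   # dropped words become all-zero vectors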
def initialize(self, inputs, input_lengths, num_speakers, speaker_id, mel_targets=None, linear_targets=None, loss_coeff=None, rnn_decoder_test_mode=False, is_randomly_initialized=False): is_training = linear_targets is not None self.is_randomly_initialized = is_randomly_initialized with tf.variable_scope('inference') as scope: batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings embedding_table = tf.get_variable( 'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_inputs = tf.nn.embedding_lookup( embedding_table, inputs) # [N, T_in, embed_depth=256] self.num_speakers = num_speakers if self.num_speakers > 1: if hp.speaker_embedding_size != 1: speaker_embed_table = tf.get_variable( 'speaker_embedding', [self.num_speakers, hp.speaker_embedding_size], dtype=tf.float32, initializer=tf.truncated_normal_initializer( stddev=0.5)) # [N, T_in, speaker_embedding_size] speaker_embed = tf.nn.embedding_lookup( speaker_embed_table, speaker_id) if hp.model_type == 'deepvoice': if hp.speaker_embedding_size == 1: before_highway = get_embed(speaker_id, self.num_speakers, hp.enc_prenet_sizes[-1], "before_highway") encoder_rnn_init_state = get_embed( speaker_id, self.num_speakers, hp.enc_rnn_size * 2, "encoder_rnn_init_state") attention_rnn_init_state = get_embed( speaker_id, self.num_speakers, hp.attention_state_size, "attention_rnn_init_state") decoder_rnn_init_states = [get_embed( speaker_id, self.num_speakers, hp.dec_rnn_size, "decoder_rnn_init_states{}".format(idx + 1)) \ for idx in range(hp.dec_layer_num)] else: deep_dense = lambda x, dim: \ tf.layers.dense(x, dim, activation=tf.nn.softsign) before_highway = deep_dense(speaker_embed, hp.enc_prenet_sizes[-1]) encoder_rnn_init_state = deep_dense( speaker_embed, hp.enc_rnn_size * 2) attention_rnn_init_state = deep_dense( speaker_embed, hp.attention_state_size) decoder_rnn_init_states = [ deep_dense(speaker_embed, hp.dec_rnn_size) for _ in range(hp.dec_layer_num) ] speaker_embed = None # deepvoice does not use speaker_embed directly elif hp.model_type == 'simple': before_highway = None encoder_rnn_init_state = None attention_rnn_init_state = None decoder_rnn_init_states = None else: raise Exception( " [!] 
Unknown multi-speaker model type: {}".format(hp.model_type))
            else:
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

            # Encoder
            prenet_outputs = prenet(embedded_inputs, is_training,
                                    hp.enc_prenet_sizes, hp.dropout_prob,
                                    scope='prenet')  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = cbhg(prenet_outputs, input_lengths, is_training,  # [N, T_in, encoder_depth=256]
                                   hp.enc_bank_size, hp.enc_bank_channel_size,
                                   hp.enc_maxpool_width, hp.enc_highway_depth,
                                   hp.enc_rnn_size, hp.enc_proj_sizes, hp.enc_proj_width,
                                   scope="encoder_cbhg",
                                   before_highway=before_highway,
                                   encoder_rnn_init_state=encoder_rnn_init_state)

            # Attention
            # For manual control of attention
            self.is_manual_attention = tf.placeholder(tf.bool, shape=(), name='is_manual_attention')
            self.manual_alignments = tf.placeholder(tf.float32, shape=[None, None, None],
                                                    name="manual_alignments")

            dec_prenet_outputs = DecoderPrenetWrapper(
                GRUCell(hp.attention_state_size), speaker_embed,
                is_training, hp.dec_prenet_sizes, hp.dropout_prob)

            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(hp.attention_size, encoder_outputs)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(hp.attention_size, encoder_outputs, normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(hp.attention_size, encoder_outputs, scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(hp.attention_size, encoder_outputs)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(hp.attention_size, encoder_outputs)
            elif hp.attention_type.startswith('ntm2'):
                shift_width = int(hp.attention_type.split('-')[-1])
                attention_mechanism = NTMAttention2(hp.attention_size, encoder_outputs,
                                                    shift_width=shift_width)
            else:
                raise Exception(" [!] Unknown attention type: {}".format(hp.attention_type))

            attention_cell = AttentionWrapper(
                dec_prenet_outputs, attention_mechanism,
                self.is_manual_attention, self.manual_alignments,
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True, output_attention=False)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            # [N, T_in, attention_size+attention_state_size]
            concat_cell = ConcatOutputAndAttentionWrapper(attention_cell, embed_to_concat=speaker_embed)

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, hp.dec_rnn_size),
                ResidualWrapper(GRUCell(hp.dec_rnn_size)),
                ResidualWrapper(GRUCell(hp.dec_rnn_size)),
            ], state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.reduction_factor)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                #   = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied)
                decoder_init_state = list(decoder_init_state)
                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(" [!] Shape {} and {} should be equal".
\ format(shape1, shape2)) decoder_init_state[idx + 1] = cell decoder_init_state = tuple(decoder_init_state) if is_training: helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.reduction_factor, rnn_decoder_test_mode) else: helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry mel_outputs = tf.reshape( decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M] # Add post-processing CBHG: # [N, T_out, postnet_depth=256] post_outputs = cbhg(mel_outputs, None, is_training, hp.post_bank_size, hp.post_bank_channel_size, hp.post_maxpool_width, hp.post_highway_depth, hp.post_rnn_size, hp.post_proj_sizes, hp.post_proj_width, scope='post_cbhg') if speaker_embed is not None and hp.model_type == 'simple': expanded_speaker_emb = tf.expand_dims(speaker_embed, [1]) tiled_speaker_embedding = tf.tile( expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1]) # [N, T_out, 256 + alpha] post_outputs = tf.concat( [tiled_speaker_embedding, post_outputs], axis=-1) linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.speaker_id = speaker_id self.input_lengths = input_lengths self.loss_coeff = loss_coeff self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets self.final_decoder_state = final_decoder_state log('=' * 40) log(' model_type: %s' % hp.model_type) log('=' * 40) log('Initialized Tacotron model. Dimensions: ') log(' embedding: %d' % embedded_inputs.shape[-1]) if speaker_embed is not None: log(' speaker embedding: %d' % speaker_embed.shape[-1]) else: log(' speaker embedding: None') log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
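In the 'simple' multi-speaker path above, the speaker vector is broadcast across time and concatenated onto every post-CBHG frame before the linear projection. A numpy sketch of that tiling, with illustrative shapes:

import numpy as np

N, T, depth, embed = 2, 40, 256, 16
post_outputs = np.random.randn(N, T, depth)
speaker_embed = np.random.randn(N, embed)
tiled = np.repeat(speaker_embed[:, None, :], T, axis=1)          # [N, T, embed]
post_outputs = np.concatenate([tiled, post_outputs], axis=-1)    # [N, T, embed + depth]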
def embedding_tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, num_symbols, output_projection=None, feed_previous=False, dtype=dtypes.float32, scope=None): """Embedding RNN sequence-to-sequence model with tied (shared) parameters. This model first embeds encoder_inputs by a newly created embedding (of shape [num_symbols x cell.input_size]). Then it runs an RNN to encode embedded encoder_inputs into a state vector. Next, it embeds decoder_inputs using the same embedding. Then it runs RNN decoder, initialized with the last encoder state, on embedded decoder_inputs. Args: encoder_inputs: a list of 1D int32 Tensors of shape [batch_size]. decoder_inputs: a list of 1D int32 Tensors of shape [batch_size]. cell: RNNCell defining the cell function and size. num_symbols: integer; number of symbols for both encoder and decoder. output_projection: None or a pair (W, B) of output projection weights and biases; W has shape [cell.output_size x num_symbols] and B has shape [num_symbols]; if provided and feed_previous=True, each fed previous output will first be multiplied by W and added B. feed_previous: Boolean or scalar Boolean Tensor; if True, only the first of decoder_inputs will be used (the "GO" symbol), and all other decoder inputs will be taken from previous outputs (as in embedding_rnn_decoder). If False, decoder_inputs are used as given (the standard decoder case). dtype: The dtype to use for the initial RNN states (default: tf.float32). scope: VariableScope for the created subgraph; defaults to "embedding_tied_rnn_seq2seq". Returns: outputs: A list of the same length as decoder_inputs of 2D Tensors with shape [batch_size x num_decoder_symbols] containing the generated outputs. states: The state of each decoder cell in each time-step. This is a list with length len(decoder_inputs) -- one item for each time-step. Each item is a 2D Tensor of shape [batch_size x cell.state_size]. Raises: ValueError: when output_projection has the wrong shape. """ if output_projection is not None: proj_weights = ops.convert_to_tensor(output_projection[0], dtype=dtype) proj_weights.get_shape().assert_is_compatible_with([cell.output_size, num_symbols]) proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) proj_biases.get_shape().assert_is_compatible_with([num_symbols]) with vs.variable_scope(scope or "embedding_tied_rnn_seq2seq"): with ops.device("/cpu:0"): embedding = vs.get_variable("embedding", [num_symbols, cell.input_size]) emb_encoder_inputs = [embedding_ops.embedding_lookup(embedding, x) for x in encoder_inputs] emb_decoder_inputs = [embedding_ops.embedding_lookup(embedding, x) for x in decoder_inputs] def extract_argmax_and_embed(prev, _): """Loop_function that extracts the symbol from prev and embeds it.""" if output_projection is not None: prev = nn_ops.xw_plus_b( prev, output_projection[0], output_projection[1]) prev_symbol = array_ops.stop_gradient(math_ops.argmax(prev, 1)) return embedding_ops.embedding_lookup(embedding, prev_symbol) if output_projection is None: cell = OutputProjectionWrapper(cell, num_symbols) if isinstance(feed_previous, bool): loop_function = extract_argmax_and_embed if feed_previous else None return tied_rnn_seq2seq(emb_encoder_inputs, emb_decoder_inputs, cell, loop_function=loop_function, dtype=dtype) else: # If feed_previous is a Tensor, we construct 2 graphs and use cond. 
outputs1, states1 = tied_rnn_seq2seq( emb_encoder_inputs, emb_decoder_inputs, cell, loop_function=extract_argmax_and_embed, dtype=dtype) vs.get_variable_scope().reuse_variables() outputs2, states2 = tied_rnn_seq2seq( emb_encoder_inputs, emb_decoder_inputs, cell, dtype=dtype) outputs = control_flow_ops.cond(feed_previous, lambda: outputs1, lambda: outputs2) states = control_flow_ops.cond(feed_previous, lambda: states1, lambda: states2) return outputs, states
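extract_argmax_and_embed above is the greedy feed-previous loop function: take the argmax of the previous step's (optionally projected) logits and embed that symbol as the next decoder input. A numpy equivalent of the core step, ignoring the output projection (in TF the argmax is wrapped in stop_gradient):

import numpy as np

vocab_size, embed_size = 10, 4
embedding = np.random.randn(vocab_size, embed_size)
prev_logits = np.random.randn(3, vocab_size)   # [batch, vocab]
prev_symbol = prev_logits.argmax(axis=1)       # greedy symbol choice
next_input = embedding[prev_symbol]            # [batch, embed_size]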
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, pml_targets=None, is_training=False, gta=False, locked_alignments=None, logs_enabled=True): """ Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder features. Only needed for training. is_training: boolean flag that is set to True during training gta: boolean flag that is set to True when ground truth alignment is required locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this parameter and the attention alignments are locked to these values logs_enabled: boolean flag that defaults to True, if False no construction logs output """ # fix the alignments shape to (batch_size, encoder_steps, decoder_steps) if not already including # batch dimension locked_alignments_ = locked_alignments if locked_alignments_ is not None: if np.ndim(locked_alignments_) < 3: locked_alignments_ = np.expand_dims(locked_alignments_, 0) with tf.variable_scope('inference') as scope: batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings embedding_table = tf.get_variable( 'embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_inputs = tf.nn.embedding_lookup( embedding_table, inputs) # [N, T_in, embed_depth=256] # Encoder encoder_outputs = conv_and_gru( # [N, T_in, 2*encoder_gru_units=512] embedded_inputs, input_lengths, conv_layers=hp.encoder_conv_layers, conv_width=hp.encoder_conv_width, conv_channels=hp.encoder_conv_channels, gru_units_unidirectional=hp.encoder_gru_units, is_training=is_training, scope='encoder', ) # Attention attention_cell = LockableAttentionWrapper( GRUCell(hp.attention_depth), LocationSensitiveAttention(hp.attention_depth, encoder_outputs), alignment_history=True, locked_alignments=locked_alignments_, output_attention=False, name='attention_wrapper') # [N, T_in, attention_depth=256] # Concatenate attention context vector and RNN cell output into a # 2*attention_depth=512D vector. 
concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            cells = [GRUCell(hp.decoder_gru_units) for _ in range(hp.decoder_gru_layers)]
            decoder_cell = MultiRNNCell([concat_cell] + cells,
                                        state_is_tuple=True)  # [N, T_in, decoder_depth=1024]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(decoder_cell, hp.pml_dimension * hp.outputs_per_step)

            if is_training or gta:
                if hp.scheduled_sampling:
                    helper = TacoScheduledOutputTrainingHelper(
                        inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step,
                        hp.scheduled_sampling_probability)
                else:
                    helper = TacoTrainingHelper(inputs, pml_targets, hp.pml_dimension,
                                                hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.pml_dimension, hp.outputs_per_step)

            decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

            (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

            # Reshape outputs to be one output per entry
            pml_intermediates = tf.reshape(decoder_outputs,
                                           [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

            # Add Post-Processing Conv and GRU layer:
            expand_outputs = conv_and_gru(  # [N, T_in, 2*expand_gru_units=512]
                pml_intermediates, None,
                conv_layers=hp.expand_conv_layers,
                conv_width=hp.expand_conv_width,
                conv_channels=hp.expand_conv_channels,
                gru_units_unidirectional=hp.expand_gru_units,
                is_training=is_training,
                scope='expand',
            )
            pml_outputs = tf.layers.dense(expand_outputs, hp.pml_dimension)  # [N, T_out, P]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_intermediates = pml_intermediates
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets

            if logs_enabled:
                log('Initialized Tacotron model. Dimensions: ')
                log('    Train mode:               {}'.format(is_training))
                log('    GTA mode:                 {}'.format(gta))  # was logging is_training twice
                log('    Embedding:                {}'.format(embedded_inputs.shape[-1]))
                log('    Encoder out:              {}'.format(encoder_outputs.shape[-1]))
                log('    Attention out:            {}'.format(attention_cell.output_size))
                log('    Concat attn & out:        {}'.format(concat_cell.output_size))
                log('    Decoder cell out:         {}'.format(decoder_cell.output_size))
                log('    Decoder out ({} frames):  {}'.format(hp.outputs_per_step,
                                                              decoder_outputs.shape[-1]))
                log('    Decoder out (1 frame):    {}'.format(pml_intermediates.shape[-1]))
                log('    Expand out:               {}'.format(expand_outputs.shape[-1]))
                log('    PML out:                  {}'.format(pml_outputs.shape[-1]))
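TacoScheduledOutputTrainingHelper above mixes teacher forcing with free running. Under the convention of tf.contrib.seq2seq's ScheduledOutputTrainingHelper, which this helper appears to wrap, sampling_probability is the chance of feeding the model's own previous output instead of the ground truth. A plain-Python sketch with hypothetical names (the repo's helper may differ in detail):

import random

def next_decoder_input(ground_truth_frame, prev_output_frame, sampling_probability):
    # With probability p feed the model's own output, otherwise the ground truth.
    if random.random() < sampling_probability:
        return prev_output_frame
    return ground_truth_frame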
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None): '''Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. ''' with tf.variable_scope('inference') as scope: is_training = linear_targets is not None batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings embedding_table = tf.get_variable( 'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_inputs = tf.nn.embedding_lookup( embedding_table, inputs) # [N, T_in, embed_depth=256] # Encoder prenet_outputs = prenet( embedded_inputs, is_training, hp.prenet_depths) # [N, T_in, prenet_depths[-1]=128] encoder_outputs = encoder_cbhg( prenet_outputs, input_lengths, is_training, hp.encoder_depth) # [N, T_in, encoder_depth=256] # Attention attention_mechanism = LocationSensitiveAttention( hp.attention_depth, encoder_outputs) # [N, T_in, attention_depth=256] # Decoder (layers specified bottom to top): multi_rnn_cell = MultiRNNCell( [ ResidualWrapper(GRUCell(hp.decoder_depth)), ResidualWrapper(GRUCell(hp.decoder_depth)) ], state_is_tuple=True) # [N, T_in, decoder_depth=256] # Project onto r mel spectrograms (predict r outputs at each RNN step): decoder_cell = TacotronDecoderWrapper(is_training, attention_mechanism, multi_rnn_cell) if is_training: helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step) else: helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder( OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step), helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry mel_outputs = tf.reshape( decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M] # Add post-processing CBHG: post_outputs = post_cbhg( mel_outputs, hp.num_mels, is_training, hp.postnet_depth) # [N, T_out, postnet_depth=256] linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state.alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets log('Initialized Tacotron model. 
Dimensions: ') log(' embedding: %d' % embedded_inputs.shape[-1]) log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) # log(' attention out: %d' % attention_cell.output_size) # log(' concat attn & out: %d' % concat_cell.output_size) # log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
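LocationSensitiveAttention here and the Bahdanau/Luong variants used elsewhere in this document share one skeleton: score each encoder step against the decoder query, softmax the scores into an alignment, and take the alignment-weighted sum of encoder outputs as the context vector. An illustrative numpy sketch of that skeleton, using a generic dot-product score rather than the exact location-sensitive energy:

import numpy as np

enc_steps, depth = 11, 256
keys = np.random.randn(enc_steps, depth)    # encoder outputs, one batch element
query = np.random.randn(depth)              # current decoder state
scores = keys @ query                       # [enc_steps]
align = np.exp(scores - scores.max())
align /= align.sum()                        # softmax alignment over encoder steps
context = align @ keys                      # [depth] context vector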
def decoder(inputs, encoder_outputs, is_training, batch_size, mel_targets):
    """ Decoder: Prenet -> Attention RNN -> Postprocessing CBHG
    @param inputs int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
    @param encoder_outputs outputs from the encoder with shape [N, T_in, prenet_depth=256]
    @param is_training flag for training or eval
    @param batch_size number of samples per batch
    @param mel_targets float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries in the mel
    @return linear_outputs, mel_outputs and alignments
    """
    if is_training:
        helper = TacoTrainingHelper(inputs, mel_targets, hparams.num_mels,
                                    hparams.outputs_per_step)
    else:
        helper = TacoTestHelper(batch_size, hparams.num_mels, hparams.outputs_per_step)

    # Attention
    attention_cell = AttentionWrapper(
        GRUCell(hparams.attention_depth),
        BahdanauAttention(hparams.attention_depth, encoder_outputs),
        alignment_history=True,
        output_attention=False)  # [N, T_in, attention_depth=256]

    # Apply prenet before concatenation in AttentionWrapper.
    attention_cell = DecoderPrenetWrapper(attention_cell, is_training, hparams.prenet_depths)

    # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
    concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 2*attention_depth=512]

    # Decoder (layers specified bottom to top):
    decoder_cell = MultiRNNCell([
        OutputProjectionWrapper(concat_cell, hparams.decoder_depth),
        ResidualWrapper(GRUCell(hparams.decoder_depth)),
        ResidualWrapper(GRUCell(hparams.decoder_depth))
    ], state_is_tuple=True)  # [N, T_in, decoder_depth=256]

    # Project onto r mel spectrograms (predict r outputs at each RNN step):
    output_cell = OutputProjectionWrapper(decoder_cell,
                                          hparams.num_mels * hparams.outputs_per_step)
    decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

    (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
        BasicDecoder(output_cell, helper, decoder_init_state),
        maximum_iterations=hparams.max_iters)  # [N, T_out/r, M*r]

    # Reshape outputs to be one output per entry
    mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hparams.num_mels])  # [N, T_out, M]

    # Add post-processing CBHG:
    post_outputs = post_cbhg(mel_outputs, hparams.num_mels, is_training,  # [N, T_out, postnet_depth=256]
                             hparams.postnet_depth)
    linear_outputs = tf.layers.dense(post_outputs, hparams.num_freq)  # [N, T_out, F]

    # Grab alignments from the final decoder state:
    alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

    log('Decoder Network ...')
    log('    attention out: %d' % attention_cell.output_size)
    log('    concat attn & out: %d' % concat_cell.output_size)
    log('    decoder cell out: %d' % decoder_cell.output_size)
    log('    decoder out (%d frames): %d' % (hparams.outputs_per_step, decoder_outputs.shape[-1]))
    log('    decoder out (1 frame): %d' % mel_outputs.shape[-1])
    log('    postnet out: %d' % post_outputs.shape[-1])
    log('    linear out: %d' % linear_outputs.shape[-1])

    return linear_outputs, mel_outputs, alignments
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
        lengths of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
        number of steps in the output time series, M is num_mels, and values are entries
        in the mel spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out
        is number of steps in the output time series, F is num_freq, and values are
        entries in the linear spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        symbols_length = 149  # based on the previous length of the symbol list
        embedding_table = tf.get_variable(
            'embedding', [symbols_length, hp.embed_depth], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table,
                                                 inputs)  # [N, T_in, embed_depth=256]

        # Encoder
        prenet_outputs = prenet(embedded_inputs, is_training,
                                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training,  # [N, T_in, encoder_depth=256]
                                       hp.encoder_depth)

        # Attention
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_depth),
            BahdanauAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            output_attention=False)  # [N, T_in, attention_depth=256]

        # Apply prenet before concatenation in AttentionWrapper.
        attention_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                              hp.prenet_depths)

        # Concatenate attention context vector and RNN cell output into a
        # 2*attention_depth=512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 2*attention_depth=512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell(
            [
                OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                ResidualWrapper(GRUCell(hp.decoder_depth)),
                ResidualWrapper(GRUCell(hp.decoder_depth))
            ],
            state_is_tuple=True)  # [N, T_in, decoder_depth=256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell,
                                              hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        mel_outputs = tf.reshape(decoder_outputs,
                                 [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        # Add post-processing CBHG:
        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training,  # [N, T_out, postnet_depth=256]
                                 hp.postnet_depth)
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(),
                                  [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
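A note on the reshape after `dynamic_decode` in both functions above: the decoder predicts r = `outputs_per_step` stacked frames per RNN step, so [N, T_out/r, M*r] flattens back to [N, T_out, M]. A minimal self-contained check of that shape arithmetic, with illustrative values (numpy's reshape follows the same row-major semantics as tf.reshape):

import numpy as np

N, r, M = 2, 5, 80      # batch size, outputs_per_step, num_mels (illustrative)
steps = 12              # decoder steps actually taken by dynamic_decode
dec = np.zeros((N, steps, M * r), dtype=np.float32)  # [N, T_out/r, M*r]
mel = dec.reshape(N, -1, M)                          # [N, T_out, M]
assert mel.shape == (N, steps * r, M)                # T_out = steps * r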
def __init__(self, sess, config, api, log_dir, forward, scope=None):
    # `forward` selects the graph to build: True for inference (decoding),
    # False for training.
    self.vocab = api.vocab
    self.rev_vocab = api.rev_vocab
    self.vocab_size = len(self.vocab)
    self.seen_intent = api.seen_intent
    self.rev_seen_intent = api.rev_seen_intent
    self.seen_intent_size = len(self.rev_seen_intent)
    self.unseen_intent = api.unseen_intent
    self.rev_unseen_intent = api.rev_unseen_intent
    self.unseen_intent_size = len(self.rev_unseen_intent)
    self.sess = sess
    self.scope = scope
    self.max_utt_len = config.max_utt_len
    self.go_id = self.rev_vocab["<s>"]
    self.eos_id = self.rev_vocab["</s>"]
    self.sent_cell_size = config.sent_cell_size
    self.dec_cell_size = config.dec_cell_size
    self.label_embed_size = config.label_embed_size
    self.latent_size = config.latent_size
    self.seed = config.seed
    self.use_ot_label = config.use_ot_label
    # Only meaningful if use_ot_label is True: whether to use random other
    # labels instead of all other labels.
    self.use_rand_ot_label = config.use_rand_ot_label
    # Only meaningful when use_ot_label=True and use_rand_ot_label=True.
    self.use_rand_fixed_ot_label = config.use_rand_fixed_ot_label
    if self.use_ot_label:
        # Only meaningful when use_ot_label=True and use_rand_ot_label=True.
        self.rand_ot_label_num = config.rand_ot_label_num
    else:
        self.rand_ot_label_num = self.seen_intent_size - 1

    with tf.name_scope("io"):
        # All dialog context and known attributes.
        # Each utterance has one label: [batch_size,]
        self.labels = tf.placeholder(dtype=tf.int32, shape=(None,), name="labels")
        self.ot_label_rand = tf.placeholder(dtype=tf.int32, shape=(None, None),
                                            name="ot_labels_rand")
        # (batch_size, len(api.label_vocab) - 1)
        self.ot_labels_all = tf.placeholder(dtype=tf.int32, shape=(None, None),
                                            name="ot_labels_all")

        # Target response given the dialog context.
        self.io_tokens = tf.placeholder(dtype=tf.int32, shape=(None, None),
                                        name="output_tokens")
        self.io_lens = tf.placeholder(dtype=tf.int32, shape=(None,),
                                      name="output_lens")
        self.output_labels = tf.placeholder(dtype=tf.int32, shape=(None,),
                                            name="output_labels")

        # Optimization-related variables.
        self.learning_rate = tf.Variable(float(config.init_lr), trainable=False,
                                         name="learning_rate")
        self.learning_rate_decay_op = self.learning_rate.assign(
            tf.multiply(self.learning_rate, config.lr_decay))
        self.global_t = tf.placeholder(dtype=tf.int32, name="global_t")
        # Whether to sample z from the prior (True) or from the recognition
        # network's posterior (False).
        self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior")
        self.prior_mulogvar = tf.placeholder(dtype=tf.float32,
                                             shape=(None, config.latent_size * 2),
                                             name="prior_mulogvar")
        self.batch_size = tf.placeholder(dtype=tf.int32, name="batch_size")

    max_out_len = array_ops.shape(self.io_tokens)[1]
    # batch_size = array_ops.shape(self.io_tokens)[0]
    batch_size = self.batch_size

    with variable_scope.variable_scope("labelEmbedding", reuse=tf.AUTO_REUSE):
        self.la_embedding = tf.get_variable(
            "embedding", [self.seen_intent_size, config.label_embed_size],
            dtype=tf.float32)
        label_embedding = embedding_ops.embedding_lookup(
            self.la_embedding, self.output_labels)  # not used

    with variable_scope.variable_scope("wordEmbedding", reuse=tf.AUTO_REUSE):
        self.embedding = tf.get_variable(
            "embedding", [self.vocab_size, config.embed_size],
            dtype=tf.float32, trainable=False)
        # Zero out the padding embedding (row 0) via broadcasting: the first
        # row of the masked table is all zeros.
        embedding_mask = tf.constant(
            [0 if i == 0 else 1 for i in range(self.vocab_size)],
            dtype=tf.float32, shape=[self.vocab_size, 1])
        embedding = self.embedding * embedding_mask
        io_embedding = embedding_ops.embedding_lookup(embedding,
                                                      self.io_tokens)  # 3-dim

        if config.sent_type == "bow":
            io_embedding, _ = get_bow(io_embedding)
        elif config.sent_type == "rnn":
            sent_cell = self.get_rnncell("gru", self.sent_cell_size,
                                         config.keep_prob, 1)
            io_embedding, _ = get_rnn_encode(io_embedding, sent_cell, self.io_lens,
                                             scope="sent_rnn", reuse=tf.AUTO_REUSE)
        elif config.sent_type == "bi_rnn":
            fwd_sent_cell = self.get_rnncell("gru", self.sent_cell_size,
                                             keep_prob=1.0, num_layer=1)
            bwd_sent_cell = self.get_rnncell("gru", self.sent_cell_size,
                                             keep_prob=1.0, num_layer=1)
            # Equals x in the graph: (batch_size, 300*2)
            io_embedding, _ = get_bi_rnn_encode(io_embedding, fwd_sent_cell,
                                                bwd_sent_cell, self.io_lens,
                                                scope="sent_bi_rnn",
                                                reuse=tf.AUTO_REUSE)
        else:
            raise ValueError("Unknown sent_type. Must be one of [bow, rnn, bi_rnn]")

    # Convert labels to one-hot.
    my_label_one_hot = tf.one_hot(tf.reshape(self.labels, [-1]),
                                  depth=self.seen_intent_size,
                                  dtype=tf.float32)  # 2-dim
    if config.use_ot_label:
        if config.use_rand_ot_label:
            ot_label_one_hot = tf.one_hot(tf.reshape(self.ot_label_rand, [-1]),
                                          depth=self.seen_intent_size,
                                          dtype=tf.float32)
            ot_label_one_hot = tf.reshape(
                ot_label_one_hot,
                [-1, self.seen_intent_size * self.rand_ot_label_num])
        else:
            ot_label_one_hot = tf.one_hot(tf.reshape(self.ot_labels_all, [-1]),
                                          depth=self.seen_intent_size,
                                          dtype=tf.float32)
            # (batch_size, len(api.label_vocab) * (len(api.label_vocab) - 1))
            ot_label_one_hot = tf.reshape(
                ot_label_one_hot,
                [-1, self.seen_intent_size * (self.seen_intent_size - 1)])

    with variable_scope.variable_scope("recognitionNetwork", reuse=tf.AUTO_REUSE):
        recog_input = io_embedding
        self.recog_mulogvar = recog_mulogvar = layers.fully_connected(
            recog_input, config.latent_size * 2, activation_fn=None,
            scope="muvar")  # config.latent_size=200
        # Recognition network output: each part is (batch_size, latent_size).
        recog_mu, recog_logvar = tf.split(recog_mulogvar, 2, axis=1)

    with variable_scope.variable_scope("priorNetwork", reuse=tf.AUTO_REUSE):
        # p(x, y, z) = p(z) p(x|z) p(y|x, z)
        # Prior network parameters; assume a normal distribution. The prior
        # mu/logvar are fed in as a placeholder (building them as an in-graph
        # constant does not work here).
        prior_mulogvar = self.prior_mulogvar
        prior_mu, prior_logvar = tf.split(prior_mulogvar, 2, axis=1)

        # Use the sampled z from the prior, or the posterior z, depending on
        # use_prior: (batch_size, latent_size).
        latent_sample = tf.cond(
            self.use_prior,
            lambda: sample_gaussian(prior_mu, prior_logvar),
            lambda: sample_gaussian(recog_mu, recog_logvar))
        self.z = latent_sample

    with variable_scope.variable_scope("generationNetwork", reuse=tf.AUTO_REUSE):
        bow_loss_inputs = latent_sample  # (part of) response network input
        label_inputs = latent_sample
        dec_inputs = latent_sample

        # BOW loss.
        if config.use_bow_loss:
            bow_fc1 = layers.fully_connected(bow_loss_inputs, 400,
                                             activation_fn=tf.tanh,
                                             scope="bow_fc1")  # MLPb hidden layer
            if config.keep_prob < 1.0:
                bow_fc1 = tf.nn.dropout(bow_fc1, config.keep_prob)
            self.bow_logits = layers.fully_connected(
                bow_fc1, self.vocab_size, activation_fn=None,
                scope="bow_project")  # MLPb output

        # Y loss, including the other labels.
        my_label_fc1 = layers.fully_connected(label_inputs, 400,
                                              activation_fn=tf.tanh,
                                              scope="my_label_fc1")
        if config.keep_prob < 1.0:
            my_label_fc1 = tf.nn.dropout(my_label_fc1, config.keep_prob)
        # my_label_fc2 = layers.fully_connected(my_label_fc1, 400, activation_fn=tf.tanh, scope="my_label_fc2")
        # if config.keep_prob < 1.0:
        #     my_label_fc2 = tf.nn.dropout(my_label_fc2, config.keep_prob)
        self.my_label_logits = layers.fully_connected(
            my_label_fc1, self.seen_intent_size,
            scope="my_label_project")  # MLPy output
        # Softmax output: (batch_size, label_vocab_size)
        my_label_prob = tf.nn.softmax(self.my_label_logits)
        self.my_label_prob = my_label_prob
        # Predicted label embedding y: (batch_size, label_embed_size)
        pred_my_label_embedding = tf.matmul(my_label_prob, self.la_embedding)

        if config.use_ot_label:
            if config.use_rand_ot_label:
                # Use random other labels.
                ot_label_fc1 = layers.fully_connected(label_inputs, 400,
                                                      activation_fn=tf.tanh,
                                                      scope="ot_label_fc1")
                if config.keep_prob < 1.0:
                    ot_label_fc1 = tf.nn.dropout(ot_label_fc1, config.keep_prob)
                self.ot_label_logits = layers.fully_connected(
                    ot_label_fc1,
                    self.rand_ot_label_num * self.seen_intent_size,
                    scope="ot_label_rand_project")
                ot_label_logits_split = tf.reshape(
                    self.ot_label_logits,
                    [-1, self.rand_ot_label_num, self.seen_intent_size])
                ot_label_prob_short = tf.nn.softmax(ot_label_logits_split)
                # (batch_size, rand_ot_label_num * label_vocab_size)
                ot_label_prob = tf.reshape(
                    ot_label_prob_short,
                    [-1, self.rand_ot_label_num * self.seen_intent_size])
                # Predicted other-label embeddings y2. Note: matmul broadcasts,
                # e.g. (3, 4, 5) x (5, 4) -> (3, 4, 4). The -1 keeps the batch
                # dimension (the original reshape dropped it).
                pred_ot_label_embedding = tf.reshape(
                    tf.matmul(ot_label_prob_short, self.la_embedding),
                    [-1, self.label_embed_size * self.rand_ot_label_num])
            else:
                ot_label_fc1 = layers.fully_connected(label_inputs, 400,
                                                      activation_fn=tf.tanh,
                                                      scope="ot_label_fc1")
                if config.keep_prob < 1.0:
                    ot_label_fc1 = tf.nn.dropout(ot_label_fc1, config.keep_prob)
                self.ot_label_logits = layers.fully_connected(
                    ot_label_fc1,
                    self.seen_intent_size * (self.seen_intent_size - 1),
                    scope="ot_label_all_project")
                ot_label_logits_split = tf.reshape(
                    self.ot_label_logits,
                    [-1, self.seen_intent_size - 1, self.seen_intent_size])
                ot_label_prob_short = tf.nn.softmax(ot_label_logits_split)
                # (batch_size, label_vocab_size * (label_vocab_size - 1))
                ot_label_prob = tf.reshape(
                    ot_label_prob_short,
                    [-1, self.seen_intent_size * (self.seen_intent_size - 1)])
                # Predicted embeddings of all other labels:
                # (batch_size, label_embed_size * (label_vocab_size - 1)).
                pred_ot_label_embedding = tf.reshape(
                    tf.matmul(ot_label_prob_short, self.la_embedding),
                    [-1, self.label_embed_size * (self.seen_intent_size - 1)])
        else:
            # Only use label y.
            self.ot_label_logits = None
            pred_ot_label_embedding = None

        # Decoder (response network) initial state.
        if config.num_layer > 1:
            dec_init_state = []
            for i in range(config.num_layer):
                temp_init = layers.fully_connected(dec_inputs, self.dec_cell_size,
                                                   activation_fn=None,
                                                   scope="init_state-%d" % i)
                if config.cell_type == 'lstm':
                    temp_init = rnn_cell.LSTMStateTuple(temp_init, temp_init)
                dec_init_state.append(temp_init)
            dec_init_state = tuple(dec_init_state)
        else:
            dec_init_state = layers.fully_connected(dec_inputs, self.dec_cell_size,
                                                    activation_fn=None,
                                                    scope="init_state")
            if config.cell_type == 'lstm':
                dec_init_state = rnn_cell.LSTMStateTuple(dec_init_state,
                                                         dec_init_state)

    with variable_scope.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        dec_cell = self.get_rnncell(config.cell_type, self.dec_cell_size,
                                    config.keep_prob, config.num_layer)
        dec_cell = OutputProjectionWrapper(dec_cell, self.vocab_size)

        if forward:  # inference
            loop_func = decoder_fn_lib.context_decoder_fn_inference(
                None, dec_init_state, embedding,
                start_of_sequence_id=self.go_id,
                end_of_sequence_id=self.eos_id,
                maximum_length=self.max_utt_len,
                num_decoder_symbols=self.vocab_size,
                context_vector=None)
            dec_input_embedding = None
            dec_seq_lens = None
        else:  # training
            loop_func = decoder_fn_lib.context_decoder_fn_train(dec_init_state, None)
            # x's embedding: (batch_size, utt_len, embed_size)
            dec_input_embedding = embedding_ops.embedding_lookup(embedding,
                                                                 self.io_tokens)
            dec_input_embedding = dec_input_embedding[:, 0:-1, :]  # drop the last </s>
            dec_seq_lens = self.io_lens - 1  # lengths exclude the trailing </s>

            if config.keep_prob < 1.0:
                dec_input_embedding = tf.nn.dropout(dec_input_embedding,
                                                    config.keep_prob)

            # Apply word dropping: set dropped words to 0.
            if config.dec_keep_prob < 1.0:
                keep_mask = tf.less_equal(
                    tf.random_uniform((batch_size, max_out_len - 1),
                                      minval=0.0, maxval=1.0),
                    config.dec_keep_prob)
                keep_mask = tf.expand_dims(tf.to_float(keep_mask), 2)
                dec_input_embedding = dec_input_embedding * keep_mask
                dec_input_embedding = tf.reshape(
                    dec_input_embedding, [-1, max_out_len - 1, config.embed_size])

        # dec_outs: [batch_size, seq, features]
        dec_outs, _, final_context_state = dynamic_rnn_decoder(
            dec_cell, loop_func, inputs=dec_input_embedding,
            sequence_length=dec_seq_lens)

        if final_context_state is not None:
            final_context_state = final_context_state[:, 0:array_ops.shape(dec_outs)[1]]
            # 1 for steps that emitted output, 0 for padded steps.
            mask = tf.to_int32(tf.sign(tf.reduce_max(dec_outs, axis=2)))
            self.dec_out_words = tf.multiply(
                tf.reverse(final_context_state, axis=[1]), mask)
        else:
            # (batch_size, utt_len); each element is a word index.
            self.dec_out_words = tf.argmax(dec_outs, 2)

    if not forward:
        with variable_scope.variable_scope("loss", reuse=tf.AUTO_REUSE):
            # Exclude the first token <s>: (batch_size, utt_len)
            labels = self.io_tokens[:, 1:]
            label_mask = tf.to_float(tf.sign(labels))
            labels = tf.one_hot(labels, depth=self.vocab_size, dtype=tf.float32)

            # Response network (reconstruction) loss.
            rc_loss = tf.nn.softmax_cross_entropy_with_logits(logits=dec_outs,
                                                              labels=labels)
            # Sum over time; the mask excludes padding: (batch_size,)
            rc_loss = tf.reduce_sum(rc_loss * label_mask, axis=1)
            self.avg_rc_loss = tf.reduce_mean(rc_loss)  # scalar
            # Used only for perplexity calculation, not for optimization.
            self.rc_ppl = tf.exp(tf.reduce_sum(rc_loss) / tf.reduce_sum(label_mask))

            # BOW loss: treat the response as an n-trial multinomial distribution.
            # (batch_size, max_out_len-1, vocab_size)
            tile_bow_logits = tf.tile(tf.expand_dims(self.bow_logits, 1),
                                      [1, max_out_len - 1, 1])
            bow_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=tile_bow_logits, labels=labels) * label_mask
            bow_loss = tf.reduce_sum(bow_loss, axis=1)  # (batch_size,)
            self.avg_bow_loss = tf.reduce_mean(bow_loss)  # scalar

            # Label y loss. Pass the pre-softmax logits; feeding the softmax
            # output (as the original did) would apply softmax twice.
            my_label_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.my_label_logits,
                labels=my_label_one_hot)  # (batch_size,)
            self.avg_my_label_loss = tf.reduce_mean(my_label_loss)
            if config.use_ot_label:
                # Negated: push predictions away from the other labels.
                ot_label_loss = -tf.nn.softmax_cross_entropy_with_logits(
                    logits=ot_label_prob, labels=ot_label_one_hot)
                self.avg_ot_label_loss = tf.reduce_mean(ot_label_loss)
            else:
                self.avg_ot_label_loss = 0.0

            # KL divergence between posterior and prior: (batch_size,)
            kld = gaussian_kld(recog_mu, recog_logvar, prior_mu, prior_logvar)
            self.avg_kld = tf.reduce_mean(kld)  # scalar

            # KL annealing weight.
            if log_dir is not None:
                kl_weights = tf.minimum(
                    tf.to_float(self.global_t) / config.full_kl_step, 1.0)
            else:
                kl_weights = tf.constant(1.0)
            self.kl_w = kl_weights

            # ELBO: reconstruction loss plus (annealed) KL divergence.
            self.elbo = self.avg_rc_loss + kl_weights * self.avg_kld

            # Total (augmented) loss. Optional per-label scaling, e.g.
            # (1/self.rand_ot_label_num) or (1/(self.label_vocab_size-1)) on
            # avg_ot_label_loss, is left disabled as in the original.
            aug_elbo = (self.avg_bow_loss + 1000 * self.avg_my_label_loss +
                        10 * self.avg_ot_label_loss + self.elbo)

            tf.summary.scalar("rc_loss", self.avg_rc_loss)
            tf.summary.scalar("elbo", self.elbo)
            tf.summary.scalar("kld", self.avg_kld)
            tf.summary.scalar("bow_loss", self.avg_bow_loss)
            tf.summary.scalar("my_label_loss", self.avg_my_label_loss)
            tf.summary.scalar("ot_label_loss", self.avg_ot_label_loss)
            self.summary_op = tf.summary.merge_all()

            # Log-densities of z under the prior and the posterior.
            self.log_p_z = norm_log_liklihood(latent_sample, prior_mu, prior_logvar)
            self.log_q_z_xy = norm_log_liklihood(latent_sample, recog_mu,
                                                 recog_logvar)
            self.est_marginal = tf.reduce_mean(rc_loss + bow_loss -
                                               self.log_p_z + self.log_q_z_xy)

        self.optimize(sess, config, aug_elbo, log_dir)

    self.saver = tf.train.Saver(tf.global_variables(),
                                write_version=tf.train.SaverDef.V2)
    print('Model construction finished.')
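`sample_gaussian`, `gaussian_kld`, and `norm_log_liklihood` are imported from elsewhere in this codebase. In CVAE implementations of this style they are usually the reparameterization trick and the closed-form diagonal-Gaussian KL; a minimal sketch under that assumption (not necessarily the original definitions):

import tensorflow as tf

def sample_gaussian(mu, logvar):
    # Reparameterization trick: z = mu + sigma * eps with eps ~ N(0, I),
    # so gradients flow through mu and logvar.
    eps = tf.random_normal(tf.shape(logvar))
    return mu + tf.exp(0.5 * logvar) * eps

def gaussian_kld(recog_mu, recog_logvar, prior_mu, prior_logvar):
    # Closed-form KL(q || p) between diagonal Gaussians, summed over
    # latent dimensions; returns a (batch_size,) tensor.
    return -0.5 * tf.reduce_sum(
        1 + (recog_logvar - prior_logvar)
        - tf.square(prior_mu - recog_mu) / tf.exp(prior_logvar)
        - tf.exp(recog_logvar) / tf.exp(prior_logvar), axis=1)

With a unit-Gaussian prior (prior_mu = 0, prior_logvar = 0) this reduces to the familiar VAE term -0.5 * sum(1 + logvar - mu^2 - exp(logvar)), which is what `avg_kld` above averages over the batch.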