def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None): '''Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. ''' with tf.variable_scope('inference') as scope: is_training = linear_targets is not None batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings embedding_table = tf.get_variable( 'embedding', [len(symbols), 256], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) # [N, T_in, 256] # Global style tokens (GST) gst_tokens = tf.get_variable( 'style_tokens', [hp.num_gst, 256], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) # Encoder prenet_outputs = prenet(embedded_inputs, is_training) # [N, T_in, 128] encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training) # [N, T_in, 256] if is_training: # Reference encoder reference_embedding = reference_encoder( mel_targets, filters=[32, 32, 64, 64, 128, 128], kernel_size=(3, 3), strides=(2, 2), is_training=is_training) # Style token layer style_embedding = multi_head_attention( num_heads=hp.num_heads, queries=tf.expand_dims(reference_embedding, axis=1), # [N, 1, 128] memory=tf.tile(tf.expand_dims(gst_tokens, axis=0), [batch_size, 1, 1]), # [N, hp.num_gst, 256] num_units=128) else: # TODO Add support for reference mode and more effective style control during inference. # Randomly select style embedding from gst_tokens for simplicity. random_index = tf.random_uniform([batch_size], maxval=hp.num_gst, dtype=tf.int32) style_embedding = tf.nn.embedding_lookup( gst_tokens, random_index) # Add style embedding to every text encoder state, applying tanh to # compress both encoder state and style embedding to the same scale. encoder_outputs += tf.nn.tanh(style_embedding) # Attention attention_cell = AttentionWrapper( DecoderPrenetWrapper(GRUCell(256), is_training), BahdanauAttention(256, encoder_outputs, memory_sequence_length=input_lengths), alignment_history=True, output_attention=False) # [N, T_in, 256] # Concatenate attention context vector and RNN cell output into a 512D vector. 
concat_cell = ConcatOutputAndAttentionWrapper( attention_cell) # [N, T_in, 512] # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell([ OutputProjectionWrapper(concat_cell, 256), ResidualWrapper( ZoneoutWrapper(LSTMBlockCell(256), (0.1, 0.1), is_training)), ResidualWrapper( ZoneoutWrapper(LSTMBlockCell(256), (0.1, 0.1), is_training)), ], state_is_tuple=True) # [N, T_in, 256] # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper( decoder_cell, hp.num_mels * hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) if is_training: helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step) else: helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) if is_training: (decoder_outputs, _ ), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state)) else: (decoder_outputs, _ ), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry mel_outputs = tf.reshape( decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M] # Add post-processing CBHG: post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training) # [N, T_out, 256] linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets tf.logging.info('Initialized Tacotron model. Dimensions: ') tf.logging.info(' embedding: %d' % embedded_inputs.shape[-1]) tf.logging.info(' prenet out: %d' % prenet_outputs.shape[-1]) tf.logging.info(' encoder out: %d' % encoder_outputs.shape[-1]) tf.logging.info(' attention out: %d' % attention_cell.output_size) tf.logging.info(' concat attn & out: %d' % concat_cell.output_size) tf.logging.info(' decoder cell out: %d' % decoder_cell.output_size) tf.logging.info(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) tf.logging.info(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) tf.logging.info(' postnet out: %d' % post_outputs.shape[-1]) tf.logging.info(' linear out: %d' % linear_outputs.shape[-1])
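# Illustrative sketch (not part of the model above): the decoder emits r stacked mel frames per
# step ([N, T_out/r, M*r]) and the model reshapes them to one frame per time step ([N, T_out, M]).
# A minimal NumPy version of that same reshape, with made-up sizes:
import numpy as np

N, decoder_steps, r, num_mels = 2, 10, 5, 80                      # hypothetical batch/step/r/mel sizes
decoder_outputs = np.random.randn(N, decoder_steps, num_mels * r) # [N, T_out/r, M*r]

mel_outputs = decoder_outputs.reshape(N, -1, num_mels)            # -> [N, T_out, M]
assert mel_outputs.shape == (N, decoder_steps * r, num_mels)

# Frame k of decoder step t comes from the k-th chunk of that step's last axis:
assert np.allclose(mel_outputs[:, 3], decoder_outputs[:, 0, 3 * num_mels:4 * num_mels])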
def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell, num_encoder_symbols, num_decoder_symbols, embedding_size, num_heads=1, output_projection=None, feed_previous=False, dtype=dtypes.float32, scope=None): """Embedding sequence-to-sequence model with attention. This model first embeds encoder_inputs by a newly created embedding (of shape [num_encoder_symbols x cell.input_size]). Then it runs an RNN to encode embedded encoder_inputs into a state vector. It keeps the outputs of this RNN at every step to use for attention later. Next, it embeds decoder_inputs by another newly created embedding (of shape [num_decoder_symbols x cell.input_size]). Then it runs attention decoder, initialized with the last encoder state, on embedded decoder_inputs and attending to encoder outputs. Args: encoder_inputs: a list of 1D int32 Tensors of shape [batch_size]. decoder_inputs: a list of 1D int32 Tensors of shape [batch_size]. cell: RNNCell defining the cell function and size. num_encoder_symbols: integer; number of symbols on the encoder side. num_decoder_symbols: integer; number of symbols on the decoder side. num_heads: number of attention heads that read from attention_states. output_projection: None or a pair (W, B) of output projection weights and biases; W has shape [cell.output_size x num_decoder_symbols] and B has shape [num_decoder_symbols]; if provided and feed_previous=True, each fed previous output will first be multiplied by W and added B. feed_previous: Boolean or scalar Boolean Tensor; if True, only the first of decoder_inputs will be used (the "GO" symbol), and all other decoder inputs will be taken from previous outputs (as in embedding_rnn_decoder). If False, decoder_inputs are used as given (the standard decoder case). dtype: The dtype of the initial RNN state (default: tf.float32). scope: VariableScope for the created subgraph; defaults to "embedding_attention_seq2seq". Returns: outputs: A list of the same length as decoder_inputs of 2D Tensors with shape [batch_size x num_decoder_symbols] containing the generated outputs. states: The state of each decoder cell in each time-step. This is a list with length len(decoder_inputs) -- one item for each time-step. Each item is a 2D Tensor of shape [batch_size x cell.state_size]. """ with vs.variable_scope(scope or "embedding_attention_seq2seq"): # Encoder. encoder_cell = EmbeddingWrapper(cell, num_encoder_symbols, embedding_size) encoder_outputs, encoder_states = rnn(encoder_cell, encoder_inputs, dtype=dtype) # First calculate a concatenation of encoder outputs to put attention on. top_states = [ array_ops.reshape(e, [-1, 1, cell.output_size]) for e in encoder_outputs ] attention_states = array_ops.concat(top_states, 1) # Decoder. output_size = None if output_projection is None: cell = OutputProjectionWrapper(cell, num_decoder_symbols) output_size = num_decoder_symbols if isinstance(feed_previous, bool): return embedding_attention_decoder( decoder_inputs, encoder_states[-1], attention_states, cell, num_decoder_symbols, embedding_size, num_heads, output_size, output_projection, feed_previous) else: # If feed_previous is a Tensor, we construct 2 graphs and use cond. 
outputs1, states1 = embedding_attention_decoder( decoder_inputs, encoder_states[-1], attention_states, cell, num_decoder_symbols, embedding_size, num_heads, output_size, output_projection, True) vs.get_variable_scope().reuse_variables() outputs2, states2 = embedding_attention_decoder( decoder_inputs, encoder_states[-1], attention_states, cell, num_decoder_symbols, embedding_size, num_heads, output_size, output_projection, False) outputs = control_flow_ops.cond(feed_previous, lambda: outputs1, lambda: outputs2) states = control_flow_ops.cond(feed_previous, lambda: states1, lambda: states2) return outputs, states
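# Illustrative sketch: the attention decoder above consumes attention_states of shape
# [batch, T, cell.output_size]. A rough NumPy version of additive (Bahdanau-style) attention over
# such states, with invented toy weights -- not the exact TF graph built by embedding_attention_decoder:
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

B, T, H = 2, 7, 16                                    # toy batch, encoder steps, hidden size
attention_states = np.random.randn(B, T, H)           # concatenated encoder outputs
decoder_state = np.random.randn(B, H)                 # current decoder state

W_enc, W_dec, v = np.random.randn(H, H), np.random.randn(H, H), np.random.randn(H)
scores = np.tanh(attention_states @ W_enc + (decoder_state @ W_dec)[:, None, :]) @ v  # [B, T]
weights = softmax(scores)                                                              # [B, T]
context = (weights[:, :, None] * attention_states).sum(axis=1)                         # [B, H]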
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, reference_mel=None, reference_weight=None): '''Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. ''' with tf.variable_scope('inference') as scope: is_training = linear_targets is not None is_teacher_force_generating = mel_targets is not None batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings embedding_table = tf.get_variable( 'text_embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) # [N, T_in, 256] if hp.use_gst: #Global style tokens (GST) gst_tokens = tf.get_variable( 'style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) self.gst_tokens = gst_tokens # Encoder prenet_outputs = prenet(embedded_inputs, is_training) # [N, T_in, 128] encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training) # [N, T_in, 256] if is_training: reference_mel = mel_targets if reference_mel is not None: # Reference encoder refnet_outputs = reference_encoder( reference_mel, filters=hp.reference_filters, kernel_size=(3, 3), strides=(2, 2), encoder_cell=GRUCell(hp.reference_depth), is_training=is_training) # [N, 128] self.refnet_outputs = refnet_outputs if hp.use_gst: # Style attention style_attention = MultiheadAttention( tf.expand_dims(refnet_outputs, axis=1), # [N, 1, 128] tf.tanh( tf.tile( tf.expand_dims(gst_tokens, axis=0), [batch_size, 1, 1 ])), # [N, hp.num_gst, 256/hp.num_heads] num_heads=hp.num_heads, num_units=hp.style_att_dim, attention_type=hp.style_att_type) style_weights, style_embeddings = style_attention.multi_head_attention( ) # [N, 1, 256] else: style_embeddings = tf.expand_dims(refnet_outputs, axis=1) # [N, 1, 128] elif reference_weight is not None: print("Use specific weight for GST.") specific_weights = tf.expand_dims(reference_weight, axis=0) specific_weights = tf.tile(specific_weights, [hp.num_heads, 1], name="specific_weights") # specific_weights = tf.tile(specific_weights, [hp.num_heads, 1]) # specific_weights = tf.nn.softmax(specific_weights, axis=-1, name="specific_weights") style_embeddings = tf.matmul(specific_weights, tf.nn.tanh(gst_tokens)) style_embeddings = tf.expand_dims(style_embeddings, axis=0) style_embeddings = tf.tile(style_embeddings, [batch_size, 1, 1]) style_embeddings = tf.reshape( style_embeddings, shape=[batch_size, 1, hp.style_embed_depth]) style_weights = tf.expand_dims(specific_weights, axis=0) else: print("Use random weight for GST.") random_weights = tf.random_uniform([hp.num_heads, hp.num_gst], maxval=1.0, dtype=tf.float32) random_weights = tf.nn.softmax(random_weights, axis=-1, 
name="random_weights") style_embeddings = tf.matmul(random_weights, tf.nn.tanh(gst_tokens)) style_embeddings = tf.expand_dims(style_embeddings, axis=0) style_embeddings = tf.tile(style_embeddings, [batch_size, 1, 1]) style_embeddings = tf.reshape( style_embeddings, shape=[batch_size, 1, hp.style_embed_depth]) style_weights = tf.expand_dims(random_weights, axis=0) # Add style embedding to every text encoder state style_embeddings = tf.tile( style_embeddings, [1, shape_list(encoder_outputs)[1], 1]) # [N, T_in, 128] # encoder_outputs = tf.concat([encoder_outputs, style_embeddings], axis=-1) encoder_outputs = encoder_outputs + style_embeddings # Attention attention_cell = AttentionWrapper( GRUCell(hp.attention_depth), BahdanauAttention(hp.attention_depth, encoder_outputs, memory_sequence_length=input_lengths), alignment_history=True, output_attention=False) # [N, T_in, 256] # Concatenate attention context vector and RNN cell output. concat_cell = ConcatOutputAndAttentionWrapper(attention_cell) # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell([ OutputProjectionWrapper(concat_cell, hp.rnn_depth), ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)), ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)) ], state_is_tuple=True) # [N, T_in, 256] # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper( decoder_cell, hp.num_mels * hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) if is_training or is_teacher_force_generating: helper = TacoTrainingHelper(inputs, mel_targets, hp) else: helper = TacoTestHelper(batch_size, hp) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry mel_outputs = tf.reshape( decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M] # Add post-processing CBHG: post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training) # [N, T_out, 256] linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.mel_outputs = mel_outputs self.encoder_outputs = encoder_outputs self.style_weights = style_weights self.style_embeddings = style_embeddings self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets self.reference_mel = reference_mel self.reference_weight = reference_weight log('Initialized Tacotron model. Dimensions: ') log(' text embedding: %d' % embedded_inputs.shape[-1]) log(' style embedding: %d' % style_embeddings.shape[-1]) log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None): '''Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. ''' with tf.variable_scope('inference') as scope: is_training = linear_targets is not None batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings embedding_table = tf.get_variable( 'embedding', [len(symbols), 256], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) # [N, T_in, 256] # Encoder prenet_outputs = prenet(embedded_inputs, is_training) # [N, T_in, 128] encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training) # [N, T_in, 256] # Attention attention_cell = AttentionWrapper( DecoderPrenetWrapper(GRUCell(256), is_training), BahdanauAttention(256, encoder_outputs), alignment_history=True, output_attention=False) # [N, T_in, 256] # Concatenate attention context vector and RNN cell output into a 512D vector. concat_cell = ConcatOutputAndAttentionWrapper( attention_cell) # [N, T_in, 512] # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell( [ OutputProjectionWrapper(concat_cell, 256), #ResidualWrapper(GRUCell(256)), #ResidualWrapper(GRUCell(256)) ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1)), ResidualWrapper(ZoneoutWrapper(LSTMCell(256), 0.1)) ], state_is_tuple=True) # [N, T_in, 256] # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper( decoder_cell, hp.num_mels * hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) if is_training: helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step) else: helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry mel_outputs = tf.reshape( decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M] # Add post-processing CBHG: post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training) # [N, T_out, 256] linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets log('Initialized Tacotron model. 
Dimensions: ') log(' embedding: %d' % embedded_inputs.shape[-1]) log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
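# Illustrative sketch: ZoneoutWrapper itself is not shown in this corpus; this is a rough NumPy
# version of the zoneout idea it presumably implements (during training each state unit keeps its
# previous value with some probability), with a hypothetical 256-unit state:
import numpy as np

def zoneout(prev_state, new_state, rate=0.1, is_training=True, rng=np.random):
    # Training: randomly keep the previous value per unit. Inference: use the expected mix.
    if is_training:
        keep_prev = rng.random_sample(prev_state.shape) < rate
        return np.where(keep_prev, prev_state, new_state)
    return rate * prev_state + (1.0 - rate) * new_state

prev = np.zeros(256)
new = np.random.randn(256)
mixed = zoneout(prev, new, rate=0.1)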
def initialize( self, inputs, input_lengths, num_speakers, speaker_id, mel_targets=None, linear_targets=None, loss_coeff=None, rnn_decoder_test_mode=False, is_randomly_initialized=False, ): is_training = linear_targets is not None # linear_targets가 초기값(None)이면 False self.is_randomly_initialized = is_randomly_initialized # 초기값 False with tf.variable_scope('inference') as scope: # 'inference'라는 이름으로 묶음 hp = self._hparams batch_size = tf.shape(inputs)[ 0] # 첫번째 차원은 샘플 수, 두번째 차원은 입력 특성 수 (여기선 샘플수) # Embeddings char_embed_table = tf.get_variable( 'embedding', [len(symbols), hp.embedding_size], dtype=tf. float32, # list : variable이 소속될 collection에 대한 리스트 한글의 종류수와 임베딩 크기에 속해있다. , 'embedding이라는 이름의 공유 변수 생성 initializer=tf.truncated_normal_initializer(stddev=0.5) ) # initializer : 초기화한 가중치 dtype : 리턴한 tensor의 타입 # [N, T_in, embedding_size] char_embedded_inputs = \ tf.nn.embedding_lookup(char_embed_table, inputs) # inputs의 인덱스에 따라 char_embed_table값 리턴 self.num_speakers = num_speakers if self.num_speakers > 1: # 다중화자일때 if hp.speaker_embedding_size != 1: # hparams의 speaker_embedding_size값이 1이 아닐때 speaker_embed_table = tf.get_variable( # 공유변수 생성 'speaker_embedding', # 'speaker_embedding'이라는 이름의 [self.num_speakers, hp.speaker_embedding_size], dtype=tf. float32, # num_speakers와 speaker_embedding_size에 속해있는 initializer=tf.truncated_normal_initializer( stddev=0.5)) # 초기화값 가중치 # [N, T_in, speaker_embedding_size] speaker_embed = tf.nn.embedding_lookup( speaker_embed_table, speaker_id ) # speaker의 인덱스에 따라 speaker_embed_table값 리턴 (Tensor) ############################################################## 추가설명 필요 if hp.model_type == 'deepvoice': # deepvoice일때 if hp.speaker_embedding_size == 1: # hparams의 speaker_embedding_size값이 1일때 before_highway = get_embed( # def get_embed(inputs, num_inputs, embed_size, name): speaker_id, self. num_speakers, # speaker_id의 인덱스에 따라 embed_table값 리턴 hp.enc_prenet_sizes[-1], "before_highway") encoder_rnn_init_state = get_embed( speaker_id, self.num_speakers, hp.enc_rnn_size * 2, "encoder_rnn_init_state") attention_rnn_init_state = get_embed( speaker_id, self.num_speakers, hp.attention_state_size, "attention_rnn_init_state") decoder_rnn_init_states = [get_embed( speaker_id, self.num_speakers, hp.dec_rnn_size, "decoder_rnn_init_states{}".format(idx + 1)) \ for idx in range(hp.dec_layer_num)] ############################################################## else: # hparams의 speaker_embedding_size값이 1이 아닐때 deep_dense = lambda x, dim: \ tf.layers.dense(x, dim, activation=tf.nn.softsign) # input:x, units:dim, 활성화함수로 softsign사용 # lambda함수 예제 (lambda x,y: x + y)(10, 20) =>> 30 # tf.layers.dense( inputs, units, activation) # inputs는 앞의 레이어를 정의 # units는 이 레이어에 크기를 정의 # 마지막으로 activation은 sigmoid나,ReLu와 같은 Activation 함수 # dense는 히든레이어를 구현하는 함수이다. # https://bcho.tistory.com/1196 before_highway = deep_dense( speaker_embed, hp.enc_prenet_sizes[-1] ) # 앞 레이어 : speaker_embed 레이어 수 : hp.enc_prenet_sizes[-1] (기본값 128) encoder_rnn_init_state = deep_dense( speaker_embed, hp.enc_rnn_size * 2 ) # 앞 레이어 : speaker_embed 레이어 수 : hp.enc_rnn_size * 2 (기본값 128 * 2) attention_rnn_init_state = deep_dense( speaker_embed, hp.attention_state_size ) # 앞 레이어 : speaker_embed 레이어 수 : hp.attention_state_size (기본값 256) decoder_rnn_init_states = [ deep_dense(speaker_embed, hp.dec_rnn_size) for _ in range(hp.dec_layer_num) ] # hp.dec_layer_num 수만큼 (기본값 2) 레이어 list speaker_embed = None # deepvoice does not use speaker_embed directly 딥보이스는 speaker_embed를 바로 사용하지 않는다. 
elif hp.model_type == 'simple': # when model_type is 'simple' rather than 'deepvoice' before_highway = None encoder_rnn_init_state = None attention_rnn_init_state = None decoder_rnn_init_states = None # no speaker-conditioned init layers at all else: raise Exception( " [!] Unknown multi-speaker model type: {}".format( hp.model_type) ) # error out on an unsupported multi-speaker model type else: # single-speaker case speaker_embed = None before_highway = None encoder_rnn_init_state = None attention_rnn_init_state = None decoder_rnn_init_states = None # no speaker-conditioned init layers at all ############## # Encoder (special characters and Hangul jamo text to numbers) ############## # [N, T_in, enc_prenet_sizes[-1]] prenet_outputs = prenet( char_embedded_inputs, is_training, # hp.enc_prenet_sizes, hp.dropout_prob, scope='prenet') encoder_outputs = cbhg( prenet_outputs, input_lengths, is_training, hp.enc_bank_size, hp.enc_bank_channel_size, hp.enc_maxpool_width, hp.enc_highway_depth, hp.enc_rnn_size, hp.enc_proj_sizes, hp.enc_proj_width, scope="encoder_cbhg", before_highway=before_highway, encoder_rnn_init_state=encoder_rnn_init_state) ############## # Attention (important!) ############## # For manual control of attention self.is_manual_attention = tf.placeholder( tf.bool, shape=(), name='is_manual_attention', ) self.manual_alignments = tf.placeholder( tf.float32, shape=[None, None, None], name="manual_alignments", ) dec_prenet_outputs = DecoderPrenetWrapper( GRUCell(hp.attention_state_size), speaker_embed, is_training, hp.dec_prenet_sizes, hp.dropout_prob) if hp.attention_type == 'bah_mon': attention_mechanism = BahdanauMonotonicAttention( hp.attention_size, encoder_outputs) elif hp.attention_type == 'bah_norm': attention_mechanism = BahdanauAttention(hp.attention_size, encoder_outputs, normalize=True) elif hp.attention_type == 'luong_scaled': attention_mechanism = LuongAttention(hp.attention_size, encoder_outputs, scale=True) elif hp.attention_type == 'luong': attention_mechanism = LuongAttention(hp.attention_size, encoder_outputs) elif hp.attention_type == 'bah': attention_mechanism = BahdanauAttention( hp.attention_size, encoder_outputs) elif hp.attention_type.startswith('ntm2'): shift_width = int(hp.attention_type.split('-')[-1]) attention_mechanism = NTMAttention2(hp.attention_size, encoder_outputs, shift_width=shift_width) else: raise Exception(" [!] Unknown attention type: {}".format( hp.attention_type)) attention_cell = AttentionWrapper( dec_prenet_outputs, attention_mechanism, self.is_manual_attention, self.manual_alignments, initial_cell_state=attention_rnn_init_state, alignment_history=True, output_attention=False) # Concatenate attention context vector and RNN cell output into a 512D vector.
# [N, T_in, attention_size+attention_state_size] concat_cell = ConcatOutputAndAttentionWrapper( attention_cell, embed_to_concat=speaker_embed) # Decoder (layers specified bottom to top): cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)] for _ in range(hp.dec_layer_num): cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size))) # [N, T_in, 256] decoder_cell = MultiRNNCell(cells, state_is_tuple=True) # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper( decoder_cell, hp.num_mels * hp.reduction_factor) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) if hp.model_type == "deepvoice": # decoder_init_state[0] : AttentionWrapperState # = cell_state + attention + time + alignments + alignment_history # decoder_init_state[0][0] = attention_rnn_init_state (already applied) decoder_init_state = list(decoder_init_state) for idx, cell in enumerate(decoder_rnn_init_states): shape1 = decoder_init_state[idx + 1].get_shape().as_list() shape2 = cell.get_shape().as_list() if shape1 != shape2: raise Exception(" [!] Shape {} and {} should be equal". \ format(shape1, shape2)) decoder_init_state[idx + 1] = cell decoder_init_state = tuple(decoder_init_state) if is_training: helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.reduction_factor, rnn_decoder_test_mode) else: helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor) (decoder_outputs, _), final_decoder_state, _ = \ tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out, M] mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels]) # Add post-processing CBHG: # [N, T_out, 256] #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training) post_outputs = cbhg(mel_outputs, None, is_training, hp.post_bank_size, hp.post_bank_channel_size, hp.post_maxpool_width, hp.post_highway_depth, hp.post_rnn_size, hp.post_proj_sizes, hp.post_proj_width, scope='post_cbhg') if speaker_embed is not None and hp.model_type == 'simple': expanded_speaker_emb = tf.expand_dims(speaker_embed, [1]) tiled_speaker_embedding = tf.tile( expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1]) # [N, T_out, 256 + alpha] post_outputs = \ tf.concat([tiled_speaker_embedding, post_outputs], axis=-1) linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.speaker_id = speaker_id self.input_lengths = input_lengths self.loss_coeff = loss_coeff self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets self.final_decoder_state = final_decoder_state log('=' * 40) log(' model_type: %s' % hp.model_type) log('=' * 40) log('Initialized Tacotron model. 
Dimensions: ') log(' embedding: %d' % char_embedded_inputs.shape[-1]) if speaker_embed is not None: log(' speaker embedding: %d' % speaker_embed.shape[-1]) else: log(' speaker embedding: None') log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.reduction_factor, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
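# Illustrative sketch of the deep_dense (dense + softsign) projections used above to turn a speaker
# embedding into per-site initial states; weights and sizes are random stand-ins, not the model's:
import numpy as np

def softsign(x):
    return x / (1.0 + np.abs(x))

def deep_dense(x, dim, rng=np.random):
    # Stand-in for tf.layers.dense(x, dim, activation=tf.nn.softsign): one affine layer + softsign.
    W, b = rng.randn(x.shape[-1], dim), np.zeros(dim)
    return softsign(x @ W + b)

speaker_embed = np.random.randn(1, 16)                       # hypothetical 16-dim speaker embedding
before_highway = deep_dense(speaker_embed, 128)
encoder_rnn_init_state = deep_dense(speaker_embed, 128 * 2)
attention_rnn_init_state = deep_dense(speaker_embed, 256)
decoder_rnn_init_states = [deep_dense(speaker_embed, 256) for _ in range(2)]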
def initialize( self, inputs, input_lengths, num_speakers, speaker_id, mel_targets=None, linear_targets=None, loss_coeff=None, rnn_decoder_test_mode=False, is_randomly_initialized=False, ): is_training2 = linear_targets is not None # this also ends up True in test mode -- is that intended??? is_training = not rnn_decoder_test_mode self.is_randomly_initialized = is_randomly_initialized with tf.variable_scope('inference') as scope: hp = self._hparams batch_size = tf.shape(inputs)[0] # Embeddings(256) char_embed_table = tf.get_variable( 'embedding', [len(symbols), hp.embedding_size], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) zero_pad = True if zero_pad: # Logic borrowed from a Transformer implementation: # the embedding of <PAD> (index 0) is fixed to zeros and is not updated by training, i.e. the first row (<PAD>) of the variable created by get_variable above is never used. char_embed_table = tf.concat( (tf.zeros(shape=[1, hp.embedding_size]), char_embed_table[1:, :]), 0) # [N, T_in, embedding_size] char_embedded_inputs = tf.nn.embedding_lookup( char_embed_table, inputs) self.num_speakers = num_speakers if self.num_speakers > 1: if hp.speaker_embedding_size != 1: # speaker_embedding_size = f(16) speaker_embed_table = tf.get_variable( 'speaker_embedding', [self.num_speakers, hp.speaker_embedding_size], dtype=tf.float32, initializer=tf.truncated_normal_initializer( stddev=0.5)) # [N, T_in, speaker_embedding_size] speaker_embed = tf.nn.embedding_lookup( speaker_embed_table, speaker_id) if hp.model_type == 'deepvoice': if hp.speaker_embedding_size == 1: before_highway = get_embed( speaker_id, self.num_speakers, hp.enc_prenet_sizes[-1], "before_highway" ) # 'enc_prenet_sizes': [f(256), f(128)] encoder_rnn_init_state = get_embed( speaker_id, self.num_speakers, hp.enc_rnn_size * 2, "encoder_rnn_init_state") attention_rnn_init_state = get_embed( speaker_id, self.num_speakers, hp.attention_state_size, "attention_rnn_init_state") decoder_rnn_init_states = [ get_embed( speaker_id, self.num_speakers, hp.dec_rnn_size, "decoder_rnn_init_states{}".format(idx + 1)) for idx in range(hp.dec_layer_num) ] else: deep_dense = lambda x, dim: tf.layers.dense( x, dim, activation=tf.nn.softsign ) # softsign: x / (abs(x) + 1) before_highway = deep_dense(speaker_embed, hp.enc_prenet_sizes[-1]) encoder_rnn_init_state = deep_dense( speaker_embed, hp.enc_rnn_size * 2) attention_rnn_init_state = deep_dense( speaker_embed, hp.attention_state_size) decoder_rnn_init_states = [ deep_dense(speaker_embed, hp.dec_rnn_size) for _ in range(hp.dec_layer_num) ] speaker_embed = None # deepvoice does not use speaker_embed directly elif hp.model_type == 'simple': # the simple model instead feeds speaker_embed into DecoderPrenetWrapper and ConcatOutputAndAttentionWrapper and concatenates it there. before_highway = None encoder_rnn_init_state = None attention_rnn_init_state = None decoder_rnn_init_states = None else: raise Exception( " [!] Unknown multi-speaker model type: {}".format( hp.model_type)) else: # when self.num_speakers == 1 speaker_embed = None before_highway = None encoder_rnn_init_state = None # init state of the bidirectional GRU attention_rnn_init_state = None decoder_rnn_init_states = None ############## # Encoder ############## # [N, T_in, enc_prenet_sizes[-1]] prenet_outputs = prenet( char_embedded_inputs, is_training, hp.enc_prenet_sizes, hp.dropout_prob, scope='prenet' ) # 'enc_prenet_sizes': [f(256), f(128)], dropout_prob = 0.5 # ==> (N, T_in, 128) # enc_rnn_size = 128 encoder_outputs = cbhg( prenet_outputs, input_lengths, is_training, hp.enc_bank_size, hp.enc_bank_channel_size, hp.enc_maxpool_width, hp.enc_highway_depth, hp.enc_rnn_size, hp.enc_proj_sizes, hp.enc_proj_width, scope="encoder_cbhg", before_highway=before_highway, encoder_rnn_init_state=encoder_rnn_init_state) ############## # Attention ############## # For manual control of attention self.is_manual_attention = tf.placeholder( tf.bool, shape=(), name='is_manual_attention', ) self.manual_alignments = tf.placeholder( tf.float32, shape=[None, None, None], name="manual_alignments", ) # single: attention_size = 128 if hp.attention_type == 'bah_mon': attention_mechanism = BahdanauMonotonicAttention( hp.attention_size, encoder_outputs, memory_sequence_length=input_lengths, normalize=False) elif hp.attention_type == 'bah_mon_norm': # added by hccho attention_mechanism = BahdanauMonotonicAttention( hp.attention_size, encoder_outputs, memory_sequence_length=input_lengths, normalize=True) elif hp.attention_type == 'loc_sen': # Location Sensitive Attention attention_mechanism = LocationSensitiveAttention( hp.attention_size, encoder_outputs, memory_sequence_length=input_lengths) elif hp.attention_type == 'gmm': # GMM Attention attention_mechanism = GmmAttention( hp.attention_size, memory=encoder_outputs, memory_sequence_length=input_lengths) elif hp.attention_type == 'bah_mon_norm_hccho': attention_mechanism = BahdanauMonotonicAttention_hccho( hp.attention_size, encoder_outputs, normalize=True) elif hp.attention_type == 'bah_norm': attention_mechanism = BahdanauAttention( hp.attention_size, encoder_outputs, memory_sequence_length=input_lengths, normalize=True) elif hp.attention_type == 'luong_scaled': attention_mechanism = LuongAttention( hp.attention_size, encoder_outputs, memory_sequence_length=input_lengths, scale=True) elif hp.attention_type == 'luong': attention_mechanism = LuongAttention( hp.attention_size, encoder_outputs, memory_sequence_length=input_lengths) elif hp.attention_type == 'bah': attention_mechanism = BahdanauAttention( hp.attention_size, encoder_outputs, memory_sequence_length=input_lengths) else: raise Exception(" [!] Unknown attention type: {}".format( hp.attention_type)) # Combine DecoderPrenetWrapper and attention_mechanism into an AttentionWrapper. # carpedm20 re-implemented AttentionWrapper from the TensorFlow source, whereas Keith Ito used the stock TensorFlow AttentionWrapper as-is. attention_cell = AttentionWrapper( GRUCell(hp.attention_state_size), attention_mechanism, self.is_manual_attention, self.manual_alignments, initial_cell_state=attention_rnn_init_state, alignment_history=True, output_attention=False ) # note output_attention=False; attention_layer_size is not set, so the attention equals the context vector.
# attention_state_size = 256 dec_prenet_outputs = DecoderPrenetWrapper( attention_cell, speaker_embed, is_training, hp.dec_prenet_sizes, hp.dropout_prob) # dec_prenet_sizes = [f(256), f(128)] # Concatenate attention context vector and RNN cell output into a 512D vector. # [N, T_in, attention_size+attention_state_size] #dec_prenet_outputs의 다음 cell에 전달하는 AttentionWrapperState의 member (attention,cell_state, ...)에서 attention과 output을 concat하여 output으로 내보낸다. # output이 output은 cell_state와 같기 때문에, concat [ output(=cell_state) | attention ] concat_cell = ConcatOutputAndAttentionWrapper( dec_prenet_outputs, embed_to_concat=speaker_embed ) # concat(output,attention,speaker_embed)해서 새로운 output을 만든다. # Decoder (layers specified bottom to top): dec_rnn_size= 256 cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size) ] # OutputProjectionWrapper는 논문에 언급이 없는 것 같은데... for _ in range(hp.dec_layer_num): # hp.dec_layer_num = 2 cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size))) # [N, T_in, 256] decoder_cell = MultiRNNCell(cells, state_is_tuple=True) # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper( decoder_cell, hp.num_mels * hp.reduction_factor ) # 여기에 stop token도 나올 수 있도록...수정하면 되지 않을까??? (hp.num_mels+1) * hp.reduction_factor decoder_init_state = output_cell.zero_state( batch_size=batch_size, dtype=tf.float32 ) # 여기서 zero_state를 부르면, 위의 AttentionWrapper에서 이미 넣은 준 값도 포함되어 있다. if hp.model_type == "deepvoice": # decoder_init_state[0] : AttentionWrapperState # = cell_state + attention + time + alignments + alignment_history # decoder_init_state[0][0] = attention_rnn_init_state (already applied: AttentionWrapper의 initial_cell_state를 이미 넣어 주었다. ) decoder_init_state = list(decoder_init_state) for idx, cell in enumerate(decoder_rnn_init_states): shape1 = decoder_init_state[idx + 1].get_shape().as_list() shape2 = cell.get_shape().as_list() if shape1 != shape2: raise Exception( " [!] Shape {} and {} should be equal".format( shape1, shape2)) decoder_init_state[idx + 1] = cell decoder_init_state = tuple(decoder_init_state) if is_training2: # rnn_decoder_test_mode = True if test mode, train mode에서는 False helper = TacoTrainingHelper( inputs, mel_targets, hp.num_mels, hp.reduction_factor, rnn_decoder_test_mode) # inputs은 batch_size 계산에만 사용됨 else: helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor) (decoder_outputs, _), final_decoder_state, _ = \ tf.contrib.seq2seq.dynamic_decode(BasicDecoder(output_cell, helper, decoder_init_state),maximum_iterations=hp.max_iters) # max_iters=200 # [N, T_out, M] mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels]) # Add post-processing CBHG: # [N, T_out, 256] #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training) post_outputs = cbhg(mel_outputs, None, is_training, hp.post_bank_size, hp.post_bank_channel_size, hp.post_maxpool_width, hp.post_highway_depth, hp.post_rnn_size, hp.post_proj_sizes, hp.post_proj_width, scope='post_cbhg') if speaker_embed is not None and hp.model_type == 'simple': expanded_speaker_emb = tf.expand_dims(speaker_embed, [1]) tiled_speaker_embedding = tf.tile( expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1]) # [N, T_out, 256 + alpha] post_outputs = tf.concat( [tiled_speaker_embedding, post_outputs], axis=-1) linear_outputs = tf.layers.dense( post_outputs, hp.num_freq) # [N, T_out, F(1025)] # Grab alignments from the final decoder state: # MultiRNNCell이 3단이기 때문에, final_decoder_state는 len 3 tuple이다. 
==> final_decoder_state[0] alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0 ]) # batch_size, text length(encoder), target length(decoder) self.inputs = inputs self.speaker_id = speaker_id self.input_lengths = input_lengths self.loss_coeff = loss_coeff self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets self.final_decoder_state = final_decoder_state log('=' * 40) log(' model_type: %s' % hp.model_type) log('=' * 40) log('Initialized Tacotron model. Dimensions: ') log(' embedding: %d' % char_embedded_inputs.shape[-1]) if speaker_embed is not None: log(' speaker embedding: %d' % speaker_embed.shape[-1]) else: log(' speaker embedding: None') log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.reduction_factor, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
def initialize(self, inputs, inputs_jp, input_lengths, input_jp_lengths, mel_targets=None, linear_targets=None): '''Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. ''' with tf.variable_scope('inference') as scope: is_training = linear_targets is not None batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings # [N, T_in, embed_depth=256] # Encoder #prenet_outputs = prenet(inputs, is_training, hp.prenet_depths) # [N, T_in, prenet_depths[-1]=128] encoder_outputs = encoder_cbhg( inputs, input_lengths, is_training, # [N, T_in, encoder_depth=256] hp.encoder_depth) # print(inputs_jp.eval) # print(inputs.eval) # print(input_jp_lengths.eval) # print(input_lengths.eval) encoder_outputs_jp = encoder_cbhg_jp( inputs_jp, input_lengths, is_training, # [N, T_in, encoder_depth=256] hp.encoder_depth) # Attention attention_cell = AttentionWrapper( DecoderPrenetWrapper(GRUCell(hp.attention_depth), is_training, hp.prenet_depths), BahdanauAttention(hp.attention_depth, encoder_outputs), alignment_history=True, output_attention=False) # [N, T_in, attention_depth=256] # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector. concat_cell = ConcatOutputAndAttentionWrapper( attention_cell) # [N, T_in, 2*attention_depth=512] # Attention JP attention_cell_jp = AttentionWrapper( DecoderPrenetWrapper(GRUCell(hp.attention_depth), is_training, hp.prenet_depths), BahdanauAttention(hp.attention_depth, encoder_outputs_jp), alignment_history=True, output_attention=False) # [N, T_in, attention_depth=256] # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector. concat_cell_jp = ConcatOutputAndAttentionWrapper( attention_cell_jp) # [N, T_in, 2*attention_depth=512] # 以上复制一份,对应修改为日语特征输入,记新的 concat_cell为concat_cell_jp,新增一行连接两个输出 # # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector. 
print(type(concat_cell)) print(concat_cell_jp.output_size) encoder_out = tf.concat([concat_cell, concat_cell_jp], axis=-1) #connect chinese_outputs and japanese_outputs # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell( [ OutputProjectionWrapper(encoder_out, hp.decoder_depth), ResidualWrapper(GRUCell(hp.decoder_depth)), ResidualWrapper(GRUCell(hp.decoder_depth)) ], state_is_tuple=True) # [N, T_in, decoder_depth=256] # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper( decoder_cell, hp.num_mels * hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) if is_training: helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step) else: helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry mel_outputs = tf.reshape( decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M] # Add post-processing CBHG: post_outputs = post_cbhg( mel_outputs, hp.num_mels, is_training, # [N, T_out, postnet_depth=256] hp.postnet_depth) linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.inputs_jp = inputs_jp self.input_lengths = input_lengths self.input_jp_lengths = input_jp_lengths self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets log('Initialized Tacotron model. Dimensions: ') #log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' encoder out jp: %d' % encoder_outputs_jp.shape[-1]) log(' attention out: %d' % attention_cell.output_size) log(' attention out jp: %d' % attention_cell_jp.output_size) log(' concat attn & out: %d' % concat_cell.output_size) log(' concat attn & out jp: %d' % concat_cell_jp.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
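# Illustrative sketch only: joining the Chinese and Japanese streams per time step along the feature
# axis (which appears to be the intent of the concat above), with made-up sizes rather than the
# wrapper cells used in the graph:
import numpy as np

N, T, D = 2, 30, 512                     # hypothetical batch, decoder steps, per-stream width
zh_outputs = np.random.randn(N, T, D)    # Chinese attention + RNN output stream
jp_outputs = np.random.randn(N, T, D)    # Japanese attention + RNN output stream

joint = np.concatenate([zh_outputs, jp_outputs], axis=-1)   # [N, T, 2*D] fed onward to the decoder
assert joint.shape == (N, T, 2 * D)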
def embedding_attention_seq2seq_context(encoder_inputs, decoder_inputs, cell, num_encoder_symbols, num_decoder_symbols, embedding_size, fc_context_length, num_heads=1, output_projection=None, feed_previous=False, dtype=dtypes.float32, scope=None): """A seq2seq architecture with two encoders, one for context, one for input DA. The decoder uses twice the cell size. Code adapted from TensorFlow examples.""" with vs.variable_scope(scope or "embedding_attention_seq2seq_context"): # split context and real inputs into separate vectors # Context input is not a sequence anymore. context_inputs = encoder_inputs[0:fc_context_length] encoder_inputs = encoder_inputs[fc_context_length:] # build separate encoders encoder_cell = EmbeddingWrapper(cell, num_encoder_symbols, embedding_size) with vs.variable_scope("context_rnn") as scope: temp = tf.reshape(context_inputs, [-1, fc_context_length]) #30]) context_states = tf.cast(tf.layers.dense(temp, units=cell.output_size), dtype=dtype) context_outputs = tf.nn.relu(context_states, name="context_output") # context_outputs, context_states = tf06s2s.rnn( # encoder_cell, context_inputs, dtype=dtype, scope=scope) with vs.variable_scope("input_rnn") as scope: encoder_outputs, encoder_states = tf06s2s.rnn(encoder_cell, encoder_inputs, dtype=dtype, scope=scope) # concatenate outputs & states # adding positional arguments and concatenating output, cell and hidden states # encoder_outputs = [array_ops.concat([co, eo], axis=1, name="context-and-encoder-output") # for co, eo in zip(context_outputs, encoder_outputs)] # encoder_states=[(array_ops.concat([c1, c2], axis=1), array_ops.concat([h1, h2], axis=1)) # for (c1, h1), (c2, h2) in zip(context_states, encoder_states)] # Add activations of FC from context as a new output encoder_outputs.insert(0, context_outputs) encoder_states.insert(0, (context_states, context_states)) # calculate a concatenation of encoder outputs to put attention on. top_states = [ array_ops.reshape(e, [-1, 1, cell.output_size]) for e in encoder_outputs ] #cell.output_size * 2 #added positional arguements as it was taking axis to be the values attention_states = array_ops.concat(axis=1, values=top_states) # change the decoder cell to accommodate wider input # TODO this will work for BasicLSTMCell and GRUCell, but not for others #input_size is not a field in TF 1.0.1 cell = type(cell)(num_units=(cell.output_size)) #cell.output_size * 2 # Decoder. output_size = None if output_projection is None: cell = OutputProjectionWrapper(cell, num_decoder_symbols) output_size = num_decoder_symbols if isinstance(feed_previous, bool): return tf06s2s.embedding_attention_decoder( decoder_inputs, encoder_states[-1], attention_states, cell, num_decoder_symbols, embedding_size, num_heads, output_size, output_projection, feed_previous) else: # If feed_previous is a Tensor, we construct 2 graphs and use cond. outputs1, states1 = tf06s2s.embedding_attention_decoder( decoder_inputs, encoder_states[-1], attention_states, cell, num_decoder_symbols, embedding_size, num_heads, output_size, output_projection, True) vs.get_variable_scope().reuse_variables() outputs2, states2 = tf06s2s.embedding_attention_decoder( decoder_inputs, encoder_states[-1], attention_states, cell, num_decoder_symbols, embedding_size, num_heads, output_size, output_projection, False) outputs = control_flow_ops.cond(feed_previous, lambda: outputs1, lambda: outputs2) states = control_flow_ops.cond(feed_previous, lambda: states1, lambda: states2) return outputs, states
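# Illustrative sketch of the context branch above: the flattened context goes through a dense + relu
# ("context_output"), and the result is prepended as one extra "encoder output" that attention can
# attend to. Weights and sizes here are random stand-ins, not the TF variables created by the function:
import numpy as np

batch, fc_context_length, cell_output_size = 4, 6, 32
context_inputs = np.random.randn(batch, fc_context_length)

W, b = np.random.randn(fc_context_length, cell_output_size), np.zeros(cell_output_size)
context_outputs = np.maximum(context_inputs @ W + b, 0.0)          # dense + relu on the context

encoder_outputs = [np.random.randn(batch, cell_output_size) for _ in range(5)]
encoder_outputs.insert(0, context_outputs)                          # context becomes the first "output"
attention_states = np.concatenate(
    [e.reshape(batch, 1, cell_output_size) for e in encoder_outputs], axis=1)
assert attention_states.shape == (batch, 6, cell_output_size)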
def full_decoding(inputs, encoder_outputs, is_training, mel_targets, batch_size=32): # prenet # attention # concat attention # gru gru # output # inside cell_outputs, there's the states that contain the attentions # We make DecoderPrenetWrapper and ConcatAttentionWrapper an RNNCell so that # later on there is a helper function that can do everything for us right away cell_outputs = tf.contrib.seq2seq.AttentionWrapper( DecoderPrenetWrapper(GRUCell(256), is_training=is_training), tf.contrib.seq2seq.BahdanauAttention(256, encoder_outputs, normalize=True, probability_fn=tf.nn.softmax), alignment_history=True, output_attention=False) """ I learn that in AttentionWrapper, if we set alignment_history to true, we will get all the previous alignments If we set the output_attention to false, we will get an output for the cell_output instead of the attention, but the attention will be stored in the state. We do this so we can combine the output the attention. """ output_attention_cell = ConcatAttentionOutputWrapper(cell_outputs) decoding_cells = MultiRNNCell([ OutputProjectionWrapper(output_attention_cell, 256), ResidualWrapper(GRUCell(256)), ResidualWrapper(GRUCell(256)) ]) decoder_outputs = OutputProjectionWrapper(decoding_cells, hp.num_mels * hp.r_frames) decoder_initial_states = decoder_outputs.zero_state(batch_size=batch_size, dtype=tf.float32) if is_training: helper = TrainingHelper(inputs, mel_targets, hp.num_mels, hp.r_frames) else: helper = TestingHelper(batch_size=batch_size, output_dim=hp.num_mels, r=hp.r_frames) (final_decoder_outputs, sample_ids), decoder_states, _ = dynamic_decode( BasicDecoder(decoder_outputs, helper, decoder_initial_states)) mel_outputs = tf.reshape(final_decoder_outputs, shape=[batch_size, -1, hp.num_mels]) post_outputs = cbhg(mel_outputs, 16, projections=[256, hp.num_mels], scope='post_cbhg') linear_outputs = tf.layers.dense(post_outputs, hp.num_freq / 2 + 1) return mel_outputs, linear_outputs
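# Illustrative sketch of the step the comment above describes: with output_attention=False the
# attention context stays in the wrapper state, and the wrapper concatenates it with the cell output
# before the projection to the decoder width. Random stand-in weights, one batch item:
import numpy as np

cell_output = np.random.randn(1, 256)
attention_context = np.random.randn(1, 256)   # kept in the AttentionWrapper state

combined = np.concatenate([cell_output, attention_context], axis=-1)   # [1, 512]
W_proj = np.random.randn(512, 256)
projected = combined @ W_proj                                          # [1, 256], fed to the GRU stack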
def initialize(self, inputs, input_lengths, mel_targets=None, gta=False): """ Initializes the model for inference sets "mel_outputs" and "alignments" fields. Args: - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. """ with tf.variable_scope('inference') as scope: is_training = mel_targets is not None and not gta batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings embedding_table = tf.get_variable( 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) #Encoder enc_conv_outputs = enc_conv_layers(embedded_inputs, is_training, kernel_size=hp.enc_conv_kernel_size, channels=hp.enc_conv_channels) #Paper doesn't specify what to do with final encoder state #So we will simply drop it encoder_outputs, encoder_states = bidirectional_LSTM(enc_conv_outputs, input_lengths, 'encoder_LSTM', is_training=is_training, size=hp.encoder_lstm_units, zoneout=hp.zoneout_rate) #Attention attention_cell = AttentionWrapper( DecoderPrenetWrapper(ZoneoutLSTMCell(hp.attention_dim, is_training, #Separate LSTM for attention mechanism zoneout_factor_cell=hp.zoneout_rate, #based on original tacotron architecture zoneout_factor_output=hp.zoneout_rate), is_training), LocationSensitiveAttention(hp.attention_dim, encoder_outputs), alignment_history=True, output_attention=False, name='attention_cell') #Concat Prenet output with context vector concat_cell = ConcatPrenetAndAttentionWrapper(attention_cell) #Decoder layers (attention pre-net + 2 unidirectional LSTM Cells) decoder_cell = unidirectional_LSTM(concat_cell, is_training, layers=hp.decoder_layers, size=hp.decoder_lstm_units, zoneout=hp.zoneout_rate) #Concat LSTM output with context vector concat_decoder_cell = ConcatLSTMOutputAndAttentionWrapper(decoder_cell) #Projection to mel-spectrogram dimension (times number of outputs per step) (linear transformation) output_cell = OutputProjectionWrapper(concat_decoder_cell, hp.num_mels * hp.outputs_per_step) #Define the helper for our decoder if (is_training or gta) == True: self.helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step) else: self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) #We'll only limit decoder time steps during inference (consult hparams.py to modify the value) max_iterations = None if is_training else hp.max_iters #initial decoder state decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) #Decode (decoder_output, _), final_decoder_state, self.stop_token_loss = dynamic_decode( CustomDecoder(output_cell, self.helper, decoder_init_state), impute_finished=True, #Cut out padded parts (enabled) maximum_iterations=max_iterations) # Reshape outputs to be one output per entry decoder_output = tf.reshape(decoder_output, [batch_size, -1, hp.num_mels]) #Compute residual using post-net residual = postnet(decoder_output, is_training, kernel_size=hp.postnet_kernel_size, channels=hp.postnet_channels) #Project residual to same dimension as mel spectrogram 
projected_residual = projection(residual, shape=hp.num_mels, scope='residual_projection') #Compute the mel spectrogram mel_outputs = decoder_output + projected_residual #Grab alignments from the final decoder state alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.decoder_output = decoder_output self.alignments = alignments self.mel_outputs = mel_outputs self.mel_targets = mel_targets log('Initialized Tacotron model. Dimensions: ') log(' embedding: {}'.format(embedded_inputs.shape)) log(' enc conv out: {}'.format(enc_conv_outputs.shape)) log(' encoder out: {}'.format(encoder_outputs.shape)) log(' decoder out: {}'.format(decoder_output.shape)) log(' residual out: {}'.format(residual.shape)) log(' projected residual out: {}'.format(projected_residual.shape)) log(' mel out: {}'.format(mel_outputs.shape))
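# Illustrative sketch of the residual post-net combination above: the conv post-net output is
# projected back to the mel dimension and added to the coarse decoder output. Made-up sizes and a
# random stand-in for the projection weights:
import numpy as np

N, T_out, num_mels, postnet_channels = 2, 100, 80, 512
decoder_output = np.random.randn(N, T_out, num_mels)
residual = np.random.randn(N, T_out, postnet_channels)   # stand-in for the post-net output

W = np.random.randn(postnet_channels, num_mels)
projected_residual = residual @ W                        # "residual_projection" step
mel_outputs = decoder_output + projected_residual
assert mel_outputs.shape == decoder_output.shape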
def initialize(self, inputs, input_lengths, target_lengths, prefixes=None, speaker_ids=None, mel_targets=None, linear_targets=None): '''Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. ''' with tf.variable_scope('inference') as scope: is_training = linear_targets is not None batch_size = tf.shape(inputs)[0] hp = self._hparams # inputs # [N, T_in, D_input] speaker_embedding_table = tf.get_variable( 'speaker_embedding_table', [hp.num_speakers, hp.speaker_embedding_size], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) speaker_embedding = tf.nn.embedding_lookup( speaker_embedding_table, speaker_ids) # [N, T_in, hp.speaker_embedding_size] deep_dense = lambda x, dim: tf.layers.dense( x, dim, activation=tf.nn.softsign) before_highway = deep_dense(speaker_embedding, 128) encoder_rnn_init_state = deep_dense(speaker_embedding, 128 * 2) attention_rnn_init_state = deep_dense(speaker_embedding, 256) decoder_rnn_init_states = [ deep_dense(speaker_embedding, 256) for _ in range(2) ] # Encoder prenet_outputs = prenet(inputs, is_training) # [N, T_in, 128] encoder_outputs = encoder_cbhg( prenet_outputs, input_lengths, is_training, before_highway=before_highway, encoder_rnn_init_state=encoder_rnn_init_state ) # [N, T_in, 256] # Attention attention_cell = AttentionWrapper( DecoderPrenetWrapper(GRUCell(256), is_training), BahdanauAttention(256, encoder_outputs), initial_cell_state=attention_rnn_init_state, alignment_history=True, output_attention=False) # [N, T_in, 256] # Concatenate attention context vector and RNN cell output into a 512D vector. 
concat_cell = ConcatOutputAndAttentionWrapper( attention_cell) # [N, T_in, 512] # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell([ OutputProjectionWrapper(concat_cell, 256), ResidualWrapper(GRUCell(256)), ResidualWrapper(GRUCell(256)) ], state_is_tuple=True) # [N, T_in, 256] # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper( decoder_cell, hp.num_mels * hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) # initially, decoder_init_state is a tuple, so we firstly convert it into a list, # decoder_init_state[0] is the projection wrapper, its initial state should be zero state # finally, convert list state into tuple decoder_init_state = list(decoder_init_state) for idx, cell in enumerate(decoder_rnn_init_states): decoder_init_state[idx + 1] = cell decoder_init_state = tuple(decoder_init_state) if is_training: helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step) else: helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry mel_outputs = tf.reshape( decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M] # Add post-processing CBHG: post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training) # [N, T_out, 256] linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.speaker_ids = speaker_ids self.inputs = inputs self.input_lengths = input_lengths self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets self.target_lengths = target_lengths self.prefixes = prefixes log('Initialized Tacotron model. Dimensions: ') log(' inputs: %d' % inputs.shape[-1]) log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
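# Illustrative sketch (not the model code above): the MultiRNNCell zero state is a tuple
# with one entry per layer; entry 0 belongs to the projected attention cell and stays at
# its zero state, while entries 1..n are overwritten with the speaker-conditioned vectors.
# Plain Python strings stand in for the real RNN state tensors here.
zero_state = ('attention_cell_zero_state', 'layer_1_zero', 'layer_2_zero')
decoder_rnn_init_states = ['speaker_dense_1', 'speaker_dense_2']  # hypothetical tensors

init_state = list(zero_state)                  # tuples are immutable, so copy to a list
for idx, state in enumerate(decoder_rnn_init_states):
    init_state[idx + 1] = state                # skip index 0 (the attention cell)
init_state = tuple(init_state)                 # MultiRNNCell expects a tuple again

assert init_state == ('attention_cell_zero_state', 'speaker_dense_1', 'speaker_dense_2')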
def initialize(self, text_inputs, input_lengths, speaker_ids, mel_targets=None, linear_targets=None): '''Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: text_inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. speaker_ids: int32 Tensor containing ids of specific speakers mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. ''' with tf.variable_scope('inference'): is_training = linear_targets is not None batch_size = tf.shape(text_inputs)[0] hp = self._hparams vocab_size = len(symbols) embedded_inputs = embedding(text_inputs, vocab_size, hp.embedding_dim) # [N, T_in, embd_size] # extract speaker embedding if multi-speaker with tf.variable_scope('speaker'): if hp.num_speakers > 1: speaker_embedding = tf.get_variable('speaker_embed', shape=(hp.num_speakers, hp.speaker_embed_dim), dtype=tf.float32) # TODO: what about special initializer=tf.truncated_normal_initializer(stddev=0.5)? speaker_embd = tf.nn.embedding_lookup(speaker_embedding, speaker_ids) else: speaker_embd = None # Encoder encoder_outputs = conv_and_lstm( embedded_inputs, input_lengths, conv_layers=hp.encoder_conv_layers, conv_width=hp.encoder_conv_width, conv_channels=hp.encoder_conv_channels, lstm_units=hp.encoder_lstm_units, is_training=is_training, scope='encoder') # [N, T_in, 512] # Attention Mechanism attention_cell = attention_decoder(encoder_outputs, hp.attention_dim, input_lengths, is_training, speaker_embd=speaker_embd, attention_type="location_sensitive") # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell([ attention_cell, LSTMBlockCell(hp.decoder_lstm_units), LSTMBlockCell(hp.decoder_lstm_units)], state_is_tuple=True) # [N, T_in, 1024] # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) if is_training: helper = TacoTrainingHelper(text_inputs, mel_targets, hp.num_mels, hp.outputs_per_step) else: helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry decoder_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M] # Postnet: predicts a residual postnet_outputs = postnet( decoder_outputs, layers=hp.postnet_conv_layers, conv_width=hp.postnet_conv_width, channels=hp.postnet_conv_channels, is_training=is_training) mel_outputs = decoder_outputs + postnet_outputs # Convert to linear using a similar architecture as the encoder: expand_outputs = conv_and_lstm( mel_outputs, None, conv_layers=hp.expand_conv_layers, conv_width=hp.expand_conv_width, conv_channels=hp.expand_conv_channels, 
lstm_units=hp.expand_lstm_units, is_training=is_training, scope='expand') # [N, T_out, 512] linear_outputs = tf.layers.dense(expand_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final decoder state: # TODO: seems not to work?!? alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = text_inputs self.input_lengths = input_lengths self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets log('Initialized Tacotron model. Dimensions: ') log(' embedding: %d' % embedded_inputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' concat attn & out: %d' % attention_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % postnet_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
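# Illustrative sketch (not the model code above): alignment_history.stack() yields a
# tensor of shape [decoder_steps, N, encoder_steps]; transposing with perm [1, 2, 0]
# gives one [encoder_steps, decoder_steps] attention matrix per batch entry, which is
# the layout the alignment plots expect. NumPy is used only to show the axis reorder.
import numpy as np

decoder_steps, N, encoder_steps = 5, 2, 7
stacked_history = np.random.rand(decoder_steps, N, encoder_steps)

alignments = np.transpose(stacked_history, (1, 2, 0))  # [N, encoder_steps, decoder_steps]
assert alignments.shape == (N, encoder_steps, decoder_steps)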
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None): '''Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. ''' with tf.variable_scope('inference') as scope: is_training = linear_targets is not None batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings embedding_table = tf.get_variable( 'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_inputs = tf.nn.embedding_lookup( embedding_table, inputs) # [N, T_in, embed_depth=256] # Encoder, prenet_size=[256, 128] prenet_outputs = prenet( embedded_inputs, is_training, hp.prenet_size) # [N, T_in, prenet_size[-1]=128] encoder_outputs = encoder_cbhg( prenet_outputs, input_lengths, is_training, # [N, T_in, encoder_output_size=256] output_size=hp.encoder_output_size) # Attention RNN: computes attention between the targets and the encoder output attention_cell = AttentionWrapper( # input_size = 128, output_size = 256 cell=GRUCell(num_units=hp.attention_depth), # output size = attention_depth = 256 # input_size = output_size = 256 attention_mechanism=BahdanauAttention( num_units=hp.attention_depth, memory=encoder_outputs), alignment_history=True, output_attention=False) # [N, T_in, attention_depth=256] # Add a prenet in front of the attention RNN, prenet_size=[256, 128], prenet_output_size=128 attention_cell = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_size) # Concatenate the attention context vector and the RNN cell output concat_cell = ConcatOutputAndAttentionWrapper( attention_cell) # [N, T_in, 2*attention_depth=512] # Decoder RNN: 2 residual RNN layers (layers specified bottom to top): decoder_cell = MultiRNNCell( [ OutputProjectionWrapper( cell=concat_cell, output_size=hp.decoder_depth), # 512 -> 256 ResidualWrapper(cell=GRUCell(hp.decoder_depth)), ResidualWrapper(cell=GRUCell(hp.decoder_depth)) ], state_is_tuple=True) # [N, T_in, decoder_depth=256] # Project onto r mel spectrograms (predict outputs_per_step frames): output_cell = OutputProjectionWrapper( decoder_cell, hp.num_mels * hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) # The helper decides the next-step input and the initial input if is_training: helper = TacoTrainingHelper(inputs=inputs, targets=mel_targets, output_dim=hp.num_mels, r=hp.outputs_per_step) else: helper = TacoTestHelper(batch_size=batch_size, output_dim=hp.num_mels, r=hp.outputs_per_step) # Decode: predict non-overlapping frame groups, e.g. r->(r+1,2r), 2r->(2r+1,3r)....
(decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), # wrap into a decoder maximum_iterations=hp.max_iters) # [N, T_out/r, num_mels*r] # Reshape outputs to be one output per entry mel_outputs = tf.reshape( decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M=80] # Add post-processing CBHG: post_outputs = post_cbhg( mel_outputs, hp.num_mels, is_training, # [N, T_out, postnet_output_size=256] output_size=hp.postnet_output_size) linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F=1025] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets log('Initialized Tacotron model. Dimensions: ') log(' embedding: %d' % embedded_inputs.shape[-1]) log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
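# Illustrative sketch (not the model code above): with outputs_per_step = r, decoder step
# k predicts the r frames [k*r, (k+1)*r) as one flat vector, so consecutive steps cover
# non-overlapping frame groups, as described in the decoding comment above.
r, total_frames = 3, 12                       # made-up example values
frame_groups = [list(range(start, start + r)) for start in range(0, total_frames, r)]
# -> [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]], one group per decoder step
assert all(len(group) == r for group in frame_groups)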
def embedding_attention_seq2seq_context(encoder_inputs, decoder_inputs, cell, num_encoder_symbols, num_decoder_symbols, embedding_size, num_heads=1, output_projection=None, feed_previous=False, dtype=dtypes.float32, scope=None): """A seq2seq architecture with two encoders, one for context, one for input DA. The decoder uses twice the cell size. Code adapted from TensorFlow examples.""" with vs.variable_scope(scope or "embedding_attention_seq2seq_context"): # split context and real inputs into separate vectors context_inputs = encoder_inputs[0:old_div(len(encoder_inputs), 2)] encoder_inputs = encoder_inputs[old_div(len(encoder_inputs), 2):] # build separate encoders encoder_cell = EmbeddingWrapper(cell, num_encoder_symbols, embedding_size) with vs.variable_scope("context_rnn") as scope: context_outputs, context_states = tf06s2s.rnn(encoder_cell, context_inputs, dtype=dtype, scope=scope) with vs.variable_scope("input_rnn") as scope: encoder_outputs, encoder_states = tf06s2s.rnn(encoder_cell, encoder_inputs, dtype=dtype, scope=scope) # concatenate outputs & states # adding positional arguments and concatenating output, cell and hidden states encoder_outputs = [ array_ops.concat([co, eo], axis=1, name="context-and-encoder-output") for co, eo in zip(context_outputs, encoder_outputs) ] encoder_states = [ (array_ops.concat([c1, c2], axis=1), array_ops.concat([h1, h2], axis=1)) for (c1, h1), (c2, h2) in zip(context_states, encoder_states) ] # calculate a concatenation of encoder outputs to put attention on. top_states = [ array_ops.reshape(e, [-1, 1, cell.output_size * 2]) for e in encoder_outputs ] # added positional arguments since these swapped in some TF version attention_states = array_ops.concat(axis=1, values=top_states) # change the decoder cell to accommodate wider input # TODO this will work for BasicLSTMCell and GRUCell, but not for others cell = type(cell)(num_units=(cell.output_size * 2)) # Decoder. output_size = None if output_projection is None: cell = OutputProjectionWrapper(cell, num_decoder_symbols) output_size = num_decoder_symbols if isinstance(feed_previous, bool): return tf06s2s.embedding_attention_decoder( decoder_inputs, encoder_states[-1], attention_states, cell, num_decoder_symbols, embedding_size, num_heads, output_size, output_projection, feed_previous) else: # If feed_previous is a Tensor, we construct 2 graphs and use cond. outputs1, states1 = tf06s2s.embedding_attention_decoder( decoder_inputs, encoder_states[-1], attention_states, cell, num_decoder_symbols, embedding_size, num_heads, output_size, output_projection, True) vs.get_variable_scope().reuse_variables() outputs2, states2 = tf06s2s.embedding_attention_decoder( decoder_inputs, encoder_states[-1], attention_states, cell, num_decoder_symbols, embedding_size, num_heads, output_size, output_projection, False) outputs = control_flow_ops.cond(feed_previous, lambda: outputs1, lambda: outputs2) states = control_flow_ops.cond(feed_previous, lambda: states1, lambda: states2) return outputs, states
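# Illustrative sketch (not the function above): the combined symbol list is split in half
# into context steps and input-DA steps, the two encoders run separately, and their
# per-step outputs are concatenated feature-wise, which is why the decoder cell is rebuilt
# with twice the units. Lists of NumPy arrays stand in for the RNN outputs.
import numpy as np

steps, cell_size = 4, 8                                   # made-up example sizes
encoder_inputs = list(range(2 * steps))                   # context half, then input half
context_inputs, real_inputs = encoder_inputs[:steps], encoder_inputs[steps:]

context_outputs = [np.random.rand(1, cell_size) for _ in context_inputs]
encoder_outputs = [np.random.rand(1, cell_size) for _ in real_inputs]

merged = [np.concatenate([co, eo], axis=1)                # [batch, 2 * cell_size] per step
          for co, eo in zip(context_outputs, encoder_outputs)]
assert merged[0].shape == (1, 2 * cell_size)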
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, pml_targets=None, is_training=False, gta=False, eal=False, locked_alignments=None, logs_enabled=True, flag_trainAlign=False, flag_trainJoint=False, alignScale=1.0, flag_online_eal_eval=False): '''Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder features. Only needed for training. is_training: boolean flag that is set to True during training gta: boolean flag that is set to True when ground truth alignment is required locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this parameter and the attention alignments are locked to these values logs_enabled: boolean flag that defaults to True, if False no construction logs output ''' # fix the alignments shape to (batch_size, encoder_steps, decoder_steps) if not already including # batch dimension locked_alignments_ = locked_alignments self.flag_trainAlign = flag_trainAlign self.flag_trainJoint = flag_trainJoint self.alignScale = alignScale self.flag_online_eal = ( eal and (locked_alignments is None)) or flag_online_eal_eval if locked_alignments_ is not None: if is_training and eal: pass elif np.ndim(locked_alignments_) < 3: locked_alignments_ = np.expand_dims(locked_alignments_, 0) with tf.variable_scope('inference') as scope: batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings embedding_table = tf.get_variable( 'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_inputs = tf.nn.embedding_lookup( embedding_table, inputs) # [N, T_in, embed_depth=256] # Encoder prenet_outputs = prenet( embedded_inputs, is_training, hp.prenet_depths) # [N, T_in, prenet_depths[-1]=128] encoder_outputs = encoder_cbhg( prenet_outputs, input_lengths, is_training, # [N, T_in, encoder_depth=256] hp.encoder_depth) # Attention attention_cell = LockableAttentionWrapper( GRUCell(hp.attention_depth), LocationSensitiveAttention(hp.attention_depth, encoder_outputs), alignment_history=True, locked_alignments=locked_alignments_, output_attention=False, name='attention_wrapper', flag_trainAlign=self.flag_trainAlign, flag_trainJoint=self.flag_trainJoint ) # [N, T_in, attention_depth=256] # Apply prenet before concatenation in AttentionWrapper. prenet_cell = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths) # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector. 
concat_cell = ConcatOutputAndAttentionWrapper( prenet_cell) # [N, T_in, 2*attention_depth=512] # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell( [ OutputProjectionWrapper(concat_cell, hp.decoder_depth), ResidualWrapper(GRUCell(hp.decoder_depth)), ResidualWrapper(GRUCell(hp.decoder_depth)) ], state_is_tuple=True) # [N, T_in, decoder_depth=256] # Project onto r PML feature vectors (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper( decoder_cell, hp.pml_dimension * hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) if gta: helper = TacoTrainingHelper(inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step) elif eal: if self.flag_online_eal: helper_gta = TacoTrainingHelper(inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step) helper_eal = TacoTrainingHelper_EAL( inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step) else: helper = TacoTrainingHelper_EAL(inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step) elif hp.scheduled_sampling: helper = TacoScheduledOutputTrainingHelper( inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step, hp.scheduled_sampling_probability) else: if is_training: log('For training, one of these should be true: gta, eal, hp.scheduled_sampling' ) else: helper = TacoTestHelper(batch_size, hp.pml_dimension, hp.outputs_per_step) if flag_online_eal_eval: helper_gta = helper helper_eal = helper if not self.flag_online_eal: (decoder_outputs, _ ), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, P*r] # Reshape outputs to be one output per entry pml_intermediates = tf.reshape( decoder_outputs, [batch_size, -1, hp.pml_dimension]) # [N, T_out, P] # Add post-processing CBHG: post_outputs = post_cbhg( pml_intermediates, hp.pml_dimension, is_training, # [N, T_out, postnet_depth=256] hp.postnet_depth) pml_outputs = tf.layers.dense( post_outputs, hp.pml_dimension) # [N, T_out, P] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) else: (decoder_outputs, _ ), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper_gta, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, P*r] # Reshape outputs to be one output per entry pml_intermediates = tf.reshape( decoder_outputs, [batch_size, -1, hp.pml_dimension]) # [N, T_out, P] # Add post-processing CBHG: post_outputs = post_cbhg( pml_intermediates, hp.pml_dimension, is_training, # [N, T_out, postnet_depth=256] hp.postnet_depth) pml_outputs = tf.layers.dense( post_outputs, hp.pml_dimension) # [N, T_out, P] # Grab alignments from the final decoder state: locked_alignments_ = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) with tf.variable_scope('inference_eal') as scope: if self.flag_online_eal: # Embeddings embedding_table_eal = tf.get_variable( 'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_inputs_eal = tf.nn.embedding_lookup( embedding_table_eal, inputs) # [N, T_in, embed_depth=256] # Encoder prenet_outputs_eal = prenet( embedded_inputs_eal, is_training, hp.prenet_depths) # [N, T_in, prenet_depths[-1]=128] encoder_outputs_eal = encoder_cbhg( prenet_outputs_eal, input_lengths, is_training, # [N, T_in, encoder_depth=256] hp.encoder_depth) # import pdb; 
pdb.set_trace() # tf.get_variable_scope().reuse_variables() # Attention # tmp = None if flag_online_eal_eval else locked_alignments_ if flag_online_eal_eval: locked_alignments_ = None attention_cell_eal = LockableAttentionWrapper( GRUCell(hp.attention_depth), LocationSensitiveAttention(hp.attention_depth, encoder_outputs_eal), alignment_history=True, locked_alignments=locked_alignments_, output_attention=False, name='attention_wrapper', flag_trainAlign=self.flag_trainAlign, flag_trainJoint=self.flag_trainJoint ) # [N, T_in, attention_depth=256] # Apply prenet before concatenation in AttentionWrapper. prenet_cell_eal = DecoderPrenetWrapper(attention_cell_eal, is_training, hp.prenet_depths) # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector. concat_cell_eal = ConcatOutputAndAttentionWrapper( prenet_cell_eal) # [N, T_in, 2*attention_depth=512] # Decoder (layers specified bottom to top): decoder_cell_eal = MultiRNNCell( [ OutputProjectionWrapper(concat_cell_eal, hp.decoder_depth), ResidualWrapper(GRUCell(hp.decoder_depth)), ResidualWrapper(GRUCell(hp.decoder_depth)) ], state_is_tuple=True) # [N, T_in, decoder_depth=256] # Project onto r PML feature vectors (predict r outputs at each RNN step): output_cell_eal = OutputProjectionWrapper( decoder_cell_eal, hp.pml_dimension * hp.outputs_per_step) decoder_init_state_eal = output_cell.zero_state( batch_size=batch_size, dtype=tf.float32) ( decoder_outputs_eal, _ ), final_decoder_state_eal, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell_eal, helper_eal, decoder_init_state_eal), maximum_iterations=hp.max_iters) # [N, T_out/r, P*r] # Reshape outputs to be one output per entry pml_intermediates_eal = tf.reshape( decoder_outputs_eal, [batch_size, -1, hp.pml_dimension]) # [N, T_out, P] # Add post-processing CBHG: post_outputs_eal = post_cbhg( pml_intermediates_eal, hp.pml_dimension, is_training, # [N, T_out, postnet_depth=256] hp.postnet_depth) pml_outputs_eal = tf.layers.dense( post_outputs_eal, hp.pml_dimension) # [N, T_out, P] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state_eal[0].alignment_history.stack(), [1, 2, 0]) self.pml_intermediates_eal = pml_intermediates_eal self.pml_outputs_eal = pml_outputs_eal with tf.variable_scope('inference') as scope: self.inputs = inputs self.input_lengths = input_lengths self.pml_intermediates = pml_intermediates self.pml_outputs = pml_outputs self.alignments = alignments self.pml_targets = pml_targets self.attention_cell = attention_cell self.locked_alignments = locked_alignments_ if logs_enabled: log('Initialized Tacotron model. Dimensions: ') log(' Train mode: {}'.format(is_training)) log(' GTA mode: {}'.format(gta)) log(' EAL mode: {}'.format(eal)) log(' Embedding: {}'.format( embedded_inputs.shape[-1])) log(' Prenet out: {}'.format( prenet_outputs.shape[-1])) log(' Encoder out: {}'.format( encoder_outputs.shape[-1])) log(' Attention out: {}'.format( attention_cell.output_size)) log(' Concat attn & out: {}'.format( concat_cell.output_size)) log(' Decoder cell out: {}'.format( decoder_cell.output_size)) log(' Decoder out ({} frames): {}'.format( hp.outputs_per_step, decoder_outputs.shape[-1])) log(' Decoder out (1 frame): {}'.format( pml_intermediates.shape[-1])) log(' Postnet out: {}'.format( post_outputs.shape[-1])) log(' PML out: {}'.format( pml_outputs.shape[-1]))
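# Illustrative sketch (not the model code above): "locking" the alignments means feeding
# an externally supplied [N, encoder_steps, decoder_steps] matrix to the attention wrapper
# instead of the weights it would compute itself; the wrapper logic lives in
# LockableAttentionWrapper. This only shows the batch-dimension fix-up applied to
# locked_alignments before it is passed in.
import numpy as np

locked = np.random.rand(7, 5)              # [encoder_steps, decoder_steps], no batch dim
if np.ndim(locked) < 3:
    locked = np.expand_dims(locked, 0)     # -> [1, encoder_steps, decoder_steps]
assert locked.shape == (1, 7, 5)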
def __init__(self, sess, config, api, log_dir, forward, scope=None): self.vocab = api.vocab self.rev_vocab = api.rev_vocab self.vocab_size = len(self.vocab) self.topic_vocab = api.topic_vocab self.topic_vocab_size = len(self.topic_vocab) self.da_vocab = api.dialog_act_vocab self.da_vocab_size = len(self.da_vocab) self.sess = sess self.scope = scope self.max_utt_len = config.max_utt_len self.go_id = self.rev_vocab["<s>"] self.eos_id = self.rev_vocab["</s>"] self.context_cell_size = config.cxt_cell_size self.sent_cell_size = config.sent_cell_size self.dec_cell_size = config.dec_cell_size with tf.name_scope("io"): # all dialog context and known attributes self.input_contexts = tf.placeholder(dtype=tf.int32, shape=(None, None, self.max_utt_len), name="dialog_context") self.floors = tf.placeholder(dtype=tf.int32, shape=(None, None), name="floor") self.context_lens = tf.placeholder(dtype=tf.int32, shape=(None, ), name="context_lens") self.topics = tf.placeholder(dtype=tf.int32, shape=(None, ), name="topics") self.my_profile = tf.placeholder(dtype=tf.float32, shape=(None, 4), name="my_profile") self.ot_profile = tf.placeholder(dtype=tf.float32, shape=(None, 4), name="ot_profile") # target response given the dialog context self.output_tokens = tf.placeholder(dtype=tf.int32, shape=(None, None), name="output_token") self.output_lens = tf.placeholder(dtype=tf.int32, shape=(None, ), name="output_lens") self.output_das = tf.placeholder(dtype=tf.int32, shape=(None, ), name="output_dialog_acts") # optimization related variables self.learning_rate = tf.Variable(float(config.init_lr), trainable=False, name="learning_rate") self.learning_rate_decay_op = self.learning_rate.assign( tf.multiply(self.learning_rate, config.lr_decay)) self.global_t = tf.placeholder(dtype=tf.int32, name="global_t") self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior") max_dialog_len = array_ops.shape(self.input_contexts)[1] max_out_len = array_ops.shape(self.output_tokens)[1] batch_size = array_ops.shape(self.input_contexts)[0] with variable_scope.variable_scope("topicEmbedding"): t_embedding = tf.get_variable( "embedding", [self.topic_vocab_size, config.topic_embed_size], dtype=tf.float32) topic_embedding = embedding_ops.embedding_lookup( t_embedding, self.topics) if config.use_hcf: with variable_scope.variable_scope("dialogActEmbedding"): d_embedding = tf.get_variable( "embedding", [self.da_vocab_size, config.da_embed_size], dtype=tf.float32) da_embedding = embedding_ops.embedding_lookup( d_embedding, self.output_das) with variable_scope.variable_scope("wordEmbedding"): self.embedding = tf.get_variable( "embedding", [self.vocab_size, config.embed_size], dtype=tf.float32) embedding_mask = tf.constant( [0 if i == 0 else 1 for i in range(self.vocab_size)], dtype=tf.float32, shape=[self.vocab_size, 1]) embedding = self.embedding * embedding_mask input_embedding = embedding_ops.embedding_lookup( embedding, tf.reshape(self.input_contexts, [-1])) input_embedding = tf.reshape( input_embedding, [-1, self.max_utt_len, config.embed_size]) output_embedding = embedding_ops.embedding_lookup( embedding, self.output_tokens) if config.sent_type == "bow": input_embedding, sent_size = get_bow(input_embedding) output_embedding, _ = get_bow(output_embedding) elif config.sent_type == "rnn": sent_cell = self.get_rnncell("gru", self.sent_cell_size, config.keep_prob, 1) input_embedding, sent_size = get_rnn_encode(input_embedding, sent_cell, scope="sent_rnn") output_embedding, _ = get_rnn_encode(output_embedding, sent_cell, self.output_lens, 
scope="sent_rnn", reuse=True) elif config.sent_type == "bi_rnn": fwd_sent_cell = self.get_rnncell("gru", self.sent_cell_size, keep_prob=1.0, num_layer=1) bwd_sent_cell = self.get_rnncell("gru", self.sent_cell_size, keep_prob=1.0, num_layer=1) input_embedding, sent_size = get_bi_rnn_encode( input_embedding, fwd_sent_cell, bwd_sent_cell, scope="sent_bi_rnn") output_embedding, _ = get_bi_rnn_encode(output_embedding, fwd_sent_cell, bwd_sent_cell, self.output_lens, scope="sent_bi_rnn", reuse=True) else: raise ValueError( "Unknown sent_type. Must be one of [bow, rnn, bi_rnn]") # reshape input into dialogs input_embedding = tf.reshape(input_embedding, [-1, max_dialog_len, sent_size]) if config.keep_prob < 1.0: input_embedding = tf.nn.dropout(input_embedding, config.keep_prob) # convert floors into 1 hot floor_one_hot = tf.one_hot(tf.reshape(self.floors, [-1]), depth=2, dtype=tf.float32) floor_one_hot = tf.reshape(floor_one_hot, [-1, max_dialog_len, 2]) joint_embedding = tf.concat([input_embedding, floor_one_hot], 2, "joint_embedding") with variable_scope.variable_scope("contextRNN"): enc_cell = self.get_rnncell(config.cell_type, self.context_cell_size, keep_prob=1.0, num_layer=config.num_layer) # and enc_last_state will be same as the true last state _, enc_last_state = tf.nn.dynamic_rnn( enc_cell, joint_embedding, dtype=tf.float32, sequence_length=self.context_lens) if config.num_layer > 1: enc_last_state = tf.concat(enc_last_state, 1) # combine with other attributes if config.use_hcf: attribute_embedding = da_embedding attribute_fc1 = layers.fully_connected(attribute_embedding, 30, activation_fn=tf.tanh, scope="attribute_fc1") cond_list = [ topic_embedding, self.my_profile, self.ot_profile, enc_last_state ] cond_embedding = tf.concat(cond_list, 1) with variable_scope.variable_scope("recognitionNetwork"): if config.use_hcf: recog_input = tf.concat( [cond_embedding, output_embedding, attribute_fc1], 1) else: recog_input = tf.concat([cond_embedding, output_embedding], 1) self.recog_mulogvar = recog_mulogvar = layers.fully_connected( recog_input, config.latent_size * 2, activation_fn=None, scope="muvar") recog_mu, recog_logvar = tf.split(recog_mulogvar, 2, axis=1) with variable_scope.variable_scope("priorNetwork"): # P(XYZ)=P(Z|X)P(X)P(Y|X,Z) prior_fc1 = layers.fully_connected(cond_embedding, np.maximum( config.latent_size * 2, 100), activation_fn=tf.tanh, scope="fc1") prior_mulogvar = layers.fully_connected(prior_fc1, config.latent_size * 2, activation_fn=None, scope="muvar") prior_mu, prior_logvar = tf.split(prior_mulogvar, 2, axis=1) # use sampled Z or posterior Z latent_sample = tf.cond( self.use_prior, lambda: sample_gaussian(prior_mu, prior_logvar), lambda: sample_gaussian(recog_mu, recog_logvar)) with variable_scope.variable_scope("generationNetwork"): gen_inputs = tf.concat([cond_embedding, latent_sample], 1) # BOW loss bow_fc1 = layers.fully_connected(gen_inputs, 400, activation_fn=tf.tanh, scope="bow_fc1") if config.keep_prob < 1.0: bow_fc1 = tf.nn.dropout(bow_fc1, config.keep_prob) self.bow_logits = layers.fully_connected(bow_fc1, self.vocab_size, activation_fn=None, scope="bow_project") # Y loss if config.use_hcf: meta_fc1 = layers.fully_connected(gen_inputs, 400, activation_fn=tf.tanh, scope="meta_fc1") if config.keep_prob < 1.0: meta_fc1 = tf.nn.dropout(meta_fc1, config.keep_prob) self.da_logits = layers.fully_connected(meta_fc1, self.da_vocab_size, scope="da_project") da_prob = tf.nn.softmax(self.da_logits) pred_attribute_embedding = tf.matmul(da_prob, d_embedding) if forward: 
selected_attribute_embedding = pred_attribute_embedding else: selected_attribute_embedding = attribute_embedding dec_inputs = tf.concat( [gen_inputs, selected_attribute_embedding], 1) else: self.da_logits = tf.zeros((batch_size, self.da_vocab_size)) dec_inputs = gen_inputs # Decoder if config.num_layer > 1: dec_init_state = [ layers.fully_connected(dec_inputs, self.dec_cell_size, activation_fn=None, scope="init_state-%d" % i) for i in range(config.num_layer) ] dec_init_state = tuple(dec_init_state) else: dec_init_state = layers.fully_connected(dec_inputs, self.dec_cell_size, activation_fn=None, scope="init_state") with variable_scope.variable_scope("decoder"): dec_cell = self.get_rnncell(config.cell_type, self.dec_cell_size, config.keep_prob, config.num_layer) dec_cell = OutputProjectionWrapper(dec_cell, self.vocab_size) if forward: loop_func = decoder_fn_lib.context_decoder_fn_inference( None, dec_init_state, embedding, start_of_sequence_id=self.go_id, end_of_sequence_id=self.eos_id, maximum_length=self.max_utt_len, num_decoder_symbols=self.vocab_size, context_vector=selected_attribute_embedding) dec_input_embedding = None dec_seq_lens = None else: loop_func = decoder_fn_lib.context_decoder_fn_train( dec_init_state, selected_attribute_embedding) dec_input_embedding = embedding_ops.embedding_lookup( embedding, self.output_tokens) dec_input_embedding = dec_input_embedding[:, 0:-1, :] dec_seq_lens = self.output_lens - 1 if config.keep_prob < 1.0: dec_input_embedding = tf.nn.dropout( dec_input_embedding, config.keep_prob) # apply word dropping. Set dropped word to 0 if config.dec_keep_prob < 1.0: keep_mask = tf.less_equal( tf.random_uniform((batch_size, max_out_len - 1), minval=0.0, maxval=1.0), config.dec_keep_prob) keep_mask = tf.expand_dims(tf.to_float(keep_mask), 2) dec_input_embedding = dec_input_embedding * keep_mask dec_input_embedding = tf.reshape( dec_input_embedding, [-1, max_out_len - 1, config.embed_size]) dec_outs, _, final_context_state = dynamic_rnn_decoder( dec_cell, loop_func, inputs=dec_input_embedding, sequence_length=dec_seq_lens) if final_context_state is not None: final_context_state = final_context_state[:, 0:array_ops. shape(dec_outs)[1]] mask = tf.to_int32(tf.sign(tf.reduce_max(dec_outs, axis=2))) self.dec_out_words = tf.multiply( tf.reverse(final_context_state, axis=[1]), mask) else: self.dec_out_words = tf.argmax(dec_outs, 2) if not forward: with variable_scope.variable_scope("loss"): labels = self.output_tokens[:, 1:] label_mask = tf.to_float(tf.sign(labels)) rc_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=dec_outs, labels=labels) rc_loss = tf.reduce_sum(rc_loss * label_mask, reduction_indices=1) self.avg_rc_loss = tf.reduce_mean(rc_loss) # used only for perpliexty calculation. Not used for optimzation self.rc_ppl = tf.exp( tf.reduce_sum(rc_loss) / tf.reduce_sum(label_mask)) """ as n-trial multimodal distribution. 
""" tile_bow_logits = tf.tile(tf.expand_dims(self.bow_logits, 1), [1, max_out_len - 1, 1]) bow_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=tile_bow_logits, labels=labels) * label_mask bow_loss = tf.reduce_sum(bow_loss, reduction_indices=1) self.avg_bow_loss = tf.reduce_mean(bow_loss) # reconstruct the meta info about X if config.use_hcf: da_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=self.da_logits, labels=self.output_das) self.avg_da_loss = tf.reduce_mean(da_loss) else: self.avg_da_loss = 0.0 kld = gaussian_kld(recog_mu, recog_logvar, prior_mu, prior_logvar) self.avg_kld = tf.reduce_mean(kld) if log_dir is not None: kl_weights = tf.minimum( tf.to_float(self.global_t) / config.full_kl_step, 1.0) else: kl_weights = tf.constant(1.0) self.kl_w = kl_weights self.elbo = self.avg_rc_loss + kl_weights * self.avg_kld aug_elbo = self.avg_bow_loss + self.avg_da_loss + self.elbo tf.summary.scalar("da_loss", self.avg_da_loss) tf.summary.scalar("rc_loss", self.avg_rc_loss) tf.summary.scalar("elbo", self.elbo) tf.summary.scalar("kld", self.avg_kld) tf.summary.scalar("bow_loss", self.avg_bow_loss) self.summary_op = tf.summary.merge_all() self.log_p_z = norm_log_liklihood(latent_sample, prior_mu, prior_logvar) self.log_q_z_xy = norm_log_liklihood(latent_sample, recog_mu, recog_logvar) self.est_marginal = tf.reduce_mean(rc_loss + bow_loss - self.log_p_z + self.log_q_z_xy) self.optimize(sess, config, aug_elbo, log_dir) self.saver = tf.train.Saver(tf.global_variables(), write_version=tf.train.SaverDef.V2)
def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell, num_encoder_symbols, num_decoder_symbols, embedding_size, num_heads=1, output_projection=None, feed_previous=False, dtype=None, scope=None, initial_state_attention=False, copy=False, attn_type="linear"): """Embedding sequence-to-sequence model with attention. This model first embeds encoder_inputs by a newly created embedding (of shape [num_encoder_symbols x input_size]). Then it runs an RNN to encode embedded encoder_inputs into a state vector. It keeps the outputs of this RNN at every step to use for attention later. Next, it embeds decoder_inputs by another newly created embedding (of shape [num_decoder_symbols x input_size]). Then it runs attention decoder, initialized with the last encoder state, on embedded decoder_inputs and attending to encoder outputs. Warning: when output_projection is None, the size of the attention vectors and variables will be made proportional to num_decoder_symbols, can be large. Args: encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. cell: rnn_cell.RNNCell defining the cell function and size. num_encoder_symbols: Integer; number of symbols on the encoder side. num_decoder_symbols: Integer; number of symbols on the decoder side. embedding_size: Integer, the length of the embedding vector for each symbol. num_heads: Number of attention heads that read from attention_states. output_projection: None or a pair (W, B) of output projection weights and biases; W has shape [output_size x num_decoder_symbols] and B has shape [num_decoder_symbols]; if provided and feed_previous=True, each fed previous output will first be multiplied by W and added B. feed_previous: Boolean or scalar Boolean Tensor; if True, only the first of decoder_inputs will be used (the "GO" symbol), and all other decoder inputs will be taken from previous outputs (as in embedding_rnn_decoder). If False, decoder_inputs are used as given (the standard decoder case). dtype: The dtype of the initial RNN state (default: tf.float32). scope: VariableScope for the created subgraph; defaults to "embedding_attention_seq2seq". initial_state_attention: If False (default), initial attentions are zero. If True, initialize the attentions from the initial state and attention states. copy: If True use a copy mechanism in decoding to copy from encoder inputs attn_type: Attn type to use Returns: A tuple of the form (outputs, state), where: outputs: A list of the same length as decoder_inputs of 2D Tensors with shape [batch_size x num_decoder_symbols] containing the generated outputs. state: The state of each decoder cell at the final time-step. It is a 2D Tensor of shape [batch_size x cell.state_size]. """ with variable_scope.variable_scope(scope or "embedding_attention_seq2seq", dtype=dtype) as scope: dtype = scope.dtype # Encoder. encoder_cell = EmbeddingWrapper(cell, embedding_classes=num_encoder_symbols, embedding_size=embedding_size) encoder_outputs, encoder_state = rnn.static_rnn(encoder_cell, encoder_inputs, dtype=dtype) # First calculate a concatenation of encoder outputs to put attention on. top_states = [ array_ops.reshape(e, [-1, 1, cell.output_size]) for e in encoder_outputs ] attention_states = array_ops.concat(top_states, 1) # Decoder. 
output_size = None if output_projection is None: cell = OutputProjectionWrapper(cell, num_decoder_symbols) output_size = num_decoder_symbols # Modify num_decoder symbols to include len of src if isinstance(feed_previous, bool): return embedding_attention_decoder( decoder_inputs, encoder_inputs, encoder_state, attention_states, cell, num_decoder_symbols, embedding_size, num_heads=num_heads, output_size=output_size, output_projection=output_projection, feed_previous=feed_previous, initial_state_attention=initial_state_attention, copy=copy, attn_type=attn_type) def decoder(feed_previous_bool): reuse = None if feed_previous_bool else True with variable_scope.variable_scope( variable_scope.get_variable_scope(), reuse=reuse) as scope: outputs, state = embedding_attention_decoder( decoder_inputs, encoder_state, attention_states, cell, num_decoder_symbols, embedding_size, num_heads=num_heads, output_size=output_size, output_projection=output_projection, feed_previous=feed_previous_bool, update_embedding_for_previous=False, initial_state_attention=initial_state_attention) state_list = [state] if nest.is_sequence(state): state_list = nest.flatten(state) return outputs + state_list outputs_and_state = control_flow_ops.cond(feed_previous, lambda: decoder(True), lambda: decoder(False)) outputs_len = len( decoder_inputs) # Outputs length same as decoder inputs. state_list = outputs_and_state[outputs_len:] state = state_list[0] if nest.is_sequence(encoder_state): state = nest.pack_sequence_as(structure=encoder_state, flat_sequence=state_list) return outputs_and_state[:outputs_len], state
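# Illustrative sketch (not the function above): static_rnn returns one [batch, cell_size]
# output per time step, while the attention mechanism needs a single [batch, T, cell_size]
# tensor, hence the reshape to [batch, 1, cell_size] and the concat along the time axis.
# NumPy stands in for the TF ops.
import numpy as np

batch, T, cell_size = 2, 5, 8                            # made-up example sizes
encoder_outputs = [np.random.rand(batch, cell_size) for _ in range(T)]

top_states = [e.reshape(batch, 1, cell_size) for e in encoder_outputs]
attention_states = np.concatenate(top_states, axis=1)    # [batch, T, cell_size]
assert attention_states.shape == (batch, T, cell_size)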
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, pml_targets=None, gta=False, locked_alignments=None, logs_enabled=True): '''Initializes the model for inference. Sets "pml_outputs", and "alignments" fields. Args: inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder features. Only needed for training. gta: boolean flag that is set to True when ground truth alignment is required locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this parameter and the attention alignments are locked to these values logs_enabled: boolean flag that defaults to True, if False no construction logs output ''' # fix the alignments shape to (batch_size, encoder_steps, decoder_steps) if not already including # batch dimension locked_alignments_ = locked_alignments if locked_alignments_ is not None: if np.ndim(locked_alignments_) < 3: locked_alignments_ = np.expand_dims(locked_alignments_, 0) with tf.variable_scope('inference') as scope: is_training = pml_targets is not None batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings embedding_table = tf.get_variable( 'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_inputs = tf.nn.embedding_lookup( embedding_table, inputs) # [N, T_in, embed_depth=256] # Encoder prenet_outputs = prenet( embedded_inputs, is_training, hp.prenet_depths) # [N, T_in, prenet_depths[-1]=128] encoder_outputs = encoder_cbhg( prenet_outputs, input_lengths, is_training, # [N, T_in, encoder_depth=256] hp.encoder_depth) # Attention attention_mechanism = BahdanauAttention(hp.attention_depth, encoder_outputs) attention_cell = LockableAttentionWrapper( GRUCell(hp.attention_depth), attention_mechanism, alignment_history=True, locked_alignments=locked_alignments_, output_attention=False, name='attention_wrapper') # [N, T_in, attention_depth=256] # Apply prenet before concatenation in AttentionWrapper. prenet_cell = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths) # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector. 
concat_cell = ConcatOutputAndAttentionWrapper( prenet_cell) # [N, T_in, 2*attention_depth=512] # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell( [ OutputProjectionWrapper(concat_cell, hp.decoder_depth), ResidualWrapper(GRUCell(hp.decoder_depth)), ResidualWrapper(GRUCell(hp.decoder_depth)) ], state_is_tuple=True) # [N, T_in, decoder_depth=256] # Project onto r PML feature vectors (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper( decoder_cell, hp.pml_dimension * hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) if is_training or gta: helper = TacoTrainingHelper(inputs, pml_targets, hp.pml_dimension, hp.outputs_per_step) else: helper = TacoTestHelper(batch_size, hp.pml_dimension, hp.outputs_per_step) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, P*r] # Reshape outputs to be one output per entry pml_outputs = tf.reshape( decoder_outputs, [batch_size, -1, hp.pml_dimension]) # [N, T_out, P] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.pml_outputs = pml_outputs self.alignments = alignments self.pml_targets = pml_targets self.attention_cell = attention_cell if logs_enabled: log('Initialized Tacotron model. Dimensions: ') log(' embedding: %d' % embedded_inputs.shape[-1]) log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % pml_outputs.shape[-1])
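# Illustrative sketch (not the model code above): at inference time the decoder is capped
# at hp.max_iters steps and each step emits outputs_per_step feature vectors, so the
# longest utterance the model can produce is max_iters * outputs_per_step frames. The
# numbers below are made-up examples, not the repository's hyperparameters.
max_iters, outputs_per_step = 200, 5
frame_shift_ms = 12.5                                  # hypothetical hop size

max_frames = max_iters * outputs_per_step              # 1000 frames with these numbers
max_seconds = max_frames * frame_shift_ms / 1000.0     # 12.5 s with these numbers
assert max_frames == 1000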
def initialize( self, inputs, input_lengths, num_speakers, speaker_id, mel_targets=None, linear_targets=None, loss_coeff=None, rnn_decoder_test_mode=False, is_randomly_initialized=False, ): is_training = linear_targets is not None self.is_randomly_initialized = is_randomly_initialized # When get_variable() is used, variables are fetched from within the 'inference' scope with tf.variable_scope('inference') as scope: hp = self._hparams batch_size = tf.shape(inputs)[0] # Embeddings char_embed_table = tf.get_variable( 'embedding', [len(symbols), hp.embedding_size], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) # [N, T_in, embedding_size] char_embedded_inputs = \ tf.nn.embedding_lookup(char_embed_table, inputs) self.num_speakers = num_speakers if self.num_speakers > 1: if hp.speaker_embedding_size != 1: speaker_embed_table = tf.get_variable( 'speaker_embedding', [self.num_speakers, hp.speaker_embedding_size], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) # [N, T_in, speaker_embedding_size] speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id) if hp.model_type == 'deepvoice': if hp.speaker_embedding_size == 1: before_highway = get_embed( speaker_id, self.num_speakers, hp.enc_prenet_sizes[-1], "before_highway") encoder_rnn_init_state = get_embed( speaker_id, self.num_speakers, hp.enc_rnn_size * 2, "encoder_rnn_init_state") attention_rnn_init_state = get_embed( speaker_id, self.num_speakers, hp.attention_state_size, "attention_rnn_init_state") decoder_rnn_init_states = [get_embed( speaker_id, self.num_speakers, hp.dec_rnn_size, "decoder_rnn_init_states{}".format(idx + 1)) \ for idx in range(hp.dec_layer_num)] else: deep_dense = lambda x, dim: \ tf.layers.dense(x, dim, activation=tf.nn.softsign) before_highway = deep_dense( speaker_embed, hp.enc_prenet_sizes[-1]) encoder_rnn_init_state = deep_dense( speaker_embed, hp.enc_rnn_size * 2) attention_rnn_init_state = deep_dense( speaker_embed, hp.attention_state_size) decoder_rnn_init_states = [deep_dense( speaker_embed, hp.dec_rnn_size) for _ in range(hp.dec_layer_num)] speaker_embed = None # deepvoice does not use speaker_embed directly elif hp.model_type == 'simple': before_highway = None encoder_rnn_init_state = None attention_rnn_init_state = None decoder_rnn_init_states = None else: raise Exception(" [!]
Unkown multi-speaker model type: {}".format(hp.model_type)) else: speaker_embed = None before_highway = None encoder_rnn_init_state = None attention_rnn_init_state = None decoder_rnn_init_states = None ############## # Encoder ############## # [N, T_in, enc_prenet_sizes[-1]] prenet_outputs = prenet(char_embedded_inputs, is_training, hp.enc_prenet_sizes, hp.dropout_prob, scope='prenet') encoder_outputs = cbhg( prenet_outputs, input_lengths, is_training, hp.enc_bank_size, hp.enc_bank_channel_size, hp.enc_maxpool_width, hp.enc_highway_depth, hp.enc_rnn_size, hp.enc_proj_sizes, hp.enc_proj_width, scope="encoder_cbhg", before_highway=before_highway, encoder_rnn_init_state=encoder_rnn_init_state) ############## # Attention ############## # For manaul control of attention self.is_manual_attention = tf.placeholder( tf.bool, shape=(), name='is_manual_attention', ) self.manual_alignments = tf.placeholder( tf.float32, shape=[None, None, None], name="manual_alignments", ) dec_prenet_outputs = DecoderPrenetWrapper( GRUCell(hp.attention_state_size), speaker_embed, is_training, hp.dec_prenet_sizes, hp.dropout_prob) if hp.attention_type == 'bah_mon': attention_mechanism = BahdanauMonotonicAttention( hp.attention_size, encoder_outputs) elif hp.attention_type == 'bah_norm': attention_mechanism = BahdanauAttention( hp.attention_size, encoder_outputs, normalize=True) elif hp.attention_type == 'luong_scaled': attention_mechanism = LuongAttention( hp.attention_size, encoder_outputs, scale=True) elif hp.attention_type == 'luong': attention_mechanism = LuongAttention( hp.attention_size, encoder_outputs) elif hp.attention_type == 'bah': attention_mechanism = BahdanauAttention( hp.attention_size, encoder_outputs) elif hp.attention_type.startswith('ntm2'): shift_width = int(hp.attention_type.split('-')[-1]) attention_mechanism = NTMAttention2( hp.attention_size, encoder_outputs, shift_width=shift_width) else: raise Exception(" [!] Unkown attention type: {}".format(hp.attention_type)) attention_cell = AttentionWrapper( dec_prenet_outputs, attention_mechanism, self.is_manual_attention, self.manual_alignments, initial_cell_state=attention_rnn_init_state, alignment_history=True, output_attention=False ) # Concatenate attention context vector and RNN cell output into a 512D vector. # [N, T_in, attention_size+attention_state_size] concat_cell = ConcatOutputAndAttentionWrapper( attention_cell, embed_to_concat=speaker_embed) # Decoder (layers specified bottom to top): cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)] for _ in range(hp.dec_layer_num): cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size))) # [N, T_in, 256] decoder_cell = MultiRNNCell(cells, state_is_tuple=True) # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper( decoder_cell, hp.num_mels * hp.reduction_factor) decoder_init_state = output_cell.zero_state( batch_size=batch_size, dtype=tf.float32) if hp.model_type == "deepvoice": # decoder_init_state[0] : AttentionWrapperState # = cell_state + attention + time + alignments + alignment_history # decoder_init_state[0][0] = attention_rnn_init_state (already applied) decoder_init_state = list(decoder_init_state) for idx, cell in enumerate(decoder_rnn_init_states): shape1 = decoder_init_state[idx + 1].get_shape().as_list() shape2 = cell.get_shape().as_list() if shape1 != shape2: raise Exception(" [!] Shape {} and {} should be equal". 
\ format(shape1, shape2)) decoder_init_state[idx + 1] = cell decoder_init_state = tuple(decoder_init_state) if is_training: helper = TacoTrainingHelper( inputs, mel_targets, hp.num_mels, hp.reduction_factor, rnn_decoder_test_mode) else: helper = TacoTestHelper( batch_size, hp.num_mels, hp.reduction_factor) (decoder_outputs, _), final_decoder_state, _ = \ tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out, M] mel_outputs = tf.reshape( decoder_outputs, [batch_size, -1, hp.num_mels]) # Add post-processing CBHG: # [N, T_out, 256] #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training) post_outputs = cbhg( mel_outputs, None, is_training, hp.post_bank_size, hp.post_bank_channel_size, hp.post_maxpool_width, hp.post_highway_depth, hp.post_rnn_size, hp.post_proj_sizes, hp.post_proj_width, scope='post_cbhg') if speaker_embed is not None and hp.model_type == 'simple': expanded_speaker_emb = tf.expand_dims(speaker_embed, [1]) tiled_speaker_embedding = tf.tile( expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1]) # [N, T_out, 256 + alpha] post_outputs = \ tf.concat([tiled_speaker_embedding, post_outputs], axis=-1) linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.speaker_id = speaker_id self.input_lengths = input_lengths self.loss_coeff = loss_coeff self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets self.final_decoder_state = final_decoder_state log('='*40) log(' model_type: %s' % hp.model_type) log('='*40) log('Initialized Tacotron model. Dimensions: ') log(' embedding: %d' % char_embedded_inputs.shape[-1]) if speaker_embed is not None: log(' speaker embedding: %d' % speaker_embed.shape[-1]) else: log(' speaker embedding: None') log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.reduction_factor, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
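# Illustrative sketch (not the model code above): in the 'simple' multi-speaker variant the
# per-utterance speaker embedding [N, E] is expanded to [N, 1, E], tiled across the
# post-net's time axis and concatenated onto post_outputs before the final linear
# projection. NumPy shows the shape bookkeeping; sizes are made-up examples.
import numpy as np

N, T_out, post_depth, speaker_dim = 2, 6, 256, 16
post_outputs = np.random.rand(N, T_out, post_depth)
speaker_embed = np.random.rand(N, speaker_dim)

tiled = np.tile(speaker_embed[:, np.newaxis, :], (1, T_out, 1))     # [N, T_out, E]
post_with_speaker = np.concatenate([tiled, post_outputs], axis=-1)  # [N, T_out, 256 + E]
assert post_with_speaker.shape == (N, T_out, post_depth + speaker_dim)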
def __init__(self, sess, config, api, log_dir, forward, scope=None): # self.self_label = tf.placeholder(dtype=tf.bool,shape=(None), name="self_label") self.self_label = False self.vocab = api.vocab self.rev_vocab = api.rev_vocab self.vocab_size = len(self.vocab) self.sess = sess self.scope = scope self.max_utt_len = config.max_utt_len self.go_id = self.rev_vocab["<s>"] self.eos_id = self.rev_vocab["</s>"] self.context_cell_size = config.cxt_cell_size self.sent_cell_size = config.sent_cell_size self.dec_cell_size = config.dec_cell_size with tf.name_scope("io"): self.input_contexts = tf.placeholder(dtype=tf.int32, shape=(None, None), name="dialog_context") self.context_lens = tf.placeholder(dtype=tf.int32, shape=(None, ), name="context_lens") self.output_tokens = tf.placeholder(dtype=tf.int32, shape=(None, None), name="output_token") self.output_lens = tf.placeholder(dtype=tf.int32, shape=(None, ), name="output_lens") # optimization related variables self.learning_rate = tf.Variable(float(config.init_lr), trainable=False, name="learning_rate") self.learning_rate_decay_op = self.learning_rate.assign( tf.multiply(self.learning_rate, config.lr_decay)) self.global_t = tf.placeholder(dtype=tf.int32, name="global_t") self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior") max_input_len = array_ops.shape(self.input_contexts)[1] max_out_len = array_ops.shape(self.output_tokens)[1] batch_size = array_ops.shape(self.input_contexts)[0] with variable_scope.variable_scope("wordEmbedding"): self.embedding = tf.get_variable( "embedding", [self.vocab_size, config.embed_size], dtype=tf.float32) embedding_mask = tf.constant( [0 if i == 0 else 1 for i in range(self.vocab_size)], dtype=tf.float32, shape=[self.vocab_size, 1]) embedding = self.embedding * embedding_mask input_embedding = embedding_ops.embedding_lookup( embedding, self.input_contexts) output_embedding = embedding_ops.embedding_lookup( embedding, self.output_tokens) if config.sent_type == "rnn": sent_cell = self.get_rnncell("gru", self.sent_cell_size, config.keep_prob, 1) input_embedding, sent_size = get_rnn_encode(input_embedding, sent_cell, scope="sent_rnn") output_embedding, _ = get_rnn_encode(output_embedding, sent_cell, self.output_lens, scope="sent_rnn", reuse=True) elif config.sent_type == "bi_rnn": fwd_sent_cell = self.get_rnncell("gru", self.sent_cell_size, keep_prob=1.0, num_layer=1) bwd_sent_cell = self.get_rnncell("gru", self.sent_cell_size, keep_prob=1.0, num_layer=1) input_embedding, sent_size = get_bi_rnn_encode( input_embedding, fwd_sent_cell, bwd_sent_cell, self.context_lens, scope="sent_bi_rnn") output_embedding, _ = get_bi_rnn_encode(output_embedding, fwd_sent_cell, bwd_sent_cell, self.output_lens, scope="sent_bi_rnn", reuse=True) else: raise ValueError( "Unknown sent_type. 
Must be one of [bow, rnn, bi_rnn]") # reshape input into dialogs if config.keep_prob < 1.0: input_embedding = tf.nn.dropout(input_embedding, config.keep_prob) with variable_scope.variable_scope("contextRNN"): enc_cell = self.get_rnncell(config.cell_type, self.context_cell_size, keep_prob=1.0, num_layer=config.num_layer) # and enc_last_state will be same as the true last state input_embedding = tf.expand_dims(input_embedding, axis=2) _, enc_last_state = tf.nn.dynamic_rnn( enc_cell, input_embedding, dtype=tf.float32, sequence_length=self.context_lens) if config.num_layer > 1: if config.cell_type == 'lstm': enc_last_state = [temp.h for temp in enc_last_state] enc_last_state = tf.concat(enc_last_state, 1) else: if config.cell_type == 'lstm': enc_last_state = enc_last_state.h # input [enc_last_state, output_embedding] -- [c, x] --->z with variable_scope.variable_scope("recognitionNetwork"): recog_input = tf.concat([enc_last_state, output_embedding], 1) self.recog_mulogvar = recog_mulogvar = layers.fully_connected( recog_input, config.latent_size * 2, activation_fn=None, scope="muvar") recog_mu, recog_logvar = tf.split(recog_mulogvar, 2, axis=1) with variable_scope.variable_scope("priorNetwork"): # P(XYZ)=P(Z|X)P(X)P(Y|X,Z) prior_fc1 = layers.fully_connected(enc_last_state, np.maximum( config.latent_size * 2, 100), activation_fn=tf.tanh, scope="fc1") prior_mulogvar = layers.fully_connected(prior_fc1, config.latent_size * 2, activation_fn=None, scope="muvar") prior_mu, prior_logvar = tf.split(prior_mulogvar, 2, axis=1) # use sampled Z or posterior Z latent_sample = tf.cond( self.use_prior, lambda: sample_gaussian(prior_mu, prior_logvar), lambda: sample_gaussian(recog_mu, recog_logvar)) with variable_scope.variable_scope("label_encoder"): le_embedding_mask = tf.constant( [0 if i == 0 else 1 for i in range(self.vocab_size)], dtype=tf.float32, shape=[self.vocab_size, 1]) le_embedding = self.embedding * le_embedding_mask le_input_embedding = embedding_ops.embedding_lookup( le_embedding, self.input_contexts) le_output_embedding = embedding_ops.embedding_lookup( le_embedding, self.output_tokens) if config.sent_type == "rnn": le_sent_cell = self.get_rnncell("gru", self.sent_cell_size, config.keep_prob, 1) le_input_embedding, le_sent_size = get_rnn_encode( le_input_embedding, le_sent_cell, scope="sent_rnn") le_output_embedding, _ = get_rnn_encode(le_output_embedding, le_sent_cell, self.output_lens, scope="sent_rnn", reuse=True) elif config.sent_type == "bi_rnn": le_fwd_sent_cell = self.get_rnncell("gru", self.sent_cell_size, keep_prob=1.0, num_layer=1) le_bwd_sent_cell = self.get_rnncell("gru", self.sent_cell_size, keep_prob=1.0, num_layer=1) le_input_embedding, le_sent_size = get_bi_rnn_encode( le_input_embedding, le_fwd_sent_cell, le_bwd_sent_cell, self.context_lens, scope="sent_bi_rnn") le_output_embedding, _ = get_bi_rnn_encode(le_output_embedding, le_fwd_sent_cell, le_bwd_sent_cell, self.output_lens, scope="sent_bi_rnn", reuse=True) else: raise ValueError( "Unknown sent_type. 
Must be one of [bow, rnn, bi_rnn]") # reshape input into dialogs if config.keep_prob < 1.0: le_input_embedding = tf.nn.dropout(le_input_embedding, config.keep_prob) # [le_enc_last_state, le_output_embedding] with variable_scope.variable_scope("lecontextRNN"): enc_cell = self.get_rnncell(config.cell_type, self.context_cell_size, keep_prob=1.0, num_layer=config.num_layer) # and enc_last_state will be same as the true last state le_input_embedding = tf.expand_dims(le_input_embedding, axis=2) _, le_enc_last_state = tf.nn.dynamic_rnn( enc_cell, le_input_embedding, dtype=tf.float32, sequence_length=self.context_lens) if config.num_layer > 1: if config.cell_type == 'lstm': le_enc_last_state = [temp.h for temp in le_enc_last_state] le_enc_last_state = tf.concat(le_enc_last_state, 1) else: if config.cell_type == 'lstm': le_enc_last_state = le_enc_last_state.h best_en = tf.concat([le_enc_last_state, le_output_embedding], 1) with variable_scope.variable_scope("ggammaNet"): enc_cell = self.get_rnncell(config.cell_type, 200, keep_prob=1.0, num_layer=config.num_layer) # and enc_last_state will be same as the true last state input_embedding = tf.expand_dims(best_en, axis=2) _, zlabel = tf.nn.dynamic_rnn(enc_cell, input_embedding, dtype=tf.float32, sequence_length=self.context_lens) if config.num_layer > 1: if config.cell_type == 'lstm': zlabel = [temp.h for temp in enc_last_state] zlabel = tf.concat(zlabel, 1) else: if config.cell_type == 'lstm': zlabel = zlabel.h with variable_scope.variable_scope("generationNetwork"): gen_inputs = tf.concat([enc_last_state, latent_sample], 1) dec_inputs = gen_inputs selected_attribute_embedding = None # Decoder_init_state if config.num_layer > 1: dec_init_state = [] for i in range(config.num_layer): temp_init = layers.fully_connected(dec_inputs, self.dec_cell_size, activation_fn=None, scope="init_state-%d" % i) if config.cell_type == 'lstm': temp_init = rnn_cell.LSTMStateTuple( temp_init, temp_init) dec_init_state.append(temp_init) dec_init_state = tuple(dec_init_state) else: dec_init_state = layers.fully_connected(dec_inputs, self.dec_cell_size, activation_fn=None, scope="init_state") if config.cell_type == 'lstm': dec_init_state = rnn_cell.LSTMStateTuple( dec_init_state, dec_init_state) with variable_scope.variable_scope("generationNetwork1"): gen_inputs_sl = tf.concat([le_enc_last_state, zlabel], 1) dec_inputs_sl = gen_inputs_sl selected_attribute_embedding = None # Decoder_init_state if config.num_layer > 1: dec_init_state_sl = [] for i in range(config.num_layer): temp_init = layers.fully_connected(dec_inputs_sl, self.dec_cell_size, activation_fn=None, scope="init_state-%d" % i) if config.cell_type == 'lstm': temp_init = rnn_cell.LSTMStateTuple( temp_init, temp_init) dec_init_state_sl.append(temp_init) dec_init_state_sl = tuple(dec_init_state_sl) else: dec_init_state_sl = layers.fully_connected(dec_inputs_sl, self.dec_cell_size, activation_fn=None, scope="init_state") if config.cell_type == 'lstm': dec_init_state_sl = rnn_cell.LSTMStateTuple( dec_init_state_sl, dec_init_state_sl) with variable_scope.variable_scope("decoder"): dec_cell = self.get_rnncell(config.cell_type, self.dec_cell_size, config.keep_prob, config.num_layer) dec_cell = OutputProjectionWrapper(dec_cell, self.vocab_size) if forward: loop_func = decoder_fn_lib.context_decoder_fn_inference( None, dec_init_state, embedding, start_of_sequence_id=self.go_id, end_of_sequence_id=self.eos_id, maximum_length=self.max_utt_len, num_decoder_symbols=self.vocab_size, context_vector=selected_attribute_embedding) 
loop_func_sl = decoder_fn_lib.context_decoder_fn_inference( None, dec_init_state_sl, le_embedding, start_of_sequence_id=self.go_id, end_of_sequence_id=self.eos_id, maximum_length=self.max_utt_len, num_decoder_symbols=self.vocab_size, context_vector=selected_attribute_embedding) dec_input_embedding = None dec_input_embedding_sl = None dec_seq_lens = None else: loop_func = decoder_fn_lib.context_decoder_fn_train( dec_init_state, selected_attribute_embedding) loop_func_sl = decoder_fn_lib.context_decoder_fn_train( dec_init_state_sl, selected_attribute_embedding) dec_input_embedding = embedding_ops.embedding_lookup( embedding, self.output_tokens) dec_input_embedding_sl = embedding_ops.embedding_lookup( le_embedding, self.output_tokens) dec_input_embedding = dec_input_embedding[:, 0:-1, :] dec_input_embedding_sl = dec_input_embedding_sl[:, 0:-1, :] dec_seq_lens = self.output_lens - 1 if config.keep_prob < 1.0: dec_input_embedding = tf.nn.dropout( dec_input_embedding, config.keep_prob) dec_input_embedding_sl = tf.nn.dropout( dec_input_embedding_sl, config.keep_prob) # apply word dropping. Set dropped word to 0 if config.dec_keep_prob < 1.0: keep_mask = tf.less_equal( tf.random_uniform((batch_size, max_out_len - 1), minval=0.0, maxval=1.0), config.dec_keep_prob) keep_mask = tf.expand_dims(tf.to_float(keep_mask), 2) dec_input_embedding = dec_input_embedding * keep_mask dec_input_embedding_sl = dec_input_embedding_sl * keep_mask dec_input_embedding = tf.reshape( dec_input_embedding, [-1, max_out_len - 1, config.embed_size]) dec_input_embedding_sl = tf.reshape( dec_input_embedding_sl, [-1, max_out_len - 1, config.embed_size]) dec_outs, _, final_context_state = dynamic_rnn_decoder( dec_cell, loop_func, inputs=dec_input_embedding, sequence_length=dec_seq_lens) dec_outs_sl, _, final_context_state_sl = dynamic_rnn_decoder( dec_cell, loop_func_sl, inputs=dec_input_embedding_sl, sequence_length=dec_seq_lens) if final_context_state is not None: final_context_state = final_context_state[:, 0:array_ops. shape(dec_outs)[1]] mask = tf.to_int32(tf.sign(tf.reduce_max(dec_outs, axis=2))) self.dec_out_words = tf.multiply( tf.reverse(final_context_state, axis=[1]), mask) else: self.dec_out_words = tf.argmax(dec_outs, 2) if final_context_state_sl is not None: final_context_state_sl = final_context_state_sl[:, 0:array_ops. shape( dec_outs_sl )[1]] mask_sl = tf.to_int32( tf.sign(tf.reduce_max(dec_outs_sl, axis=2))) self.dec_out_words_sl = tf.multiply( tf.reverse(final_context_state_sl, axis=[1]), mask_sl) else: self.dec_out_words_sl = tf.argmax(dec_outs_sl, 2) if not forward: with variable_scope.variable_scope("loss"): labels = self.output_tokens[:, 1:] label_mask = tf.to_float(tf.sign(labels)) rc_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=dec_outs, labels=labels) rc_loss = tf.reduce_sum(rc_loss * label_mask, reduction_indices=1) self.avg_rc_loss = tf.reduce_mean(rc_loss) sl_rc_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=dec_outs_sl, labels=labels) sl_rc_loss = tf.reduce_sum(sl_rc_loss * label_mask, reduction_indices=1) self.sl_rc_loss = tf.reduce_mean(sl_rc_loss) # used only for perpliexty calculation. Not used for optimzation self.rc_ppl = tf.exp( tf.reduce_sum(rc_loss) / tf.reduce_sum(label_mask)) """ as n-trial multimodal distribution. 
""" kld = gaussian_kld(recog_mu, recog_logvar, prior_mu, prior_logvar) self.avg_kld = tf.reduce_mean(kld) if log_dir is not None: kl_weights = tf.minimum( tf.to_float(self.global_t) / config.full_kl_step, 1.0) else: kl_weights = tf.constant(1.0) self.label_loss = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( labels=latent_sample, logits=zlabel)) self.kl_w = kl_weights self.elbo = self.avg_rc_loss + kl_weights * self.avg_kld self.cvae_loss = self.elbo + +0.1 * self.label_loss self.sl_loss = self.sl_rc_loss tf.summary.scalar("rc_loss", self.avg_rc_loss) tf.summary.scalar("elbo", self.elbo) tf.summary.scalar("kld", self.avg_kld) self.summary_op = tf.summary.merge_all() self.log_p_z = norm_log_liklihood(latent_sample, prior_mu, prior_logvar) self.log_q_z_xy = norm_log_liklihood(latent_sample, recog_mu, recog_logvar) self.est_marginal = tf.reduce_mean(rc_loss - self.log_p_z + self.log_q_z_xy) self.train_sl_ops = self.optimize(sess, config, self.sl_loss, log_dir, scope="SL") self.train_ops = self.optimize(sess, config, self.cvae_loss, log_dir, scope="CVAE") self.saver = tf.train.Saver(tf.global_variables(), write_version=tf.train.SaverDef.V2)
def initialize(self, c_inputs, p_inputs, c_input_lengths, p_input_lengths, mel_targets=None, linear_targets=None): '''Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: c_inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs p_inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are phoneme IDs c_input_lengths and p_input_lenghts: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. ''' with tf.variable_scope('inference') as scope: is_training = linear_targets is not None batch_size = tf.shape(c_inputs)[0] input_lengths = c_input_lengths+p_input_lengths #for concat character and phoneme hp = self._hparams # Embeddings embedding_table = tf.get_variable( 'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) # c_embedded_inputs = tf.nn.embedding_lookup(embedding_table, c_inputs) # [N, c_T_in, embed_depth=256] p_embedded_inputs = tf.nn.embedding_lookup(embedding_table, p_inputs) # [N, p_T_in, embed_depth=256] with tf.variable_scope('Encoder') as scope: c_x = c_embedded_inputs p_x = p_embedded_inputs #3 Conv Layers for i in range(3): c_x = tf.layers.conv1d(c_x,filters=512,kernel_size=5,padding='same',activation=tf.nn.relu,name='c_Encoder_{}'.format(i)) c_x = tf.layers.batch_normalization(c_x, training=is_training) c_x = tf.layers.dropout(c_x, rate=0.5, training=is_training, name='dropout_{}'.format(i)) c_encoder_conv_output = c_x for i in range(3): p_x = tf.layers.conv1d(p_x,filters=512,kernel_size=5,padding='same',activation=tf.nn.relu,name='p_Encoder_{}'.format(i)) p_x = tf.layers.batch_normalization(p_x, training=is_training) p_x = tf.layers.dropout(p_x, rate=0.5, training=is_training, name='dropout_{}'.format(i)) p_encoder_conv_output = p_x #bi-directional LSTM cell_fw= ZoneoutLSTMCell(256, is_training, zoneout_factor_cell=0.1, zoneout_factor_output=0.1, name='encoder_fw_LSTM') cell_bw= ZoneoutLSTMCell(256, is_training, zoneout_factor_cell=0.1, zoneout_factor_output=0.1, name='encoder_bw_LSTM') c_outputs, c_states = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, c_encoder_conv_output, sequence_length=c_input_lengths, dtype=tf.float32) p_outputs, p_states = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, p_encoder_conv_output, sequence_length=p_input_lengths, dtype=tf.float32) # c_envoder_outpust = [N,c_T,2*encoder_lstm_units] = [N,c_T,512] c_encoder_outputs = tf.concat(c_outputs, axis=2) # Concat and return forward + backward outputs # p_envoder_outpust = [N,p_T,2*encoder_lstm_units] = [N,p_T,512] p_encoder_outputs = tf.concat(p_outputs, axis=2) # Concat and return character + phoneme = [N, c_T+p_T, 512] encoder_outputs = tf.concat([c_encoder_outputs, p_encoder_outputs], axis=1) # encoder_outputs = tf.cast(encoder_outputs, tf.float32) with tf.variable_scope('Decoder') as scope: if hp.attention_type == 'loc_sen': # Location 
Sensitivity Attention attention_mechanism = LocationSensitiveAttention(128, encoder_outputs,hparams=hp, is_training=is_training, mask_encoder=True, memory_sequence_length = input_lengths, smoothing=False, cumulate_weights=True) elif hp.attention_type == 'gmm': # GMM Attention attention_mechanism = GmmAttention(128, memory=encoder_outputs, memory_sequence_length = input_lengths) elif hp.attention_type == 'step_bah': attention_mechanism = BahdanauStepwiseMonotonicAttention(128, encoder_outputs, memory_sequence_length = input_lengths, mode="parallel") elif hp.attention_type == 'mon_bah': attention_mechanism = BahdanauMonotonicAttention(128, encoder_outputs, memory_sequence_length = input_lengths, normalize=True) elif hp.attention_type == 'loung': attention_mechanism = LuongAttention(128, encoder_outputs, memory_sequence_length = input_lengths) # attention_mechanism = LocationSensitiveAttention(128, encoder_outputs, hparams=hp, is_training=is_training, mask_encoder=True, memory_sequence_length = input_lengths, smoothing=False, cumulate_weights=True) #mask_encoder: whether to mask encoder padding while computing location sensitive attention. Set to True for better prosody but slower convergence. #cumulate_weights: Whether to cumulate (sum) all previous attention weights or simply feed previous weights (Recommended: True) decoder_lstm = [ZoneoutLSTMCell(1024, is_training, zoneout_factor_cell=0.1, zoneout_factor_output=0.1, name='decoder_LSTM_{}'.format(i+1)) for i in range(2)] decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm, state_is_tuple=True) # decoder_init_state = decoder_lstm.zero_state(batch_size=batch_size, dtype=tf.float32) # not present in TensorFlow 1 attention_cell = AttentionWrapper(decoder_lstm, attention_mechanism, alignment_history=True, output_attention=False) # attention_state_size = 256 # Decoder input -> prenet -> decoder_lstm -> concat[output, attention] dec_outputs = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths) dec_outputs_cell = OutputProjectionWrapper(dec_outputs,(hp.num_mels) * hp.outputs_per_step) if is_training: helper = TacoTrainingHelper(c_inputs, p_inputs, mel_targets, hp.num_mels, hp.outputs_per_step) else: helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) decoder_init_state = dec_outputs_cell.zero_state(batch_size=batch_size, dtype=tf.float32) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(dec_outputs_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry decoder_mel_outputs = tf.reshape(decoder_outputs[:,:,:hp.num_mels * hp.outputs_per_step], [batch_size, -1, hp.num_mels]) # [N, T_out, M] #stop_token_outputs = tf.reshape(decoder_outputs[:,:,hp.num_mels * hp.outputs_per_step:], [batch_size, -1]) # [N,iters] # Postnet x = decoder_mel_outputs for i in range(5): activation = tf.nn.tanh if i != (4) else None x = tf.layers.conv1d(x,filters=512, kernel_size=5, padding='same', activation=activation, name='Postnet_{}'.format(i)) x = tf.layers.batch_normalization(x, training=is_training) x = tf.layers.dropout(x, rate=0.5, training=is_training, name='Postnet_dropout_{}'.format(i)) residual = tf.layers.dense(x, hp.num_mels, name='residual_projection') mel_outputs = decoder_mel_outputs + residual # Add post-processing CBHG: # mel_outputs: (N,T,num_mels) post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training, hp.postnet_depth) linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F(1025)] 
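# --- Illustrative sketch (not part of the model above) ----------------------
# ZoneoutLSTMCell is imported from elsewhere in the repo; the core idea,
# sketched here on a single state tensor, is that during training each unit
# keeps its previous value with probability `rate`, while at inference the two
# states are deterministically interpolated with that same rate.
import tensorflow as tf

def zoneout(prev_state, new_state, rate, is_training):
    """Zoneout on one state tensor; the repo's cell applies it to c and h."""
    if is_training:
        keep_prev = tf.cast(
            tf.random_uniform(tf.shape(new_state)) < rate, tf.float32)
        return keep_prev * prev_state + (1.0 - keep_prev) * new_state
    return rate * prev_state + (1.0 - rate) * new_state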
# Grab alignments from the final decoder state: alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0]) # batch_size, text length(encoder), target length(decoder) self.c_inputs = c_inputs self.p_inputs = p_inputs self.c_input_lengths = c_input_lengths self.p_input_lengths = p_input_lengths self.decoder_mel_outputs = decoder_mel_outputs self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets #self.stop_token_targets = stop_token_targets #self.stop_token_outputs = stop_token_outputs self.all_vars = tf.trainable_variables() log('Initialized Tacotron model. Dimensions: ') log(' c_embedding: %d' % c_embedded_inputs.shape[-1]) log(' p_embedding: %d' % p_embedded_inputs.shape[-1]) # log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) #log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % dec_outputs_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
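# --- Illustrative sketch (not part of the model above) ----------------------
# The distinctive piece of this variant in isolation: character and phoneme
# sequences are encoded separately (with shared bi-LSTM weights) and their
# outputs are concatenated along the time axis, so the attention memory covers
# c_T + p_T positions and the memory length is the sum of the two lengths.
# Helper name and shapes are hypothetical.
import tensorflow as tf

def concat_dual_memory(c_encoder_outputs, p_encoder_outputs, c_lengths, p_lengths):
    """[N, c_T, D] and [N, p_T, D] -> memory [N, c_T + p_T, D], lengths [N]."""
    memory = tf.concat([c_encoder_outputs, p_encoder_outputs], axis=1)
    memory_lengths = c_lengths + p_lengths
    return memory, memory_lengths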
def embedding_tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, num_symbols, output_projection=None, feed_previous=False, dtype=dtypes.float32, scope=None): """Embedding RNN sequence-to-sequence model with tied (shared) parameters. This model first embeds encoder_inputs by a newly created embedding (of shape [num_symbols x cell.input_size]). Then it runs an RNN to encode embedded encoder_inputs into a state vector. Next, it embeds decoder_inputs using the same embedding. Then it runs RNN decoder, initialized with the last encoder state, on embedded decoder_inputs. Args: encoder_inputs: a list of 1D int32 Tensors of shape [batch_size]. decoder_inputs: a list of 1D int32 Tensors of shape [batch_size]. cell: RNNCell defining the cell function and size. num_symbols: integer; number of symbols for both encoder and decoder. output_projection: None or a pair (W, B) of output projection weights and biases; W has shape [cell.output_size x num_symbols] and B has shape [num_symbols]; if provided and feed_previous=True, each fed previous output will first be multiplied by W and added B. feed_previous: Boolean or scalar Boolean Tensor; if True, only the first of decoder_inputs will be used (the "GO" symbol), and all other decoder inputs will be taken from previous outputs (as in embedding_rnn_decoder). If False, decoder_inputs are used as given (the standard decoder case). dtype: The dtype to use for the initial RNN states (default: tf.float32). scope: VariableScope for the created subgraph; defaults to "embedding_tied_rnn_seq2seq". Returns: outputs: A list of the same length as decoder_inputs of 2D Tensors with shape [batch_size x num_decoder_symbols] containing the generated outputs. states: The state of each decoder cell in each time-step. This is a list with length len(decoder_inputs) -- one item for each time-step. Each item is a 2D Tensor of shape [batch_size x cell.state_size]. Raises: ValueError: when output_projection has the wrong shape. """ if output_projection is not None: proj_weights = ops.convert_to_tensor(output_projection[0], dtype=dtype) proj_weights.get_shape().assert_is_compatible_with( [cell.output_size, num_symbols]) proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) proj_biases.get_shape().assert_is_compatible_with([num_symbols]) with vs.variable_scope(scope or "embedding_tied_rnn_seq2seq"): with ops.device("/cpu:0"): embedding = vs.get_variable("embedding", [num_symbols, cell.input_size]) emb_encoder_inputs = [ embedding_ops.embedding_lookup(embedding, x) for x in encoder_inputs ] emb_decoder_inputs = [ embedding_ops.embedding_lookup(embedding, x) for x in decoder_inputs ] def extract_argmax_and_embed(prev, _): """Loop_function that extracts the symbol from prev and embeds it.""" if output_projection is not None: prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1]) prev_symbol = array_ops.stop_gradient(math_ops.argmax(prev, 1)) return embedding_ops.embedding_lookup(embedding, prev_symbol) if output_projection is None: cell = OutputProjectionWrapper(cell, num_symbols) if isinstance(feed_previous, bool): loop_function = extract_argmax_and_embed if feed_previous else None return tied_rnn_seq2seq(emb_encoder_inputs, emb_decoder_inputs, cell, loop_function=loop_function, dtype=dtype) else: # If feed_previous is a Tensor, we construct 2 graphs and use cond. 
outputs1, states1 = tied_rnn_seq2seq( emb_encoder_inputs, emb_decoder_inputs, cell, loop_function=extract_argmax_and_embed, dtype=dtype) vs.get_variable_scope().reuse_variables() outputs2, states2 = tied_rnn_seq2seq(emb_encoder_inputs, emb_decoder_inputs, cell, dtype=dtype) outputs = control_flow_ops.cond(feed_previous, lambda: outputs1, lambda: outputs2) states = control_flow_ops.cond(feed_previous, lambda: states1, lambda: states2) return outputs, states
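# --- Illustrative sketch (not part of the model above) ----------------------
# The "feed previous output" pattern used when feed_previous=True, written as a
# standalone factory: the previous logits are (optionally) projected, the
# argmax symbol is taken with a stopped gradient, and its embedding becomes the
# next decoder input.
import tensorflow as tf

def make_greedy_loop_function(embedding, output_projection=None):
    def loop_function(prev, _):
        if output_projection is not None:
            prev = tf.nn.xw_plus_b(prev, output_projection[0], output_projection[1])
        prev_symbol = tf.stop_gradient(tf.argmax(prev, axis=1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)
    return loop_function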
def __init__(self, sess, config, api, log_dir, forward, scope=None): self.vocab = api.vocab self.rev_vocab = api.rev_vocab self.vocab_size = len(self.vocab) self.sess = sess self.scope = scope self.max_utt_len = config.max_utt_len self.go_id = self.rev_vocab["<s>"] self.eos_id = self.rev_vocab["</s>"] self.sent_cell_size = config.sent_cell_size self.dec_cell_size = config.dec_cell_size self.attention_dim = config.attention_dim with tf.name_scope("io"): self.input_contexts = tf.placeholder(dtype=tf.int32, shape=(None, None, self.max_utt_len), name="dialog_context") self.floors = tf.placeholder(dtype=tf.int32, shape=(None, None), name="floor") self.context_lens = tf.placeholder(dtype=tf.int32, shape=(None, ), name="context_lens") self.output_tokens = tf.placeholder(dtype=tf.int32, shape=(None, None), name="output_token") self.output_lens = tf.placeholder(dtype=tf.int32, shape=(None, ), name="output_lens") self.learning_rate = tf.Variable(float(config.init_lr), trainable=False, name="learning_rate") self.learning_rate_decay_op = self.learning_rate.assign( tf.multiply(self.learning_rate, config.lr_decay)) self.global_t = tf.placeholder(dtype=tf.int32, name="global_t") self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior") max_dialog_len = array_ops.shape(self.input_contexts)[1] max_out_len = array_ops.shape(self.output_tokens)[1] batch_size = array_ops.shape(self.input_contexts)[0] self.max_dialog_len = max_dialog_len with variable_scope.variable_scope("wordEmbedding"): self.embedding = tf.get_variable( "embedding", [self.vocab_size, config.embed_size], dtype=tf.float32) embedding_mask = tf.constant( [0 if i == 0 else 1 for i in range(self.vocab_size)], dtype=tf.float32, shape=[self.vocab_size, 1]) embedding = self.embedding * embedding_mask input_embedding = embedding_ops.embedding_lookup( embedding, tf.reshape(self.input_contexts, [-1])) input_embedding = tf.reshape( input_embedding, [-1, self.max_utt_len, config.embed_size]) output_embedding = embedding_ops.embedding_lookup( embedding, self.output_tokens) if config.sent_type == "bow": input_embedding, sent_size = get_bow(input_embedding) output_embedding, _ = get_bow(output_embedding) elif config.sent_type == "rnn": sent_cell = self.get_rnncell("gru", self.sent_cell_size, config.keep_prob, 1) input_embedding, sent_size = get_rnn_encode(input_embedding, sent_cell, scope="sent_rnn") output_embedding, _ = get_rnn_encode(output_embedding, sent_cell, self.output_lens, scope="sent_rnn", reuse=True) elif config.sent_type == "bi_rnn": fwd_sent_cell = self.get_rnncell("gru", self.sent_cell_size, keep_prob=1.0, num_layer=1) bwd_sent_cell = self.get_rnncell("gru", self.sent_cell_size, keep_prob=1.0, num_layer=1) input_embedding, sent_size = get_bi_rnn_encode( input_embedding, fwd_sent_cell, bwd_sent_cell, scope="sent_bi_rnn") output_embedding, _ = get_bi_rnn_encode(output_embedding, fwd_sent_cell, bwd_sent_cell, self.output_lens, scope="sent_bi_rnn", reuse=True) else: raise ValueError( "Unknown sent_type. 
Must be one of [bow, rnn, bi_rnn]") input_embedding = tf.reshape(input_embedding, [-1, max_dialog_len, sent_size]) if config.keep_prob < 1.0: input_embedding = tf.nn.dropout(input_embedding, config.keep_prob) # convert floors into 1 hot floor_one_hot = tf.one_hot(tf.reshape(self.floors, [-1]), depth=2, dtype=tf.float32) floor_one_hot = tf.reshape(floor_one_hot, [-1, max_dialog_len, 2]) joint_embedding = tf.concat([input_embedding, floor_one_hot], 2, "joint_embedding") # [batch_size, seq_len, dim] # outputs, alphas = self.attention(joint_embedding, sent_size + 2, self.attention_dim) # [batch_size, dim] attention_embedding = tf.reduce_mean(joint_embedding, 1) with variable_scope.variable_scope("generationNetwork"): context_embedding = attention_embedding context_vector = attention_embedding if config.num_layer > 1: dec_init_state = [] for i in range(config.num_layer): temp_init = layers.fully_connected(context_embedding, self.dec_cell_size, activation_fn=None, scope="init_state-%d" % i) if config.cell_type == 'lstm': temp_init = rnn_cell.LSTMStateTuple( temp_init, temp_init) dec_init_state.append(temp_init) dec_init_state = tuple(dec_init_state) else: dec_init_state = layers.fully_connected(context_embedding, self.dec_cell_size, activation_fn=None, scope="init_state") if config.cell_type == 'lstm': dec_init_state = rnn_cell.LSTMStateTuple( dec_init_state, dec_init_state) with variable_scope.variable_scope("decoder"): dec_cell = self.get_rnncell(config.cell_type, self.dec_cell_size, config.keep_prob, config.num_layer) dec_cell = OutputProjectionWrapper(dec_cell, self.vocab_size) if forward: loop_func = decoder_fn_lib.context_decoder_fn_inference( None, dec_init_state, embedding, start_of_sequence_id=self.go_id, end_of_sequence_id=self.eos_id, maximum_length=self.max_utt_len, num_decoder_symbols=self.vocab_size, context_vector=context_vector) dec_input_embedding = None dec_seq_lens = None else: loop_func = decoder_fn_lib.context_decoder_fn_train( dec_init_state, context_vector) dec_input_embedding = embedding_ops.embedding_lookup( embedding, self.output_tokens) dec_input_embedding = dec_input_embedding[:, 0:-1, :] dec_seq_lens = self.output_lens - 1 if config.keep_prob < 1.0: dec_input_embedding = tf.nn.dropout( dec_input_embedding, config.keep_prob) # apply word dropping. Set dropped word to 0 if config.dec_keep_prob < 1.0: keep_mask = tf.less_equal( tf.random_uniform((batch_size, max_out_len - 1), minval=0.0, maxval=1.0), config.dec_keep_prob) keep_mask = tf.expand_dims(tf.to_float(keep_mask), 2) dec_input_embedding = dec_input_embedding * keep_mask dec_input_embedding = tf.reshape( dec_input_embedding, [-1, max_out_len - 1, config.embed_size]) dec_outs, _, final_context_state = dynamic_rnn_decoder( dec_cell, loop_func, inputs=dec_input_embedding, sequence_length=dec_seq_lens) if final_context_state is not None: final_context_state = final_context_state[:, 0:array_ops. 
shape(dec_outs)[1]] mask = tf.to_int32(tf.sign(tf.reduce_max(dec_outs, axis=2))) self.dec_out_words = tf.multiply( tf.reverse(final_context_state, axis=[1]), mask) else: self.dec_out_words = tf.argmax(dec_outs, 2) if not forward: with variable_scope.variable_scope("loss"): labels = self.output_tokens[:, 1:] label_mask = tf.to_float(tf.sign(labels)) rc_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=dec_outs, labels=labels) rc_loss = tf.reduce_sum(rc_loss * label_mask, reduction_indices=1) self.avg_rc_loss = tf.reduce_mean(rc_loss) self.rc_ppl = tf.exp( tf.reduce_sum(rc_loss) / tf.reduce_sum(label_mask)) aug_elbo = self.avg_rc_loss tf.summary.scalar("rc_loss", self.avg_rc_loss) self.summary_op = tf.summary.merge_all() self.optimize(sess, config, aug_elbo, log_dir) self.saver = tf.train.Saver(tf.global_variables(), write_version=tf.train.SaverDef.V2)
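# --- Illustrative sketch (not part of the model above) ----------------------
# The masked reconstruction loss above, in isolation: positions whose label id
# is 0 (padding) are masked out of the cross-entropy, the loss is summed per
# sequence, and perplexity is estimated over non-pad tokens only.
import tensorflow as tf

def masked_sequence_loss(logits, labels):
    """logits: [N, T, V]; labels: [N, T] with 0 = PAD. Returns (avg_loss, ppl)."""
    label_mask = tf.to_float(tf.sign(labels))
    ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
    per_seq_loss = tf.reduce_sum(ce * label_mask, axis=1)
    avg_loss = tf.reduce_mean(per_seq_loss)
    ppl = tf.exp(tf.reduce_sum(ce * label_mask) / tf.reduce_sum(label_mask))
    return avg_loss, ppl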
def initialize(self, txt_targets, txt_lengths, mel_targets, image_targets): with tf.variable_scope('inference') as scope: is_training = mel_targets is not None is_teacher_force_generating = mel_targets is not None batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings for text embedding_table = tf.get_variable( 'text_embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_txt_inputs = tf.nn.embedding_lookup( embedding_table, txt_targets) # [N, T_in, 256] # Text Encoder prenet_outputs = prenet(embedded_txt_inputs, is_training) # [N, T_in, 128] txt_encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training) # [N, T_in, 256] self.z_txt # Speech Encoder speech_outputs = reference_encoder( mel_targets, filters=hp.reference_filters, kernel_size=(3, 3), strides=(2, 2), encoder_cell=GRUCell(hp.reference_depth), is_training=is_training) # [N, 256] self.z_speech = speech_outputs # Image Encoder img_outputs = image_encoder('E', is_training=is_training, norm='batch', image_size=128) self.z_img = img_outputs def global_body(self, input): # Global computing body (share weights) # information fusion encoder self.z_fuse = info_encoder(input) # [N, 1, 256] # Global tokens (GST) gst_tokens = tf.get_variable( 'global_tokens', [hp.num_gst, hp.embed_depth // hp.num_heads], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) self.gst_tokens = gst_tokens # Attention attention = MultiheadAttention( tf.expand_dims(z_fuse, axis=1), # [N, 1, 256] tf.tanh( tf.tile(tf.expand_dims(gst_tokens, axis=0), [batch_size, 1, 1 ])), # [N, hp.num_gst, 256/hp.num_heads] num_heads=hp.num_heads, num_units=hp.style_att_dim, attention_type=hp.style_att_type) output = attention.multi_head_attention() # [N, 1, 256] self.uni_embedding = output return self.uni_embedding # Domain classification network domain_logit_txt = domain_classifier('D', is_training=is_training, norm='batch', info_encoder(self.z_txt)) domain_logit_img = domain_classifier('D', is_training=is_training, norm='batch', info_encoder(self.z_img)) domain_logit_speech = domain_classifier('D', is_training=is_training, norm='batch', info_encoder( self.z_speech)) # out of inference scope # Add style embedding to every text encoder state # Text Decoder scope with tf.variable_scope('text_decoder') as scope: attention_cell = AttentionWrapper( GRUCell(hp.attention_depth), BahdanauAttention(hp.attention_depth, uni_embeddings, memory_sequence_length=input_lengths), alignment_history=True, output_attention=False) # [N, T_in, 256] # Concatenate attention context vector and RNN cell output. 
concat_cell = ConcatOutputAndAttentionWrapper(attention_cell) # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell([ OutputProjectionWrapper(concat_cell, hp.rnn_depth), ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)), ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)) ], state_is_tuple=True) # [N, T_in, 256] output_cell = OutputProjectionWrapper(decoder_cell, hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) decoder_outputs, _ = tf.nn.dynamic_rnn( cell=output_cell, initial_state=decoder_init_state, maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] with tf.variable_scope('text_logits') as scope: txt_logit = tf.contrib.layers.fully_connected( inputs=decoder_outputs, num_outputs=self.config.vocab_size, activation_fn=None, weights_initializer=self.initializer, scope=logits_scope) # Image Decoder scope with tf.variable_scope('image_decoder') as scope: G = Generator('G', is_train=self.is_training, norm='batch', image_size=128) fake_img = G(uni_embeddings) # Speech Decoder scope with tf.variable_scope('speech_decoder') as scope: # Attention attention_cell = AttentionWrapper( GRUCell(hp.attention_depth), BahdanauAttention(hp.attention_depth, uni_embeddings, memory_sequence_length=input_lengths), alignment_history=True, output_attention=False) # [N, T_in, 256] # Concatenate attention context vector and RNN cell output. concat_cell = ConcatOutputAndAttentionWrapper(attention_cell) # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell([ OutputProjectionWrapper(concat_cell, hp.rnn_depth), ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)), ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)) ], state_is_tuple=True) # [N, T_in, 256] # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper( decoder_cell, hp.num_mels * hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) if is_training: helper = TacoTrainingHelper(inputs, mel_targets, hp) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry fake_mel = tf.reshape( decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M] self.txt_targets = txt_targets self.txt_lengths = txt_lengths self.mel_targets = mel_targets self.image_targets = image_targets self.txt_targets = txt_targets self.txt_logit = txt_logit self.fake_mel = fake_mel self.fake_img = fake_img
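# --- Illustrative sketch (not part of the model above) ----------------------
# A single-head version of the style-token attention performed by the
# MultiheadAttention call in the model above: the fused embedding queries a
# bank of learned global tokens with scaled dot-product attention. The real
# model uses hp.num_heads heads and learned projections; here the query and
# token dimensions are assumed to match for brevity.
import tensorflow as tf

def style_token_attention(query, tokens):
    """query: [N, 1, D]; tokens: [num_gst, D] -> style embedding [N, 1, D]."""
    batch_size = tf.shape(query)[0]
    keys = tf.tile(tf.expand_dims(tf.tanh(tokens), axis=0), [batch_size, 1, 1])
    d = tf.cast(tf.shape(keys)[-1], tf.float32)
    scores = tf.matmul(query, keys, transpose_b=True) / tf.sqrt(d)   # [N, 1, num_gst]
    weights = tf.nn.softmax(scores)
    return tf.matmul(weights, keys)                                   # [N, 1, D]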