def __init__(self, inp, inp_mask, decode_time_steps, ctr_flag, ctr_attention,
             hyper_params=None, name='Tacotron'):
    """
    Build the computational graph.
    :param inp: input character ids, shape (batch_size, input_time_steps)
    :param inp_mask: lengths of the input sequences, shape (batch_size,)
    :param decode_time_steps: number of (reduced) decoder steps to run
    :param ctr_flag: when equal to 1, the learned style attention is replaced
                     by the externally supplied ctr_attention
    :param ctr_attention: manual style attention weights,
                          shape (batch_size, styles_kind)
    :param hyper_params: HyperParams instance; defaults are used when None
    :param name: variable scope name
    """
    super(Tacotron, self).__init__(name)
    self.hyper_params = HyperParams() if hyper_params is None else hyper_params
    with tf.variable_scope(name):
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        batch_size = tf.shape(inp)[0]
        input_time_steps = tf.shape(inp)[1]
        reduc = self.hyper_params.reduction_rate
        output_time_steps = decode_time_steps * reduc

        ### Encoder [begin]
        with tf.variable_scope('character_embedding'):
            embed_inp = EmbeddingLayer(self.hyper_params.embed_class,
                                       self.hyper_params.embed_dim)(inp)
        with tf.variable_scope("changeToVarible"):
            self.single_style_token = tf.get_variable(
                'style_token',
                (1, self.hyper_params.styles_kind, self.hyper_params.style_dim),
                dtype=tf.float32)
            self.style_token = tf.tile(self.single_style_token,
                                       (batch_size, 1, 1))
        with tf.variable_scope('encoder_pre_net'):
            pre_ed_inp = tf.layers.dropout(
                tf.layers.dense(embed_inp, 256, tf.nn.relu), training=False)
            pre_ed_inp = tf.layers.dropout(
                tf.layers.dense(pre_ed_inp, 128, tf.nn.relu), training=False)
        encoder_output = modules.cbhg(pre_ed_inp, training=False, k=16,
                                      bank_filters=128,
                                      projection_filters=(128, 128),
                                      highway_layers=4, highway_units=128,
                                      bi_gru_units=128,
                                      sequence_length=inp_mask,
                                      name='encoder_cbhg', reuse=False)
        with tf.variable_scope('post_text'):
            # Run one more GRU over the encoder output; its last time step is
            # used below as a fixed-size summary of the text when predicting
            # the text/style mixing weights.
            all_outputs, _ = tf.nn.dynamic_rnn(
                cell=GRUCell(256), inputs=encoder_output,
                sequence_length=inp_mask, dtype=encoder_output.dtype,
                parallel_iterations=32)  # fixed: was the undefined name
                                         # `unkonwn_parallel_iterations`
            all_outputs = tf.transpose(all_outputs, [1, 0, 2])
            static_encoder_output = all_outputs[-1]
        ### Encoder [end]

        ### Attention Module
        with tf.variable_scope('attention'):
            att_module = AttentionModule(256, encoder_output,
                                         sequence_length=inp_mask,
                                         time_major=False)
        with tf.variable_scope("attention_style"):
            att_module_style = AttentionModule(256, self.style_token,
                                               time_major=False)

        ### Decoder [begin]
        att_cell = GRUCell(256)
        dec_cell = MultiRNNCell(
            [ResidualWrapper(GRUCell(256)) for _ in range(2)])
        with tf.variable_scope('prepare_decode'):
            # prepare the output / attention TensorArrays
            reduced_time_steps = tf.div(output_time_steps, reduc)
            init_att_cell_state = att_cell.zero_state(batch_size, tf.float32)
            init_dec_cell_state = dec_cell.zero_state(batch_size, tf.float32)
            init_state_tup = tuple([init_att_cell_state, init_dec_cell_state])
            init_output_ta = tf.TensorArray(size=reduced_time_steps,
                                            dtype=tf.float32)
            init_alpha_ta = tf.TensorArray(size=reduced_time_steps,
                                           dtype=tf.float32)
            init_weight_ta = tf.TensorArray(size=reduced_time_steps,
                                            dtype=tf.float32)
            init_weight_per_ta = tf.TensorArray(size=reduced_time_steps,
                                                dtype=tf.float32)
            init_alpha_style_ta = tf.TensorArray(size=reduced_time_steps,
                                                 dtype=tf.float32)
            go_array = tf.zeros([batch_size, self.hyper_params.seq2seq_dim],
                                dtype=tf.float32)
            init_context = tf.zeros([batch_size, 256], dtype=tf.float32)
            init_context_style = tf.zeros([batch_size, 256], dtype=tf.float32)
            init_time = tf.constant(0, dtype=tf.int32)
        cond = lambda x, *_: tf.less(x, reduced_time_steps)

        def body(this_time, old_output_ta, old_alpha_ta, old_alpha_style_ta,
                 old_weight_ta, old_weight_per_ta, old_state_tup,
                 last_context, last_context_style, last_output):
            with tf.variable_scope('decoder_pre_net'):
                dec_pre_ed_inp = last_output
                dec_pre_ed_inp = tf.layers.dropout(
                    tf.layers.dense(dec_pre_ed_inp, 256, tf.nn.relu),
                    training=False)
                dec_pre_ed_inp = tf.layers.dropout(
                    tf.layers.dense(dec_pre_ed_inp, 128, tf.nn.relu),
                    training=False)
            with tf.variable_scope('attention_rnn'):
                att_cell_inp = tf.concat([last_context, dec_pre_ed_inp],
                                         axis=-1)
                att_cell_out, att_cell_state = att_cell(att_cell_inp,
                                                        old_state_tup[0])
            with tf.variable_scope('attention'):
                query = att_cell_state[0]
                context, alpha = att_module(query)
                new_alpha_ta = old_alpha_ta.write(this_time, alpha)
            with tf.variable_scope("attention_style"):
                query_style = att_cell_state[0]
                context_style, alpha_style = att_module_style(query_style)
                # When ctr_flag == 1, override the learned style attention
                # with the manual weights and recompute the style context as
                # the corresponding weighted sum of the style tokens.
                alpha_style = tf.cond(tf.equal(ctr_flag, 1),
                                      lambda: ctr_attention,
                                      lambda: alpha_style)
                alpha_style = tf.Print(alpha_style, [alpha_style],
                                       message='alpha:', summarize=10)
                context_style = tf.cond(
                    tf.equal(ctr_flag, 1),
                    lambda: tf.reduce_sum(
                        tf.expand_dims(alpha_style, axis=-1) *
                        self.style_token, axis=1),
                    lambda: context_style)
                context_style = tf.Print(context_style, [context_style],
                                         message='style:', summarize=10)
                new_alpha_style_ta = old_alpha_style_ta.write(this_time,
                                                              alpha_style)
            with tf.variable_scope("weighting"):
                # Predict how much of the final context should come from the
                # text attention vs. the style attention.
                weight_input = tf.concat(
                    [static_encoder_output, dec_pre_ed_inp], axis=-1)
                weighting = tf.layers.dense(weight_input, 2, tf.nn.sigmoid)
                weighting = tf.nn.softmax(weighting)
                weight_text, weight_style = tf.split(weighting, [1, 1], -1)
                weight_style = tf.Print(weight_style, [weight_style],
                                        message='weight_style:')
                new_weight_ta = old_weight_ta.write(this_time, weight_text)
            with tf.variable_scope('decoder_rnn'):
                weighting_context = (weight_text * context +
                                     weight_style * context_style)
                # Fraction of the mixed context magnitude contributed by the
                # style branch, recorded for inspection.
                weight_per = tf.reduce_mean(
                    tf.abs(weight_style * context_style) /
                    (tf.abs(weight_text * context) +
                     tf.abs(weight_style * context_style)))
                new_weight_per_ta = old_weight_per_ta.write(this_time,
                                                            weight_per)
                dec_input = tf.layers.dense(
                    tf.concat([att_cell_out, weighting_context], axis=-1),
                    256)
                dec_cell_out, dec_cell_state = dec_cell(dec_input,
                                                        old_state_tup[1])
                dense_out = tf.layers.dense(
                    dec_cell_out, self.hyper_params.seq2seq_dim * reduc)
                new_output_ta = old_output_ta.write(this_time, dense_out)
                # Feed the last frame of this reduced step back in as the
                # next decoder input.
                new_output = dense_out[:, -self.hyper_params.seq2seq_dim:]
            new_state_tup = tuple([att_cell_state, dec_cell_state])
            return (tf.add(this_time, 1), new_output_ta, new_alpha_ta,
                    new_alpha_style_ta, new_weight_ta, new_weight_per_ta,
                    new_state_tup, context, context_style, new_output)

        # run loop
        _, seq2seq_output_ta, alpha_ta, alpha_style_ta, weight_ta, \
            weight_per_ta, *_ = tf.while_loop(cond, body, [
                init_time, init_output_ta, init_alpha_ta, init_alpha_style_ta,
                init_weight_ta, init_weight_per_ta, init_state_tup,
                init_context, init_context_style, go_array])

        with tf.variable_scope('reshape_decode'):
            seq2seq_output = tf.reshape(
                seq2seq_output_ta.stack(),
                shape=(reduced_time_steps, batch_size,
                       self.hyper_params.seq2seq_dim * reduc))
            seq2seq_output = tf.reshape(
                tf.transpose(seq2seq_output, perm=(1, 0, 2)),
                shape=(batch_size, output_time_steps,
                       self.hyper_params.seq2seq_dim))
            self.seq2seq_output = seq2seq_output
            alpha_output = tf.reshape(
                alpha_ta.stack(),
                shape=(reduced_time_steps, batch_size, input_time_steps))
            alpha_output = tf.expand_dims(
                tf.transpose(alpha_output, perm=(1, 0, 2)), -1)
            self.alpha_output = alpha_output
            alpha_output_style = tf.reshape(
                alpha_style_ta.stack(),
                shape=(reduced_time_steps, batch_size,
                       self.hyper_params.styles_kind))
            alpha_output_style = tf.expand_dims(
                tf.transpose(alpha_output_style, perm=(1, 0, 2)),
                -1)  # batch major
            self.alpha_output_style = alpha_output_style
            weight_ta = tf.reshape(weight_ta.stack(),
                                   shape=(reduced_time_steps, batch_size, 1))
            weight_ta = tf.transpose(weight_ta, perm=(1, 0, 2))
            self.weight_ta = weight_ta
            weight_per_ta = tf.reshape(weight_per_ta.stack(),
                                       shape=(reduced_time_steps, 1))
            self.weight_per_ta = weight_per_ta
        ### Decoder [end]

        ### PostNet [begin]
        post_output = modules.cbhg(
            seq2seq_output, training=False, k=8, bank_filters=128,
            projection_filters=(256, self.hyper_params.seq2seq_dim),
            highway_layers=4, highway_units=128, bi_gru_units=128,
            sequence_length=None, name='decoder_cbhg', reuse=False)
        post_output = tf.layers.dense(post_output,
                                      self.hyper_params.post_dim,
                                      name='post_linear_transform')
        self.post_output = post_output
        ### PostNet [end]
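# Hedged usage sketch (not part of the original source): one way the inference
# graph above might be driven with a manual style override. The placeholder
# shapes, the decode_time_steps value, the dummy character ids and the one-hot
# style choice are illustrative assumptions.
def _example_controlled_synthesis():
    import numpy as np
    hp = HyperParams()
    inp = tf.placeholder(tf.int32, (None, None), name='inp')
    inp_mask = tf.placeholder(tf.int32, (None,), name='inp_mask')
    ctr_flag = tf.placeholder(tf.int32, (), name='ctr_flag')
    ctr_attention = tf.placeholder(tf.float32, (None, hp.styles_kind),
                                   name='ctr_attention')
    model = Tacotron(inp, inp_mask, decode_time_steps=100,
                     ctr_flag=ctr_flag, ctr_attention=ctr_attention,
                     hyper_params=hp)
    # Put all attention mass on style token 0; with ctr_flag=1 this replaces
    # the learned style attention at every decoder step.
    style = np.zeros((1, hp.styles_kind), dtype=np.float32)
    style[0, 0] = 1.0
    char_ids = np.array([[10, 11, 12]], dtype=np.int32)  # dummy ids
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        post = sess.run(model.post_output,
                        feed_dict={inp: char_ids, inp_mask: [3],
                                   ctr_flag: 1, ctr_attention: style})
    return post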
def __init__(self, inp, inp_mask, seq2seq_gtruth, post_gtruth,
             hyper_params=None, training=True, name='Tacotron', reuse=False):
    """
    Build the computational graph.
    :param inp: input character ids, shape (batch_size, input_time_steps)
    :param inp_mask: lengths of the input sequences, shape (batch_size,)
    :param seq2seq_gtruth: ground-truth targets for the seq2seq decoder
    :param post_gtruth: ground-truth targets for the post-net output
    :param hyper_params: HyperParams instance; defaults are used when None
    :param training: whether dropout runs in training mode
    :param name: variable scope name
    :param reuse: whether to reuse variables in the enclosing scope
    """
    super(Tacotron, self).__init__(name)
    self.hyper_params = HyperParams() if hyper_params is None else hyper_params
    with tf.variable_scope(name, reuse=reuse):
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.learning_rate = tf.Variable(self.hyper_params.learning_rate[0],
                                         name='learning_rate',
                                         trainable=False, dtype=tf.float32)
        batch_size = tf.shape(inp)[0]
        input_time_steps = tf.shape(inp)[1]
        output_time_steps = tf.shape(seq2seq_gtruth)[1]

        ### Encoder [begin]
        with tf.variable_scope('character_embedding'):
            embed_inp = EmbeddingLayer(self.hyper_params.embed_class,
                                       self.hyper_params.embed_dim)(inp)
        with tf.variable_scope("changeToVarible"):
            self.single_style_token = tf.get_variable(
                'style_token',
                (1, self.hyper_params.styles_kind, self.hyper_params.style_dim),
                dtype=tf.float32)
            self.style_token = tf.tile(self.single_style_token,
                                       (batch_size, 1, 1))
        with tf.variable_scope('encoder_pre_net'):
            pre_ed_inp = tf.layers.dropout(
                tf.layers.dense(embed_inp, 256, tf.nn.relu),
                training=training)
            pre_ed_inp = tf.layers.dropout(
                tf.layers.dense(pre_ed_inp, 128, tf.nn.relu),
                training=training)
        encoder_output = modules.cbhg(pre_ed_inp, training=training, k=16,
                                      bank_filters=128,
                                      projection_filters=(128, 128),
                                      highway_layers=4, highway_units=128,
                                      bi_gru_units=128,
                                      sequence_length=inp_mask,
                                      name='encoder_cbhg', reuse=False)
        ### Encoder [end]

        ### Attention Module
        with tf.variable_scope('attention'):
            att_module = AttentionModule(256, encoder_output,
                                         sequence_length=inp_mask,
                                         time_major=False)
        with tf.variable_scope("attention_style"):
            att_module_style = AttentionModule(256, self.style_token,
                                               time_major=False)

        ### Decoder [begin]
        att_cell = GRUCell(256)
        dec_cell = MultiRNNCell(
            [ResidualWrapper(GRUCell(256)) for _ in range(2)])
        with tf.variable_scope('prepare_decode'):
            # prepare the output / attention TensorArrays
            reduc = self.hyper_params.reduction_rate
            reduced_time_steps = tf.div(output_time_steps, reduc)
            init_att_cell_state = att_cell.zero_state(batch_size, tf.float32)
            init_dec_cell_state = dec_cell.zero_state(batch_size, tf.float32)
            init_state_tup = tuple([init_att_cell_state, init_dec_cell_state])
            init_output_ta = tf.TensorArray(size=reduced_time_steps,
                                            dtype=tf.float32)
            init_alpha_ta = tf.TensorArray(size=reduced_time_steps,
                                           dtype=tf.float32)
            init_weight_ta = tf.TensorArray(size=reduced_time_steps,
                                            dtype=tf.float32)
            init_weight_per_ta = tf.TensorArray(size=reduced_time_steps,
                                                dtype=tf.float32)
            init_alpha_style_ta = tf.TensorArray(size=reduced_time_steps,
                                                 dtype=tf.float32)
            # Teacher forcing: prepend `reduc` zero frames so that step t of
            # the decoder reads the last ground-truth frame of step t - 1.
            time_major_seq2seq_gtruth = tf.transpose(seq2seq_gtruth,
                                                     perm=(1, 0, 2))
            indic_array = tf.concat([
                tf.zeros([reduc, batch_size, self.hyper_params.seq2seq_dim]),
                time_major_seq2seq_gtruth
            ], axis=0)
            init_context = tf.zeros([batch_size, 256], dtype=tf.float32)
            init_context_style = tf.zeros([batch_size, 256], dtype=tf.float32)
            init_time = tf.constant(0, dtype=tf.int32)
        cond = lambda x, *_: tf.less(x, reduced_time_steps)

        def body(this_time, old_context, old_context_style, old_output_ta,
                 old_alpha_ta, old_alpha_style_ta, old_weight_ta,
                 old_weight_per_ta, old_state_tup):
            with tf.variable_scope('decoder_pre_net'):
                dec_pre_ed_inp = indic_array[reduc * this_time + reduc - 1]
                dec_pre_ed_inp = tf.layers.dropout(
                    tf.layers.dense(dec_pre_ed_inp, 256, tf.nn.relu),
                    training=training)
                dec_pre_ed_inp = tf.layers.dropout(
                    tf.layers.dense(dec_pre_ed_inp, 128, tf.nn.relu),
                    training=training)
            with tf.variable_scope('attention_rnn'):
                att_cell_inp = tf.concat(
                    [old_context, old_context_style, dec_pre_ed_inp], axis=-1)
                att_cell_out, att_cell_state = att_cell(att_cell_inp,
                                                        old_state_tup[0])
            with tf.variable_scope('attention'):
                query = att_cell_state[0]
                context, alpha = att_module(query)
                new_alpha_ta = old_alpha_ta.write(this_time, alpha)
            with tf.variable_scope("attention_style"):
                query_style = att_cell_state[0]
                context_style, alpha_style = att_module_style(query_style)
                new_alpha_style_ta = old_alpha_style_ta.write(this_time,
                                                              alpha_style)
            with tf.variable_scope("weighting"):
                # A single sigmoid gate balances the text context against the
                # style context.
                weighting = tf.layers.dense(dec_pre_ed_inp, 1, tf.nn.sigmoid)
                # weighting = tf.nn.softmax(weighting)
                new_weight_ta = old_weight_ta.write(this_time, weighting)
            with tf.variable_scope('decoder_rnn'):
                weighting_context = (weighting * context +
                                     (1 - weighting) * context_style)
                # Fraction of the mixed context magnitude contributed by the
                # style branch, recorded for inspection.
                weight_per = tf.reduce_mean(
                    tf.abs((1 - weighting) * context_style) /
                    (tf.abs(weighting * context) +
                     tf.abs((1 - weighting) * context_style)))
                new_weight_per_ta = old_weight_per_ta.write(this_time,
                                                            weight_per)
                dec_input = tf.layers.dense(
                    tf.concat([att_cell_out, weighting_context], axis=-1),
                    256)
                dec_cell_out, dec_cell_state = dec_cell(dec_input,
                                                        old_state_tup[1])
                dense_out = tf.layers.dense(
                    dec_cell_out, self.hyper_params.seq2seq_dim * reduc)
                new_output_ta = old_output_ta.write(this_time, dense_out)
            new_state_tup = tuple([att_cell_state, dec_cell_state])
            return (tf.add(this_time, 1), context, context_style,
                    new_output_ta, new_alpha_ta, new_alpha_style_ta,
                    new_weight_ta, new_weight_per_ta, new_state_tup)

        # run loop
        _, _, _, seq2seq_output_ta, alpha_ta, alpha_style_ta, weight_ta, \
            weight_per_ta, *_ = tf.while_loop(cond, body, [
                init_time, init_context, init_context_style, init_output_ta,
                init_alpha_ta, init_alpha_style_ta, init_weight_ta,
                init_weight_per_ta, init_state_tup
            ], parallel_iterations=32)

        with tf.variable_scope('reshape_decode'):
            seq2seq_output = tf.reshape(
                seq2seq_output_ta.stack(),
                shape=(reduced_time_steps, batch_size,
                       self.hyper_params.seq2seq_dim * reduc))
            seq2seq_output = tf.reshape(
                tf.transpose(seq2seq_output, perm=(1, 0, 2)),
                shape=(batch_size, output_time_steps,
                       self.hyper_params.seq2seq_dim))
            self.seq2seq_output = seq2seq_output
            alpha_output = tf.reshape(
                alpha_ta.stack(),
                shape=(reduced_time_steps, batch_size, input_time_steps))
            alpha_output = tf.expand_dims(
                tf.transpose(alpha_output, perm=(1, 0, 2)), -1)
            self.alpha_output = alpha_output
            alpha_output_style = tf.reshape(
                alpha_style_ta.stack(),
                shape=(reduced_time_steps, batch_size,
                       self.hyper_params.styles_kind))
            alpha_output_style = tf.expand_dims(
                tf.transpose(alpha_output_style, perm=(1, 0, 2)),
                -1)  # batch major
            self.alpha_output_style = alpha_output_style
            weight_ta = tf.reshape(weight_ta.stack(),
                                   shape=(reduced_time_steps, batch_size, 1))
            weight_ta = tf.transpose(weight_ta, perm=(1, 0, 2))
            self.weight_ta = weight_ta
            weight_per_ta = tf.reshape(weight_per_ta.stack(),
                                       shape=(reduced_time_steps, 1))
            self.weight_per_ta = weight_per_ta
        ### Decoder [end]

        ### PostNet [begin]
        post_output = modules.cbhg(
            seq2seq_output, training=training, k=8, bank_filters=128,
            projection_filters=(256, self.hyper_params.seq2seq_dim),
            highway_layers=4, highway_units=128, bi_gru_units=128,
            sequence_length=None, name='decoder_cbhg', reuse=False)
        post_output = tf.layers.dense(post_output,
                                      self.hyper_params.post_dim,
                                      name='post_linear_transform')
        self.post_output = post_output
        ### PostNet [end]

        ### Loss
        with tf.variable_scope('loss'):
            self.seq2seq_loss = l1_loss(seq2seq_gtruth, seq2seq_output)
            self.post_loss = l1_loss(post_gtruth, post_output)
            self.loss = self.seq2seq_loss + self.post_loss
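# Hedged training sketch (not part of the original source): one way to wire
# the loss above to an optimizer. Adam and the session loop are illustrative
# assumptions; only model.loss, model.learning_rate and model.global_step
# come from the graph built above.
def _example_train_step(model, feeds):
    opt = tf.train.AdamOptimizer(model.learning_rate)
    train_op = opt.minimize(model.loss, global_step=model.global_step)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # `feeds` maps the inp / inp_mask / seq2seq_gtruth / post_gtruth
        # placeholders to one batch of data.
        _, loss = sess.run([train_op, model.loss], feed_dict=feeds)
    return loss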
def build(self, inp, inp_mask, mel_gtruth, spec_gtruth):
    batch_size = tf.shape(inp)[0]
    input_time_steps = tf.shape(inp)[1]
    output_time_steps = tf.shape(mel_gtruth)[1]

    ### Encoder [begin]
    with tf.variable_scope("encoder"):
        with tf.variable_scope("embedding"):
            embed_inp = EmbeddingLayer(EMBED_CLASS, EMBED_DIM)(inp)
        with tf.variable_scope("changeToVarible"):
            self.single_style_token = tf.get_variable(
                'style_token', (1, styles_kind, style_dim), dtype=tf.float32)
            self.style_token = tf.tile(self.single_style_token,
                                       (batch_size, 1, 1))
        with tf.variable_scope("pre-net"):
            pre_ed_inp = tf.layers.dropout(
                tf.layers.dense(embed_inp, 256, tf.nn.relu),
                training=self.training)
            pre_ed_inp = tf.layers.dropout(
                tf.layers.dense(pre_ed_inp, 128, tf.nn.relu),
                training=self.training)
        with tf.variable_scope("CBHG"):
            # batch major
            encoder_output = CBHG(16, (128, 128))(pre_ed_inp,
                                                  sequence_length=inp_mask,
                                                  is_training=self.training,
                                                  time_major=False)
    with tf.variable_scope("attention"):
        att_module = AttentionModule(ATT_RNN_SIZE, encoder_output,
                                     sequence_length=inp_mask,
                                     time_major=False)
    with tf.variable_scope("attention_style"):
        att_module_style = AttentionModule(STYLE_ATT_RNN_SIZE,
                                           self.style_token,
                                           time_major=False)
    with tf.variable_scope("decoder"):
        with tf.variable_scope("attentionRnn"):
            att_cell = GRUCell(ATT_RNN_SIZE)
        with tf.variable_scope("acoustic_module"):
            aco_cell = MultiRNNCell(
                [ResidualWrapper(GRUCell(DEC_RNN_SIZE)) for _ in range(2)])
        ### prepare output alpha TensorArray
        reduced_time_steps = tf.div(output_time_steps, self.r)
        att_cell_state = att_cell.init_state(batch_size, tf.float32)
        aco_cell_state = aco_cell.zero_state(batch_size, tf.float32)
        state_tup = tuple([att_cell_state, aco_cell_state])
        output_ta = tf.TensorArray(size=reduced_time_steps, dtype=tf.float32)
        alpha_ta = tf.TensorArray(size=reduced_time_steps, dtype=tf.float32)
        weight_ta = tf.TensorArray(size=reduced_time_steps, dtype=tf.float32)
        alpha_style_ta = tf.TensorArray(size=reduced_time_steps,
                                        dtype=tf.float32)
        indic_ta = tf.TensorArray(size=self.r + output_time_steps,
                                  dtype=tf.float32)
        # Teacher forcing: prepend r zero frames so that step t reads the
        # last ground-truth frame of step t - 1.
        time_major_mel_gtruth = tf.transpose(mel_gtruth, perm=(1, 0, 2))
        indic_array = tf.concat([
            tf.zeros([self.r, batch_size, OUTPUT_MEL_DIM]),
            time_major_mel_gtruth
        ], axis=0)
        indic_ta = indic_ta.unstack(indic_array)
        # init_context = tf.zeros((batch_size, 256))
        time = tf.constant(0, dtype=tf.int32)
        cond = lambda time, *_: tf.less(time, reduced_time_steps)

        def body(time, output_ta, alpha_ta, alpha_style_ta, weight_ta,
                 state_tup):
            with tf.variable_scope("att-rnn"):
                pre_ed_indic = tf.layers.dropout(
                    tf.layers.dense(indic_ta.read(self.r * time + self.r - 1),
                                    256, tf.nn.relu),
                    training=self.training)
                pre_ed_indic = tf.layers.dropout(
                    tf.layers.dense(pre_ed_indic, 128, tf.nn.relu),
                    training=self.training)
                att_cell_out, att_cell_state = att_cell(
                    tf.concat([pre_ed_indic], axis=-1), state_tup[0])
            with tf.variable_scope("attention"):
                query = att_cell_state[0]  # att_cell_out
                context, alpha = att_module(query)
                alpha_ta = alpha_ta.write(time, alpha)
            with tf.variable_scope("attention_style"):
                context_style, alpha_style = att_module_style(query)
                alpha_style_ta = alpha_style_ta.write(time, alpha_style)
            with tf.variable_scope("weighting"):
                weighting = add_layer(query, query.shape[-1], 1,
                                      'weighting_w', 'weighting_b',
                                      activation_function=tf.nn.sigmoid)
                # weighting = tf.nn.softmax(weighting)
                weight_ta = weight_ta.write(time, weighting)
            with tf.variable_scope("acoustic_module"):
                # The gated, tanh-squashed style context is added on top of
                # the text context.
                weighting_context = context + weighting * tf.nn.tanh(
                    context_style)
                aco_input = tf.layers.dense(
                    tf.concat([att_cell_out, weighting_context], axis=-1),
                    DEC_RNN_SIZE)
                aco_cell_out, aco_cell_state = aco_cell(aco_input,
                                                        state_tup[1])
                dense_out = tf.layers.dense(aco_cell_out,
                                            OUTPUT_MEL_DIM * self.r)
                output_ta = output_ta.write(time, dense_out)
            state_tup = tuple([att_cell_state, aco_cell_state])
            return (tf.add(time, 1), output_ta, alpha_ta, alpha_style_ta,
                    weight_ta, state_tup)

        ### run loop
        _, output_mel_ta, final_alpha_ta, final_alpha_style_ta, \
            final_weight_ta, *_ = tf.while_loop(cond, body, [
                time, output_ta, alpha_ta, alpha_style_ta, weight_ta,
                state_tup
            ])

    ### time major
    with tf.variable_scope("output"):
        output_mel = tf.reshape(output_mel_ta.stack(),
                                shape=(reduced_time_steps, batch_size,
                                       OUTPUT_MEL_DIM * self.r))
        output_mel = tf.reshape(tf.transpose(output_mel, perm=(1, 0, 2)),
                                shape=(batch_size, output_time_steps,
                                       OUTPUT_MEL_DIM))
        self.out_mel = output_mel
        with tf.variable_scope("post-net"):
            output_post = CBHG(8, (256, OUTPUT_MEL_DIM))(
                output_mel, sequence_length=None,
                is_training=self.training, time_major=False)
            output_spec = tf.layers.dense(output_post, OUTPUT_SPEC_DIM)
            self.out_stftm = output_spec
        final_alpha = tf.reshape(final_alpha_ta.stack(),
                                 shape=(reduced_time_steps, batch_size,
                                        input_time_steps))
        final_alpha = tf.transpose(final_alpha, perm=(1, 0, 2))  # batch major
        final_alpha_style = tf.reshape(final_alpha_style_ta.stack(),
                                       shape=(reduced_time_steps, batch_size,
                                              styles_kind))
        final_alpha_style = tf.transpose(final_alpha_style,
                                         perm=(1, 0, 2))  # batch major
        final_weight_ta = tf.reshape(final_weight_ta.stack(),
                                     shape=(reduced_time_steps, batch_size, 1))
        final_weight_ta = tf.transpose(final_weight_ta,
                                       perm=(1, 0, 2))  # batch major
        self.weighting = final_weight_ta

    with tf.variable_scope("loss_and_metric"):
        self.loss_mel = tf.reduce_mean(tf.abs(mel_gtruth - output_mel))
        self.loss_spec = tf.reduce_mean(tf.abs(spec_gtruth - output_spec))
        self.loss = self.loss_mel + self.loss_spec
        self.alpha_img = tf.expand_dims(final_alpha, -1)
        self.alpha_style_img = tf.expand_dims(final_alpha_style, -1)
        self.weight_img = tf.expand_dims(final_weight_ta, -1)
        self.sums = []
        self.sums.append(tf.summary.image("train/alpha", self.alpha_img[:2]))
        self.sums.append(tf.summary.image("train/alpha_style",
                                          self.alpha_style_img[:2]))
        self.sums.append(tf.summary.image("train/weight", self.weight_img[:2]))
        self.sums.append(tf.summary.scalar("train/loss", self.loss))
        self.sums.append(tf.summary.scalar("train/style_0_0",
                                           self.single_style_token[0][0][0]))
        self.sums.append(tf.summary.scalar("train/style_0_100",
                                           self.single_style_token[0][0][100]))
        self.sums.append(tf.summary.scalar("train/style_5_100",
                                           self.single_style_token[0][5][100]))
        self.sums.append(tf.summary.histogram("train/style_vec",
                                              self.single_style_token))
        self.pred_audio_holder = tf.placeholder(shape=(None, None),
                                                dtype=tf.float32,
                                                name='pred_audio')
        self.pred_audio_summary = tf.summary.audio('pred_audio_summary',
                                                   self.pred_audio_holder,
                                                   sample_rate=sr,
                                                   max_outputs=12)
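# Hedged summary-writing sketch (not part of the original source): the
# self.sums list built above can be merged and written once per step. The log
# directory and the `step` argument are illustrative assumptions.
def _example_write_summaries(model, sess, feeds, step):
    merged = tf.summary.merge(model.sums)
    writer = tf.summary.FileWriter('log/train', sess.graph)
    summary, _ = sess.run([merged, model.loss], feed_dict=feeds)
    writer.add_summary(summary, global_step=step)
    writer.close()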
def build(self, inp, inp_mask):
    batch_size = tf.shape(inp)[0]
    input_time_steps = tf.shape(inp)[1]

    ### Encoder [begin]
    with tf.variable_scope("encoder"):
        with tf.variable_scope("embedding"):
            embed_inp = EmbeddingLayer(EMBED_CLASS, EMBED_DIM)(inp)
        with tf.variable_scope("changeToVarible"):
            self.single_style_token = tf.get_variable(
                'style_token', (1, styles_kind, style_dim), dtype=tf.float32)
            self.style_token = tf.tile(self.single_style_token,
                                       (batch_size, 1, 1))
        with tf.variable_scope("pre-net"):
            pre_ed_inp = tf.layers.dropout(
                tf.layers.dense(embed_inp, 256, tf.nn.relu),
                training=self.training)
            pre_ed_inp = tf.layers.dropout(
                tf.layers.dense(pre_ed_inp, 128, tf.nn.relu),
                training=self.training)
        with tf.variable_scope("CBHG"):
            # batch major
            encoder_output = CBHG(16, (128, 128))(pre_ed_inp,
                                                  sequence_length=inp_mask,
                                                  is_training=self.training,
                                                  time_major=False)
    with tf.variable_scope("attention"):
        att_module = AttentionModule(ATT_RNN_SIZE, encoder_output,
                                     sequence_length=inp_mask,
                                     time_major=False)
    with tf.variable_scope("attention_style"):
        att_module_style = AttentionModule(STYLE_ATT_RNN_SIZE,
                                           self.style_token,
                                           time_major=False)
    with tf.variable_scope("decoder"):
        with tf.variable_scope("attentionRnn"):
            att_cell = GRUCell(ATT_RNN_SIZE)
        with tf.variable_scope("acoustic_module"):
            aco_cell = MultiRNNCell(
                [ResidualWrapper(GRUCell(DEC_RNN_SIZE)) for _ in range(2)])
        ### prepare output alpha TensorArray
        # At synthesis time there is no ground truth, so the decoder runs for
        # a fixed MAX_OUT_STEPS frames and feeds its own last frame back in.
        reduced_time_steps = tf.div(MAX_OUT_STEPS, self.r)
        att_cell_state = att_cell.init_state(batch_size, tf.float32)
        aco_cell_state = aco_cell.zero_state(batch_size, tf.float32)
        state_tup = tuple([att_cell_state, aco_cell_state])
        output_ta = tf.TensorArray(size=reduced_time_steps, dtype=tf.float32)
        alpha_ta = tf.TensorArray(size=reduced_time_steps, dtype=tf.float32)
        weight_ta = tf.TensorArray(size=reduced_time_steps, dtype=tf.float32)
        alpha_style_ta = tf.TensorArray(size=reduced_time_steps,
                                        dtype=tf.float32)
        init_indic = tf.zeros([batch_size, OUTPUT_MEL_DIM])
        # init_context = tf.zeros((batch_size, 256))
        time = tf.constant(0, dtype=tf.int32)
        cond = lambda time, *_: tf.less(time, reduced_time_steps)

        def body(time, indic, output_ta, alpha_ta, alpha_style_ta, weight_ta,
                 state_tup):
            with tf.variable_scope("att-rnn"):
                pre_ed_indic = tf.layers.dropout(
                    tf.layers.dense(indic, 256, tf.nn.relu),
                    training=self.training)
                pre_ed_indic = tf.layers.dropout(
                    tf.layers.dense(pre_ed_indic, 128, tf.nn.relu),
                    training=self.training)
                att_cell_out, att_cell_state = att_cell(
                    tf.concat([pre_ed_indic], axis=-1), state_tup[0])
            with tf.variable_scope("attention"):
                query = att_cell_state[0]  # att_cell_out
                context, alpha = att_module(query)
                alpha_ta = alpha_ta.write(time, alpha)
            with tf.variable_scope("attention_style"):
                context_style, alpha_style = att_module_style(query)
                alpha_style_ta = alpha_style_ta.write(time, alpha_style)
            with tf.variable_scope("weighting"):
                weighting = add_layer(query, query.shape[-1], 1,
                                      'weighting_w', 'weighting_b',
                                      activation_function=tf.nn.sigmoid)
                # weighting = tf.nn.softmax(weighting)
                weight_ta = weight_ta.write(time, weighting)
            with tf.variable_scope("acoustic_module"):
                # debug prints
                weighting = tf.Print(weighting, [weighting],
                                     message='weight', summarize=100)
                context_style = tf.Print(context_style,
                                         [context_style[0][0:5]],
                                         message='original_style',
                                         summarize=100)
                context_style = tf.Print(context_style,
                                         [tf.nn.tanh(context_style)[0][0:5]],
                                         message='tanh_style', summarize=100)
                context = tf.Print(context, [context[0][0:5]],
                                   message='context', summarize=100)
                weighting_context = context + weighting * tf.nn.tanh(
                    context_style)
                aco_input = tf.layers.dense(
                    tf.concat([att_cell_out, weighting_context], axis=-1),
                    DEC_RNN_SIZE)
                aco_cell_out, aco_cell_state = aco_cell(aco_input,
                                                        state_tup[1])
                dense_out = tf.reshape(
                    tf.layers.dense(aco_cell_out, OUTPUT_MEL_DIM * self.r),
                    shape=(batch_size, self.r, OUTPUT_MEL_DIM))
                output_ta = output_ta.write(time, dense_out)
                # feed the last generated frame back in as the next input
                new_indic = dense_out[:, -1]
            state_tup = tuple([att_cell_state, aco_cell_state])
            return (tf.add(time, 1), new_indic, output_ta, alpha_ta,
                    alpha_style_ta, weight_ta, state_tup)

        ### run loop
        _, _, output_mel_ta, final_alpha_ta, final_alpha_style_ta, \
            final_weight_ta, *_ = tf.while_loop(cond, body, [
                time, init_indic, output_ta, alpha_ta, alpha_style_ta,
                weight_ta, state_tup
            ])

    ### time major
    with tf.variable_scope("output"):
        output_mel = tf.reshape(output_mel_ta.stack(),
                                shape=(reduced_time_steps, batch_size,
                                       OUTPUT_MEL_DIM * self.r))
        output_mel = tf.reshape(tf.transpose(output_mel, perm=(1, 0, 2)),
                                shape=(batch_size, MAX_OUT_STEPS,
                                       OUTPUT_MEL_DIM))
        self.out_mel = output_mel
        with tf.variable_scope("post-net"):
            output_post = CBHG(8, (256, OUTPUT_MEL_DIM))(
                output_mel, sequence_length=None,
                is_training=self.training, time_major=False)
            output_spec = tf.layers.dense(output_post, OUTPUT_SPEC_DIM)
            self.out_stftm = output_spec
        final_alpha = tf.reshape(final_alpha_ta.stack(),
                                 shape=(reduced_time_steps, batch_size,
                                        input_time_steps))
        self.final_alpha = tf.transpose(final_alpha,
                                        perm=(1, 0, 2))  # batch major
        final_alpha_style = tf.reshape(final_alpha_style_ta.stack(),
                                       shape=(reduced_time_steps, batch_size,
                                              styles_kind))
        self.final_alpha_style = tf.transpose(final_alpha_style,
                                              perm=(1, 0, 2))  # batch major
        final_weight_ta = tf.reshape(final_weight_ta.stack(),
                                     shape=(reduced_time_steps, batch_size, 1))
        self.final_weight_ta = tf.transpose(final_weight_ta,
                                            perm=(1, 0, 2))  # batch major
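# Hedged inference sketch (not part of the original source): fetching the
# generated mel / linear spectrograms together with the attention and gate
# traces exposed above. The placeholder arguments (`inp_ph`, `inp_mask_ph`)
# are hypothetical names for the tensors passed into build().
def _example_fetch_outputs(sess, model, inp_ph, inp_mask_ph, char_ids, lengths):
    out_mel, out_stftm, alpha, alpha_style, gate = sess.run(
        [model.out_mel, model.out_stftm, model.final_alpha,
         model.final_alpha_style, model.final_weight_ta],
        feed_dict={inp_ph: char_ids, inp_mask_ph: lengths})
    # alpha:       (batch, MAX_OUT_STEPS / r, input_time_steps) text attention
    # alpha_style: (batch, MAX_OUT_STEPS / r, styles_kind) style attention
    return out_mel, out_stftm, alpha, alpha_style, gate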
def __init__(self, inp, inp_mask, decode_time_steps, hyper_params=None,
             name='Tacotron'):
    """
    Build the computational graph.
    :param inp: input character ids, shape (batch_size, input_time_steps)
    :param inp_mask: lengths of the input sequences, shape (batch_size,)
    :param decode_time_steps: number of (reduced) decoder steps to run
    :param hyper_params: HyperParams instance; defaults are used when None
    :param name: variable scope name
    """
    super(Tacotron, self).__init__(name)
    self.hyper_params = HyperParams() if hyper_params is None else hyper_params
    with tf.variable_scope(name):
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        batch_size = tf.shape(inp)[0]
        input_time_steps = tf.shape(inp)[1]
        reduc = self.hyper_params.reduction_rate
        output_time_steps = decode_time_steps * reduc

        ### Encoder [begin]
        with tf.variable_scope('character_embedding'):
            embed_inp = EmbeddingLayer(self.hyper_params.embed_class,
                                       self.hyper_params.embed_dim)(inp)
        with tf.variable_scope('encoder_pre_net'):
            pre_ed_inp = tf.layers.dropout(
                tf.layers.dense(embed_inp, 256, tf.nn.relu), training=False)
            pre_ed_inp = tf.layers.dropout(
                tf.layers.dense(pre_ed_inp, 128, tf.nn.relu), training=False)
        encoder_output = modules.cbhg(pre_ed_inp, training=False, k=16,
                                      bank_filters=128,
                                      projection_filters=(128, 128),
                                      highway_layers=4, highway_units=128,
                                      bi_gru_units=128,
                                      sequence_length=inp_mask,
                                      name='encoder_cbhg', reuse=False)
        ### Encoder [end]

        ### Attention Module
        with tf.variable_scope('attention'):
            att_module = AttentionModule(256, encoder_output,
                                         sequence_length=inp_mask,
                                         time_major=False)

        ### Decoder [begin]
        att_cell = GRUCell(256)
        dec_cell = MultiRNNCell(
            [ResidualWrapper(GRUCell(256)) for _ in range(2)])
        with tf.variable_scope('prepare_decode'):
            # prepare the output / attention TensorArrays
            reduced_time_steps = tf.div(output_time_steps, reduc)
            init_att_cell_state = att_cell.zero_state(batch_size, tf.float32)
            init_dec_cell_state = dec_cell.zero_state(batch_size, tf.float32)
            init_state_tup = tuple([init_att_cell_state, init_dec_cell_state])
            init_output_ta = tf.TensorArray(size=reduced_time_steps,
                                            dtype=tf.float32)
            init_alpha_ta = tf.TensorArray(size=reduced_time_steps,
                                           dtype=tf.float32)
            go_array = tf.zeros([batch_size, self.hyper_params.seq2seq_dim],
                                dtype=tf.float32)
            init_context = tf.zeros([batch_size, 256], dtype=tf.float32)
            init_time = tf.constant(0, dtype=tf.int32)
        cond = lambda x, *_: tf.less(x, reduced_time_steps)

        def body(this_time, old_output_ta, old_alpha_ta, old_state_tup,
                 last_context, last_output):
            with tf.variable_scope('decoder_pre_net'):
                dec_pre_ed_inp = last_output
                dec_pre_ed_inp = tf.layers.dropout(
                    tf.layers.dense(dec_pre_ed_inp, 256, tf.nn.relu),
                    training=False)
                dec_pre_ed_inp = tf.layers.dropout(
                    tf.layers.dense(dec_pre_ed_inp, 128, tf.nn.relu),
                    training=False)
            with tf.variable_scope('attention_rnn'):
                att_cell_inp = tf.concat([last_context, dec_pre_ed_inp],
                                         axis=-1)
                att_cell_out, att_cell_state = att_cell(att_cell_inp,
                                                        old_state_tup[0])
            with tf.variable_scope('attention'):
                query = att_cell_state[0]
                context, alpha = att_module(query)
                new_alpha_ta = old_alpha_ta.write(this_time, alpha)
            with tf.variable_scope('decoder_rnn'):
                dec_input = tf.layers.dense(
                    tf.concat([att_cell_out, context], axis=-1), 256)
                dec_cell_out, dec_cell_state = dec_cell(dec_input,
                                                        old_state_tup[1])
                dense_out = tf.layers.dense(
                    dec_cell_out, self.hyper_params.seq2seq_dim * reduc)
                new_output_ta = old_output_ta.write(this_time, dense_out)
                # feed the last frame of this reduced step back in as the
                # next decoder input
                new_output = dense_out[:, -self.hyper_params.seq2seq_dim:]
            new_state_tup = tuple([att_cell_state, dec_cell_state])
            return (tf.add(this_time, 1), new_output_ta, new_alpha_ta,
                    new_state_tup, context, new_output)

        # run loop
        _, seq2seq_output_ta, alpha_ta, *_ = tf.while_loop(cond, body, [
            init_time, init_output_ta, init_alpha_ta, init_state_tup,
            init_context, go_array
        ])

        with tf.variable_scope('reshape_decode'):
            seq2seq_output = tf.reshape(
                seq2seq_output_ta.stack(),
                shape=(reduced_time_steps, batch_size,
                       self.hyper_params.seq2seq_dim * reduc))
            seq2seq_output = tf.reshape(
                tf.transpose(seq2seq_output, perm=(1, 0, 2)),
                shape=(batch_size, output_time_steps,
                       self.hyper_params.seq2seq_dim))
            self.seq2seq_output = seq2seq_output
            alpha_output = tf.reshape(
                alpha_ta.stack(),
                shape=(reduced_time_steps, batch_size, input_time_steps))
            alpha_output = tf.expand_dims(
                tf.transpose(alpha_output, perm=(1, 0, 2)), -1)
            self.alpha_output = alpha_output
        ### Decoder [end]

        ### PostNet [begin]
        post_output = modules.cbhg(
            seq2seq_output, training=False, k=8, bank_filters=128,
            projection_filters=(256, self.hyper_params.seq2seq_dim),
            highway_layers=4, highway_units=128, bi_gru_units=128,
            sequence_length=None, name='decoder_cbhg', reuse=False)
        post_output = tf.layers.dense(post_output,
                                      self.hyper_params.post_dim,
                                      name='post_linear_transform')
        self.post_output = post_output
        ### PostNet [end]
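# Illustrative note (not part of the original source): with the output
# reduction trick, each while_loop iteration emits reduction_rate frames, so
# the graph above produces decode_time_steps * reduction_rate output frames.
# The concrete numbers below are assumed values, for a quick sanity check.
def _example_output_shape():
    decode_time_steps = 100
    reduc = 5  # assumed value of hyper_params.reduction_rate
    output_time_steps = decode_time_steps * reduc
    assert output_time_steps == 500
    # seq2seq_output: (batch_size, 500, seq2seq_dim)
    # post_output:    (batch_size, 500, post_dim)
    return output_time_steps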