a.append( np.concatenate([sample[:-1, :5], [[sample[-1][0]]] * (long - 1)], axis=-1)) b.append(sample[:-1, 5:10]) c.append(sample[-1][1]) return a, b, c x = tf.placeholder(shape=[batch_size, long - 1, 6], dtype=tf.float16) y = tf.placeholder(shape=[batch_size, long - 1, 5], dtype=tf.float16) z_ = tf.placeholder(shape=[batch_size], dtype=tf.float16) X = tf.nn.sigmoid(x) - 0.5 Y = tf.nn.sigmoid(y) - 0.5 gru_x = GRUCell(num_units=8, reuse=tf.AUTO_REUSE, activation=tf.nn.elu) state_x = gru_x.zero_state(batch_size, dtype=tf.float16) with tf.variable_scope('RNN_x'): for timestep in range(long - 1): if timestep == 1: tf.get_variable_scope().reuse_variables() (cell_output_x, state_x) = gru_x(X[:, timestep], state_x) out_put_x = state_x gru_y = GRUCell(num_units=8, reuse=tf.AUTO_REUSE, activation=tf.nn.elu) state_y = gru_y.zero_state(batch_size, dtype=tf.float16) with tf.variable_scope('RNN_y'): for timestep in range(long - 1): # be careful if timestep == 1: tf.get_variable_scope().reuse_variables() (cell_output_y, state_y) = gru_y(Y[:, timestep], state_y)
def __init__(self, user_count, item_count, batch_size):
    """Assemble the full TF1 graph: inputs, embeddings, logits, loss, train op.

    Args:
        user_count: row count of the user embedding and user bias tables.
        item_count: row count of the item embedding and item bias tables.
        batch_size: static batch dimension used by every placeholder.

    Attributes created on ``self``: placeholders ``u``, ``i``, ``y``,
    ``hist``, ``lr``; tensors ``center``, ``unexp``, ``logits``, ``score``,
    ``loss``; step bookkeeping ``global_step``, ``global_epoch_step``,
    ``global_epoch_step_op``; and ``opt`` / ``train_op``.
    """
    hidden_size = 128
    long_memory_window = 10
    short_memory_window = 3
    embedding_dim = hidden_size // 2
    # First history column that belongs to the short-term window.
    short_start = long_memory_window - short_memory_window

    # ---- Inputs ---------------------------------------------------------
    self.u = tf.placeholder(tf.int32, [batch_size])  # [B]
    self.i = tf.placeholder(tf.int32, [batch_size])  # [B]
    self.y = tf.placeholder(tf.float32, [batch_size])  # [B]
    self.hist = tf.placeholder(tf.int32,
                               [batch_size, long_memory_window])  # [B, T]
    self.lr = tf.placeholder(tf.float64, [])

    # ---- Trainable tables (creation order kept: it fixes variable names) -
    user_table = tf.get_variable("user_emb_w", [user_count, embedding_dim])
    item_table = tf.get_variable("item_emb_w", [item_count, embedding_dim])
    user_bias_table = tf.get_variable(
        "user_b", [user_count], initializer=tf.constant_initializer(0.0))
    item_bias_table = tf.get_variable(
        "item_b", [item_count], initializer=tf.constant_initializer(0.0))

    def history_items(start, width):
        """Embed `width` history columns beginning at column `start`."""
        window = tf.slice(self.hist, [0, start], [batch_size, width])
        return tf.nn.embedding_lookup(item_table, window)

    def tiled_user(width):
        """Repeat the user embedding `width` times along a new axis 1."""
        user_vec = tf.nn.embedding_lookup(user_table, self.u)
        return tf.tile(tf.expand_dims(user_vec, 1), [1, width, 1])

    def distance_to_target(vectors):
        """Euclidean distance between `vectors` and the target item."""
        target = tf.nn.embedding_lookup(item_table, self.i)
        return tf.norm(vectors - target, ord='euclidean', axis=1)

    # Target item embedding followed by user embedding: [B, hidden_size].
    target_item_vec = tf.nn.embedding_lookup(item_table, self.i)
    target_user_vec = tf.nn.embedding_lookup(user_table, self.u)
    item_emb = tf.concat([target_item_vec, target_user_vec], axis=1)
    item_bias = tf.gather(item_bias_table, self.i)
    user_bias = tf.gather(user_bias_table, self.u)

    # History item embeddings with the user embedding appended per step.
    h_emb = tf.concat(
        [history_items(0, long_memory_window),
         tiled_user(long_memory_window)],
        axis=2)
    unexp_emb = tf.concat(
        [history_items(short_start, short_memory_window),
         tiled_user(short_memory_window)],
        axis=2)
    # Item-only views of the long and short history windows.
    h_long_emb = history_items(0, long_memory_window)
    h_short_emb = history_items(short_start, short_memory_window)

    # ---- Long-term user preference ---------------------------------------
    long_output, _ = tf.nn.dynamic_rnn(GRUCell(hidden_size),
                                       inputs=h_emb,
                                       dtype=tf.float32)
    long_preference, _ = self.seq_attention(long_output, hidden_size,
                                            long_memory_window)
    # NOTE(review): 0.1 is passed positionally; in TF 1.x that slot is
    # keep_prob, and dropout is applied unconditionally — confirm intent.
    long_preference = tf.nn.dropout(long_preference, 0.1)

    # ---- Preference score from long-term preference + target embedding ---
    preference = tf.concat([long_preference, item_emb], axis=1)
    preference = tf.layers.batch_normalization(inputs=preference)
    dense_stack = (
        (80, tf.nn.sigmoid, 'f1'),
        (40, tf.nn.sigmoid, 'f2'),
        (1, None, 'f3'),
    )
    for units, layer_activation, layer_name in dense_stack:
        preference = tf.layers.dense(preference,
                                     units,
                                     activation=layer_activation,
                                     name=layer_name)
    preference = tf.reshape(preference, [-1])

    # ---- Personalized & contextualized unexpected factor ------------------
    # NOTE(review): the length list repeats long_memory_window although
    # unexp_emb only spans short_memory_window steps — verify against
    # unexp_attention.
    attention_lengths = [long_memory_window] * batch_size
    unexp_factor = self.unexp_attention(item_emb, unexp_emb,
                                        attention_lengths)
    unexp_factor = tf.layers.batch_normalization(inputs=unexp_factor)
    unexp_factor = tf.reshape(unexp_factor, [-1, hidden_size])
    unexp_factor = tf.layers.dense(unexp_factor, hidden_size)
    unexp_factor = tf.layers.dense(unexp_factor, 1, activation=None)
    unexp_factor = tf.reshape(unexp_factor, [-1])

    # ---- Unexpectedness (with clustering of user interests) ---------------
    self.center = self.mean_shift(h_long_emb)
    raw_unexp = distance_to_target(tf.reduce_mean(self.center, axis=1))
    self.unexp = raw_unexp
    # Unexpected activation function d * exp(-d); treated as a constant
    # during back-propagation.
    unexp = tf.stop_gradient(tf.exp(-1.0 * raw_unexp) * raw_unexp)

    # Relevance and annoyance/diversification are built for future
    # exploration only; neither feeds the logits below.
    relevance = distance_to_target(tf.reduce_mean(h_long_emb, axis=1))
    annoyance = distance_to_target(tf.reduce_mean(h_short_emb, axis=1))

    # ---- Final estimate of user preference --------------------------------
    self.logits = item_bias + preference + user_bias + unexp_factor * unexp  # [B]
    self.score = tf.sigmoid(self.logits)

    # ---- Step counters ----------------------------------------------------
    self.global_step = tf.Variable(0, trainable=False, name='global_step')
    self.global_epoch_step = tf.Variable(0,
                                         trainable=False,
                                         name='global_epoch_step')
    self.global_epoch_step_op = tf.assign(self.global_epoch_step,
                                          self.global_epoch_step + 1)

    # ---- Loss and optimisation (global-norm clipping at 1) ----------------
    per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=self.logits, labels=self.y)
    self.loss = tf.reduce_mean(per_example_loss)

    trainable_params = tf.trainable_variables()
    self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.lr)
    raw_gradients = tf.gradients(self.loss, trainable_params)
    clipped_gradients, _ = tf.clip_by_global_norm(raw_gradients, 1)
    self.train_op = self.opt.apply_gradients(
        zip(clipped_gradients, trainable_params),
        global_step=self.global_step)
def main(model, T, n_iter, n_batch, n_hidden, capacity, comp, FFT,
         learning_rate, norm, update_gate, activation, lambd, layer_norm,
         zoneout, visualization_experiment):
    """Build, train, validate and test a recurrent model on the recall task.

    Args:
        model: one of "LSTM", "GRU", "RUM", "EUNN", "GORU", "RNN".
        T: task length; fixes the input vocabulary (``int(T / 2) + 11``)
            and the number of time steps (``T + 3``).
        n_iter: number of training iterations.
        n_batch: mini-batch size handed to ``next_batch``.
        n_hidden: hidden size of the recurrent cell.
        capacity, comp, FFT: forwarded to ``EUNNCell`` / ``GORUCell``.
        learning_rate: Adam learning rate (converted with ``float``).
        norm: forwarded to ``RUMCell`` as ``eta_``; also part of the run name.
        update_gate, lambd, layer_norm, zoneout: forwarded to ``RUMCell``.
        activation: "relu", "sigmoid", "tanh" or "softsign" (RUM only).
        visualization_experiment: when truthy, the first loop iteration
            evaluates the loss on generated weight points, saves the
            results and terminates the process via ``exit()``.

    Raises:
        ValueError: for an unknown ``model``, or an unknown ``activation``
            when ``model == "RUM"``.

    Side effects: creates the run directory, writes ``eval.txt``, TensorBoard
    summaries and checkpoints below it.
    """
    learning_rate = float(learning_rate)

    # data params
    n_input = int(T / 2) + 10 + 1
    n_output = 10
    n_train = 100000
    n_valid = 10000
    n_test = 20000
    n_steps = T + 3
    n_classes = 10

    # graph and gradients
    x = tf.placeholder("int32", [None, n_steps])
    y = tf.placeholder("int64", [None])
    input_data = tf.one_hot(x, n_input, dtype=tf.float32)

    # input to hidden
    if model == "LSTM":
        cell = BasicLSTMCell(n_hidden, state_is_tuple=True, forget_bias=1)
    elif model == "GRU":
        cell = GRUCell(n_hidden,
                       kernel_initializer=tf.orthogonal_initializer())
    elif model == "RUM":
        activations = {
            "relu": tf.nn.relu,
            "sigmoid": tf.nn.sigmoid,
            "tanh": tf.nn.tanh,
            "softsign": tf.nn.softsign,
        }
        if activation not in activations:
            # Previously this surfaced later as an UnboundLocalError on `act`.
            raise ValueError(
                "unknown activation for RUM: {!r}".format(activation))
        act = activations[activation]
        # The weight placeholders only exist in visualization mode.
        temp_target = temp_target_bias = temp_embed = None
        if visualization_experiment:
            temp_target = tf.placeholder("float32",
                                         [n_hidden + n_input, n_hidden])
            temp_target_bias = tf.placeholder("float32", [n_hidden])
            temp_embed = tf.placeholder("float32", [n_input, n_hidden])
        cell = RUMCell(n_hidden,
                       eta_=norm,
                       update_gate=update_gate,
                       lambda_=lambd,
                       activation=act,
                       use_layer_norm=layer_norm,
                       use_zoneout=zoneout,
                       visualization=visualization_experiment,
                       temp_target=temp_target,
                       temp_target_bias=temp_target_bias,
                       temp_embed=temp_embed)
    elif model == "EUNN":
        cell = EUNNCell(n_hidden, capacity, FFT, comp)
    elif model == "GORU":
        # Bind both names up front so the non-visualization path no longer
        # dies on an unbound local; None mirrors what the RUM branch passes.
        # NOTE(review): confirm GORUCell treats None as "no override".
        temp_theta0 = temp_theta1 = None
        if visualization_experiment:
            temp_theta0 = tf.placeholder("float32", [n_hidden // 2])
            temp_theta1 = tf.placeholder("float32", [n_hidden // 2 - 1])
        cell = GORUCell(n_hidden,
                        capacity,
                        FFT,
                        temp_theta0=temp_theta0,
                        temp_theta1=temp_theta1)
    elif model == "RNN":
        cell = BasicRNNCell(n_hidden)
    else:
        raise ValueError("unknown model: {!r}".format(model))
    hidden_out, _ = tf.nn.dynamic_rnn(cell, input_data, dtype=tf.float32)

    # hidden to output
    V_init_val = np.sqrt(6.) / np.sqrt(n_output + n_input)
    V_weights = tf.get_variable("V_weights",
                                shape=[n_hidden, n_classes],
                                dtype=tf.float32,
                                initializer=tf.random_uniform_initializer(
                                    -V_init_val, V_init_val))
    V_bias = tf.get_variable("V_bias",
                             shape=[n_classes],
                             dtype=tf.float32,
                             initializer=tf.constant_initializer(0.01))

    # Only the last time step is classified.
    hidden_out = tf.unstack(hidden_out, axis=1)[-1]
    temp_out = tf.matmul(hidden_out, V_weights)
    output_data = tf.nn.bias_add(temp_out, V_bias)

    # evaluate process
    cost = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=output_data,
                                                       labels=y))
    tf.summary.scalar('cost', cost)
    correct_pred = tf.equal(tf.argmax(output_data, 1), y)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    # initialization
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(cost)
    init = tf.global_variables_initializer()

    # save
    # Fix: the "E" tag used an undefined name `eta`; the value passed to the
    # cell as eta_ is `norm`, which is also the condition guarding the tag.
    filename = (model + "_H" + str(n_hidden) + "_" +
                ("L" + str(lambd) + "_" if lambd else "") +
                ("E" + str(norm) + "_" if norm else "") +
                ("A" + activation + "_" if activation else "") +
                ("U_" if update_gate else "") +
                ("Z_" if zoneout and model == "RUM" else "") +
                ("ln_" if layer_norm and model == "RUM" else "") +
                (str(capacity) if model in ["EUNN", "GORU"] else "") +
                ("FFT_" if model in ["EUNN", "GORU"] and FFT else "") +
                "B" + str(n_batch))
    save_path = os.path.join('../../train_log', 'recall', 'T' + str(T),
                             filename)

    file_manager(save_path)

    # what follows is task specific
    filepath = os.path.join(save_path, "eval.txt")
    if not os.path.exists(os.path.dirname(filepath)):
        try:
            os.makedirs(os.path.dirname(filepath))
        except OSError as exc:
            # Another process may have created the directory in between.
            if exc.errno != errno.EEXIST:
                raise

    # NOTE(review): `kwargs` is not a parameter of this function; it is
    # presumably a module-level dict of the CLI arguments — confirm.
    log(kwargs, save_path)

    merged_summary = tf.summary.merge_all()
    parameters_profiler()

    # train
    saver = tf.train.Saver()
    step = 0

    train_x, train_y = recall_data(T, n_train)
    val_x, val_y = recall_data(T, n_valid)
    test_x, test_y = recall_data(T, n_test)

    # The context manager closes eval.txt on every exit path, including
    # exceptions and the exit() that ends the visualization experiment.
    with open(filepath, 'w') as f, tf.Session() as sess:
        f.write(col("validation \n", 'r'))

        sess.run(init)
        train_writer = tf.summary.FileWriter(save_path, sess.graph)

        while step < n_iter:
            batch_x, batch_y = next_batch(train_x, train_y, step, n_batch)

            if visualization_experiment:
                # initiative to write simpler code
                if model == "RUM":
                    number_of_weights = (n_hidden + n_input) * \
                        n_hidden + n_hidden + n_input * n_hidden
                elif model in ["GORU", "EUNN"]:
                    # assuming that n_hidden is even.
                    number_of_weights = n_hidden - 1

                print(col("strating linear visualization", 'b'))
                num_points = 200
                coord, weights = generate_points_for_visualization(
                    number_of_weights, num_points)
                processed_placeholders = process_vis(weights,
                                                     num_points,
                                                     n_hidden=n_hidden,
                                                     cell=model)
                if model == "RUM":
                    feed_temp_target, feed_temp_target_bias, feed_temp_embed = processed_placeholders
                else:
                    feed_temp_theta0, feed_temp_theta1 = processed_placeholders

                collect_losses = []
                for i in range(num_points):
                    if model == "RUM":
                        loss = sess.run(cost,
                                        feed_dict={
                                            x: batch_x,
                                            y: batch_y,
                                            temp_target: feed_temp_target[i],
                                            temp_target_bias:
                                            feed_temp_target_bias[i],
                                            temp_embed: feed_temp_embed[i]
                                        })
                    elif model in ["EUNN", "GORU"]:
                        # NOTE(review): the theta placeholders are only
                        # created in the GORU branch above; EUNN would fail
                        # here on an unbound name.
                        loss = sess.run(cost,
                                        feed_dict={
                                            x: batch_x,
                                            y: batch_y,
                                            temp_theta0: feed_temp_theta0[i],
                                            temp_theta1: feed_temp_theta1[i]
                                        })
                    print(
                        col("iter: " + str(i) + " loss: " + str(loss), 'y'))
                    collect_losses.append(loss)
                np.save(os.path.join(save_path, "linear_height"),
                        np.array(collect_losses))
                np.save(os.path.join(save_path, "linear_coord"),
                        np.array(coord))
                print(col("done with linear visualization", 'b'))

                #####################

                print(col("strating contour visualization", 'b'))
                num_points = 20
                coord, weights = generate_points_for_visualization(
                    number_of_weights, num_points, type_vis="contour")
                np.save(os.path.join(save_path, "contour_coord"),
                        np.array(coord))
                processed_placeholders = process_vis(weights,
                                                     num_points**2,
                                                     n_hidden=n_hidden,
                                                     cell=model)
                if model == "RUM":
                    feed_temp_target, feed_temp_target_bias, feed_temp_embed = processed_placeholders
                else:
                    feed_temp_theta0, feed_temp_theta1 = processed_placeholders

                collect_contour = np.empty((num_points, num_points))
                for i in range(num_points):
                    for j in range(num_points):
                        # Points are laid out row-major in the flat feeds.
                        flat = i * num_points + j
                        if model == "RUM":
                            loss = sess.run(
                                cost,
                                feed_dict={
                                    x: batch_x,
                                    y: batch_y,
                                    temp_target: feed_temp_target[flat],
                                    temp_target_bias:
                                    feed_temp_target_bias[flat],
                                    temp_embed: feed_temp_embed[flat]
                                })
                        elif model in ["GORU", "EUNN"]:
                            loss = sess.run(
                                cost,
                                feed_dict={
                                    x: batch_x,
                                    y: batch_y,
                                    temp_theta0: feed_temp_theta0[flat],
                                    temp_theta1: feed_temp_theta1[flat]
                                })
                        collect_contour[i, j] = loss
                        print(
                            col(
                                "iter: " + str(i) + "," + str(j) +
                                " loss: " + str(loss), 'y'))
                np.save(os.path.join(save_path, "contour_height"),
                        np.array(collect_contour))
                print(col("exiting visualization experiment", 'r'))
                exit()

            # Metrics are measured before the parameter update of this step.
            acc, loss = sess.run([accuracy, cost],
                                 feed_dict={
                                     x: batch_x,
                                     y: batch_y
                                 })
            print(
                col(
                    "Iter " + str(step) + ", Minibatch Loss= " +
                    "{:.6f}".format(loss) + ", Training Accuracy= " +
                    "{:.5f}".format(acc), 'g'))
            sess.run(optimizer, feed_dict={x: batch_x, y: batch_y})

            if step % 1000 == 0:
                # One forward pass yields summary, accuracy and loss.
                summ, acc, loss = sess.run([merged_summary, accuracy, cost],
                                           feed_dict={
                                               x: val_x,
                                               y: val_y
                                           })
                train_writer.add_summary(summ, step)
                print("Validation Loss= " + "{:.6f}".format(loss) +
                      ", Validation Accuracy= " + "{:.5f}".format(acc))
                f.write(col("%d\t%f\t%f\n" % (step, loss, acc), 'y'))
                # Fix: the original referenced f.flush without calling it.
                f.flush()

            if step % 1000 == 1:
                print(col("saving graph and metadata in " + save_path, "b"))
                saver.save(sess, os.path.join(save_path, "model"))

            step += 1

        print(col("Optimization Finished!", 'b'))

        # test
        test_acc, test_loss = sess.run([accuracy, cost],
                                       feed_dict={
                                           x: test_x,
                                           y: test_y
                                       })
        f.write(
            col(
                "Test result: Loss= " + "{:.6f}".format(test_loss) +
                ", Accuracy= " + "{:.5f}".format(test_acc), 'g'))
def initialize(self,
               inputs,
               input_lengths,
               lpc_targets=None,
               stop_token_targets=None,
               is_training=True):
    '''Builds the Tacotron graph under the "inference" variable scope.

    Stores "inputs", "input_lengths", "lpc_outputs", "stop_token_outputs",
    "alignments", "lpc_targets" and "stop_token_targets" on the instance.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in
        is number of steps in the input time series, and values are
        character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and
        values are the lengths of each sequence in inputs.
      lpc_targets: float32 Tensor with shape [N, T_out, M], where M is the
        feature dim. Handed to the training helper when is_training is true.
      stop_token_targets: stored on the instance unchanged; not otherwise
        used while building the graph here.
      is_training: Python truth value; selects the training helper over the
        test helper and is forwarded to the prenet, encoder, stop projection
        and decoder wrapper.
    '''
    with tf.variable_scope('inference'):
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams
        frames_per_step = hp.outputs_per_step

        # Character embeddings: [N, T_in, embed_depth]
        char_table = tf.get_variable(
            'embedding', [len(symbols), hp.embed_depth],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(char_table, inputs)

        # Encoder: prenet -> CBHG
        prenet_outputs = prenet(embedded_inputs, is_training,
                                hp.prenet_depths)
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths,
                                       is_training, hp.encoder_depth)

        # Location sensitive attention over the encoder outputs
        attention = LocationSensitiveAttention(hp.attention_depth,
                                               encoder_outputs)

        # Decoder RNN stack: two residual GRU layers, bottom to top
        gru_layers = [
            ResidualWrapper(GRUCell(hp.decoder_depth)) for _ in range(2)
        ]
        rnn_stack = MultiRNNCell(gru_layers, state_is_tuple=True)

        # Output heads: r frames and r stop tokens per decoder step
        frame_head = FrameProjection(hp.num_lpcs * frames_per_step)
        stop_head = StopProjection(is_training, shape=frames_per_step)

        decoder_cell = TacotronDecoderWrapper(is_training, attention,
                                              rnn_stack, frame_head,
                                              stop_head)

        # Training feeds the targets to the helper; otherwise the test
        # helper is built from the batch size alone.
        helper = (TacoTrainingHelper(inputs, lpc_targets, hp.num_lpcs,
                                     frames_per_step) if is_training else
                  TacoTestHelper(batch_size, hp.num_lpcs, frames_per_step))

        initial_state = decoder_cell.zero_state(batch_size=batch_size,
                                                dtype=tf.float32)
        decoder = CustomDecoder(decoder_cell, helper, initial_state)
        (decoder_outputs, raw_stop_tokens,
         _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
             decoder, maximum_iterations=hp.max_iters)

        # Unfold the r-frames-per-step layout into one frame per entry.
        lpc_outputs = tf.reshape(decoder_outputs,
                                 [batch_size, -1, hp.num_lpcs])
        stop_token_outputs = tf.reshape(raw_stop_tokens, [batch_size, -1])

        # Alignment history is stacked along its first axis; move that
        # axis to the end.
        alignment_stack = final_decoder_state.alignment_history.stack()
        alignments = tf.transpose(alignment_stack, [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.stop_token_outputs = stop_token_outputs
        self.alignments = alignments
        self.lpc_outputs = lpc_outputs
        self.lpc_targets = lpc_targets
        self.stop_token_targets = stop_token_targets

        log('Initialized Tacotron model. Dimensions: ')
        dimension_report = (
            (' embedding: {}', embedded_inputs),
            (' prenet out: {}', prenet_outputs),
            (' encoder out: {}', encoder_outputs),
            (' decoder out (r frames): {}', decoder_outputs),
            (' decoder out (1 frame): {}', lpc_outputs),
            (' stop token: {}', stop_token_outputs),
        )
        for template, tensor in dimension_report:
            log(template.format(tensor.shape))
def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
):
    """Build the (multi-speaker) Tacotron graph under the 'inference' scope.

    Args:
        inputs: character-id tensor; its first dimension is the batch size.
        input_lengths: per-example lengths, used by the encoder and as
            memory_sequence_length for most attention mechanisms.
        num_speakers: number of speakers; values > 1 enable the speaker
            conditioning selected by ``hp.model_type``.
        speaker_id: speaker index tensor used for the speaker embeddings.
        mel_targets: handed to the training helper.
        linear_targets: when not None, the training helper is used.
        loss_coeff: stored on the instance unchanged.
        rnn_decoder_test_mode: forwarded to the training helper; its
            negation is the ``is_training`` flag of prenet/CBHG layers.
        is_randomly_initialized: stored on the instance unchanged.

    Raises:
        Exception: unknown ``hp.model_type`` (multi-speaker), unknown
            ``hp.attention_type``, a 'simple' model configured with
            ``speaker_embedding_size == 1``, or a speaker-dependent decoder
            init state whose shape differs from the zero state.

    Sets ``mel_outputs``, ``linear_outputs``, ``alignments``,
    ``final_decoder_state`` plus the stored inputs/targets on ``self``.
    """
    # Original author's note: this is also True at test time — intended???
    is_training2 = linear_targets is not None
    is_training = not rnn_decoder_test_mode

    self.is_randomly_initialized = is_randomly_initialized

    with tf.variable_scope('inference'):
        hp = self._hparams
        batch_size = tf.shape(inputs)[0]

        # Embeddings(256)
        char_embed_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))

        zero_pad = True
        if zero_pad:
            # Logic borrowed from a Transformer implementation: the
            # embedding of <PAD> (id 0) is pinned to zero and never
            # trained, so row 0 of the variable created above is unused.
            char_embed_table = tf.concat(
                (tf.zeros(shape=[1, hp.embedding_size]),
                 char_embed_table[1:, :]), 0)

        # [N, T_in, embedding_size]
        char_embedded_inputs = tf.nn.embedding_lookup(
            char_embed_table, inputs)

        self.num_speakers = num_speakers
        if self.num_speakers > 1:
            if hp.speaker_embedding_size != 1:  # speaker_embedding_size = f(16)
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(
                        stddev=0.5))
                # [N, T_in, speaker_embedding_size]
                speaker_embed = tf.nn.embedding_lookup(
                    speaker_embed_table, speaker_id)

            if hp.model_type == 'deepvoice':
                if hp.speaker_embedding_size == 1:
                    # One dedicated embedding table per injection site.
                    before_highway = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_prenet_sizes[-1], "before_highway"
                    )  # 'enc_prenet_sizes': [f(256), f(128)]
                    encoder_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_rnn_size * 2, "encoder_rnn_init_state")
                    attention_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.attention_state_size,
                        "attention_rnn_init_state")
                    decoder_rnn_init_states = [
                        get_embed(
                            speaker_id, self.num_speakers,
                            hp.dec_rnn_size,
                            "decoder_rnn_init_states{}".format(idx + 1))
                        for idx in range(hp.dec_layer_num)
                    ]
                else:

                    def deep_dense(x, dim):
                        # softsign: x / (abs(x) + 1)
                        return tf.layers.dense(x,
                                               dim,
                                               activation=tf.nn.softsign)

                    before_highway = deep_dense(speaker_embed,
                                                hp.enc_prenet_sizes[-1])
                    encoder_rnn_init_state = deep_dense(
                        speaker_embed, hp.enc_rnn_size * 2)
                    attention_rnn_init_state = deep_dense(
                        speaker_embed, hp.attention_state_size)
                    decoder_rnn_init_states = [
                        deep_dense(speaker_embed, hp.dec_rnn_size)
                        for _ in range(hp.dec_layer_num)
                    ]

                speaker_embed = None  # deepvoice does not use speaker_embed directly
            elif hp.model_type == 'simple':
                # The simple model passes speaker_embed to
                # DecoderPrenetWrapper and ConcatOutputAndAttentionWrapper,
                # each of which concatenates it.
                if hp.speaker_embedding_size == 1:
                    # speaker_embed is only created for sizes != 1; fail
                    # here instead of with an unbound local further down.
                    raise Exception(
                        " [!] model_type 'simple' requires "
                        "speaker_embedding_size != 1")
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None
            else:
                raise Exception(
                    " [!] Unkown multi-speaker model type: {}".format(
                        hp.model_type))
        else:
            # Single-speaker case: no speaker conditioning at all.
            speaker_embed = None
            before_highway = None
            encoder_rnn_init_state = None  # init state of the bidirectional GRU
            attention_rnn_init_state = None
            decoder_rnn_init_states = None

        ##############
        # Encoder
        ##############

        # [N, T_in, enc_prenet_sizes[-1]]
        prenet_outputs = prenet(
            char_embedded_inputs,
            is_training,
            hp.enc_prenet_sizes,
            hp.dropout_prob,
            scope='prenet'
        )  # 'enc_prenet_sizes': [f(256), f(128)], dropout_prob = 0.5
        # ==> (N, T_in, 128)

        # enc_rnn_size = 128
        encoder_outputs = cbhg(
            prenet_outputs,
            input_lengths,
            is_training,
            hp.enc_bank_size,
            hp.enc_bank_channel_size,
            hp.enc_maxpool_width,
            hp.enc_highway_depth,
            hp.enc_rnn_size,
            hp.enc_proj_sizes,
            hp.enc_proj_width,
            scope="encoder_cbhg",
            before_highway=before_highway,
            encoder_rnn_init_state=encoder_rnn_init_state)

        ##############
        # Attention
        ##############

        # For manual control of attention
        self.is_manual_attention = tf.placeholder(
            tf.bool,
            shape=(),
            name='is_manual_attention',
        )
        self.manual_alignments = tf.placeholder(
            tf.float32,
            shape=[None, None, None],
            name="manual_alignments",
        )

        # single: attention_size = 128
        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths,
                normalize=False)
        elif hp.attention_type == 'bah_mon_norm':  # added by hccho
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths,
                normalize=True)
        elif hp.attention_type == 'loc_sen':  # Location Sensitivity Attention
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'gmm':  # GMM Attention
            attention_mechanism = GmmAttention(
                hp.attention_size,
                memory=encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah_mon_norm_hccho':
            attention_mechanism = BahdanauMonotonicAttention_hccho(
                hp.attention_size, encoder_outputs, normalize=True)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths,
                normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths,
                scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(
                hp.attention_size,
                encoder_outputs,
                memory_sequence_length=input_lengths)
        else:
            raise Exception(" [!] Unkown attention type: {}".format(
                hp.attention_type))

        # Combine a GRU cell with attention_mechanism into an
        # AttentionWrapper. Original note: carpedm20 re-implemented
        # AttentionWrapper from the TensorFlow source, whereas Keith Ito
        # used TensorFlow's AttentionWrapper as is.
        # Original note: with output_attention=False and no
        # attention_layer_size given, attention equals the context vector.
        # attention_state_size = 256
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_state_size),
            attention_mechanism,
            self.is_manual_attention,
            self.manual_alignments,
            initial_cell_state=attention_rnn_init_state,
            alignment_history=True,
            output_attention=False)

        # dec_prenet_sizes = [f(256), f(128)]
        dec_prenet_outputs = DecoderPrenetWrapper(attention_cell,
                                                  speaker_embed,
                                                  is_training,
                                                  hp.dec_prenet_sizes,
                                                  hp.dropout_prob)

        # Concatenate attention context vector and RNN cell output into a
        # 512D vector: [N, T_in, attention_size+attention_state_size].
        # Original note: the wrapper concatenates output (== cell_state),
        # attention and speaker_embed into the new output.
        concat_cell = ConcatOutputAndAttentionWrapper(
            dec_prenet_outputs, embed_to_concat=speaker_embed)

        # Decoder (layers specified bottom to top): dec_rnn_size = 256
        # Original note: OutputProjectionWrapper does not seem to be
        # mentioned in the paper.
        cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)]
        for _ in range(hp.dec_layer_num):  # hp.dec_layer_num = 2
            cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

        # [N, T_in, 256]
        decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

        # Project onto r mel spectrograms (predict r outputs at each RNN
        # step). Original idea: could be extended to also emit a stop
        # token via (hp.num_mels + 1) * hp.reduction_factor.
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.num_mels * hp.reduction_factor)
        # Original note: this zero state already contains the initial cell
        # state given to AttentionWrapper above.
        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        # Fix: with a single speaker decoder_rnn_init_states is None, so a
        # 'deepvoice' model_type used to crash on enumerate(None); in that
        # case the plain zero state is kept.
        if hp.model_type == "deepvoice" and decoder_rnn_init_states is not None:
            # decoder_init_state[0] : AttentionWrapperState
            # = cell_state + attention + time + alignments + alignment_history
            # decoder_init_state[0][0] = attention_rnn_init_state
            # (already applied through AttentionWrapper's initial_cell_state)
            decoder_init_state = list(decoder_init_state)

            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                shape2 = cell.get_shape().as_list()
                if shape1 != shape2:
                    raise Exception(
                        " [!] Shape {} and {} should be equal".format(
                            shape1, shape2))
                decoder_init_state[idx + 1] = cell

            decoder_init_state = tuple(decoder_init_state)

        if is_training2:
            # rnn_decoder_test_mode is True in test mode, False in train
            # mode. Original note: inputs is only used to get batch_size.
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                        hp.reduction_factor,
                                        rnn_decoder_test_mode)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels,
                                    hp.reduction_factor)

        (decoder_outputs, _), final_decoder_state, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # max_iters=200

        # [N, T_out, M]
        mel_outputs = tf.reshape(decoder_outputs,
                                 [batch_size, -1, hp.num_mels])

        # Add post-processing CBHG:
        # [N, T_out, 256]
        post_outputs = cbhg(mel_outputs,
                            None,
                            is_training,
                            hp.post_bank_size,
                            hp.post_bank_channel_size,
                            hp.post_maxpool_width,
                            hp.post_highway_depth,
                            hp.post_rnn_size,
                            hp.post_proj_sizes,
                            hp.post_proj_width,
                            scope='post_cbhg')

        if speaker_embed is not None and hp.model_type == 'simple':
            # Repeat the speaker embedding for every output frame and
            # prepend it to the post-net features.
            expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
            tiled_speaker_embedding = tf.tile(
                expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

            # [N, T_out, 256 + alpha]
            post_outputs = tf.concat(
                [tiled_speaker_embedding, post_outputs], axis=-1)

        linear_outputs = tf.layers.dense(
            post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

        # Grab alignments from the final decoder state. Original note:
        # MultiRNNCell has three layers, so final_decoder_state is a
        # 3-tuple and element 0 is the attention layer's state.
        # Result: batch_size, text length (encoder), target length (decoder)
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.speaker_id = speaker_id
        self.input_lengths = input_lengths
        self.loss_coeff = loss_coeff
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.final_decoder_state = final_decoder_state

        log('=' * 40)
        log(' model_type: %s' % hp.model_type)
        log('=' * 40)

        log('Initialized Tacotron model. Dimensions: ')
        log(' embedding: %d' % char_embedded_inputs.shape[-1])
        if speaker_embed is not None:
            log(' speaker embedding: %d' % speaker_embed.shape[-1])
        else:
            log(' speaker embedding: None')
        log(' prenet out: %d' % prenet_outputs.shape[-1])
        log(' encoder out: %d' % encoder_outputs.shape[-1])
        log(' attention out: %d' % attention_cell.output_size)
        log(' concat attn & out: %d' % concat_cell.output_size)
        log(' decoder cell out: %d' % decoder_cell.output_size)
        log(' decoder out (%d frames): %d' %
            (hp.reduction_factor, decoder_outputs.shape[-1]))
        log(' decoder out (1 frame): %d' % mel_outputs.shape[-1])
        log(' postnet out: %d' % post_outputs.shape[-1])
        log(' linear out: %d' % linear_outputs.shape[-1])
def _build_forward(self):
    """Build the forward graph: embeddings, encoders, attention and pointer decoder.

    Reads hyper-parameters from ``self.config`` and input tensors from
    ``self.x``, ``self.q``, ``self.cx``, ``self.cq``, ``self.x_mask``,
    ``self.q_mask`` (plus ``self.emx``/``self.emq`` and ``self.new_emb_mat``
    when the corresponding config flags are set).

    Stages, in order:
      1. "emb": optional char-CNN embedding and optional word embedding,
         concatenated when both are enabled; optional exact-match feature.
      2. "highway": optional highway network shared by context and question.
      3. "prepro": bidirectional RNN over question (``u``) and context (``h``),
         either via ``LSTMBlockFusedCell`` or ``bidirectional_dynamic_rnn``.
      4. "main": attention, modeling layer, optional self-match layer.
      5. "output": answer placeholders and the ``ptr_decoder`` call.

    Side effects: fills ``self.tensor_dict`` ('x', 'q', 'xx', 'qq', 'u', 'h',
    'answer_string', 'answer_string_mask', 'answer_string_length') and sets
    ``self.word_emb_scope``, ``self.answer_string*``,
    ``self.decoder_train_logits``, ``self.decoder_train_softmax``,
    ``self.decoder_inference``, ``self.yp`` and ``self.yp2``.

    NOTE(review): the nesting of the self-match / PtrNet / "output" sections
    under the "main" variable scope is assumed; confirm against the variable
    names stored in existing checkpoints.
    """
    config = self.config
    # Static sizes from the config; M, JX and JQ are rebound to dynamic
    # tensor shapes a few lines below.
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
    JA = config.max_answer_length
    # From here on JX, JQ and M are scalar tensors taken from the fed inputs,
    # not Python ints.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        # Char-CNN Embedding
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat",
                                               shape=[VC, dc],
                                               dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat,
                                             self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat,
                                             self.cq)  # [N, JQ, W, dc]
                # Merge the leading dims so the conv sees one "sentence" per row.
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # Comma-separated config strings -> lists of ints.
                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                # The conv output channels must add up to the char output size.
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx,
                                      filter_sizes,
                                      heights,
                                      "VALID",
                                      self.is_train,
                                      config.keep_prob,
                                      scope="xx")
                    if config.share_cnn_weights:
                        # Question reuses the context CNN variables (same scope "xx").
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                    else:
                        qq = multi_conv1d(Acq,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="qq")

                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        # Word Embedding
        if config.use_word_emb:
            with tf.variable_scope("emb_var") as scope, tf.device(
                    "/cpu:0"):
                if config.mode == 'train':
                    # Only the training graph supplies an explicit initializer.
                    word_emb_mat = tf.get_variable(
                        "word_emb_mat",
                        dtype='float',
                        shape=[VW, dw],
                        initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat",
                                                   shape=[VW, dw],
                                                   dtype='float')
                # NOTE(review): assumed to run for both modes (after the
                # if/else), not only in the else branch - confirm.
                tf.get_variable_scope().reuse_variables()
                # Kept on self so the scope object can be re-entered elsewhere.
                self.word_emb_scope = scope
                if config.use_glove_for_unk:
                    # Append the extra embedding rows after the base vocabulary.
                    word_emb_mat = tf.concat(
                        [word_emb_mat, self.new_emb_mat], 0)

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat,
                                            self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat,
                                            self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            # Concat Char-CNN Embedding and Word Embedding
            if config.use_char_emb:
                xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # exact match
    if config.use_exact_match:
        # Append the exact-match flag as one extra float feature per token.
        emx = tf.expand_dims(tf.cast(self.emx, tf.float32), -1)
        xx = tf.concat([xx, emx], 3)  # [N, M, JX, di+1]
        emq = tf.expand_dims(tf.cast(self.emq, tf.float32), -1)
        qq = tf.concat([qq, emq], 2)  # [N, JQ, di+1]

    # 2 layer highway network on Concat Embedding
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)
            # The question goes through the same highway variables.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq,
                                 config.highway_num_layers,
                                 True,
                                 wd=config.wd,
                                 is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    # Bidirection-LSTM (3rd layer on paper)
    cell = GRUCell(d) if config.GRU else BasicLSTMCell(d,
                                                       state_is_tuple=True)
    d_cell = SwitchableDropoutWrapper(
        cell, self.is_train, input_keep_prob=config.input_keep_prob)
    # Sequence lengths are the number of True entries in the masks.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]
    flat_x_len = flatten(x_len, 0)  # [N * M]

    with tf.variable_scope("prepro"):
        if config.use_fused_lstm:
            # Fused cells are called on time-major inputs; the backward
            # direction is implemented by reversing each sequence before and
            # after the cell.
            with tf.variable_scope("u1"):
                fw_inputs = tf.transpose(
                    qq, [1, 0, 2])  # [time_len, batch_size, input_size]
                bw_inputs = tf.reverse_sequence(fw_inputs,
                                                q_len,
                                                batch_dim=1,
                                                seq_dim=0)
                fw_inputs = tf.nn.dropout(fw_inputs,
                                          config.input_keep_prob)
                bw_inputs = tf.nn.dropout(bw_inputs,
                                          config.input_keep_prob)
                prep_fw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                prep_bw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                fw_outputs, fw_final = prep_fw_cell(fw_inputs,
                                                    dtype=tf.float32,
                                                    sequence_length=q_len,
                                                    scope="fw")
                bw_outputs, bw_final = prep_bw_cell(bw_inputs,
                                                    dtype=tf.float32,
                                                    sequence_length=q_len,
                                                    scope="bw")
                bw_outputs = tf.reverse_sequence(bw_outputs,
                                                 q_len,
                                                 batch_dim=1,
                                                 seq_dim=0)
                current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                # Back to batch-major.
                output = tf.transpose(current_inputs, [1, 0, 2])
                u = output
            flat_xx = flatten(xx, 2)  # [N * M, JX, d]
            if config.share_lstm_weights:
                # Context re-enters scope "u1" and reuses the question's
                # prep_fw_cell / prep_bw_cell objects created above.
                tf.get_variable_scope().reuse_variables()
                with tf.variable_scope("u1"):
                    fw_inputs = tf.transpose(
                        flat_xx,
                        [1, 0, 2])  # [time_len, batch_size, input_size]
                    bw_inputs = tf.reverse_sequence(fw_inputs,
                                                    flat_x_len,
                                                    batch_dim=1,
                                                    seq_dim=0)
                    # fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                    # bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                    fw_outputs, fw_final = prep_fw_cell(
                        fw_inputs,
                        dtype=tf.float32,
                        sequence_length=flat_x_len,
                        scope="fw")
                    bw_outputs, bw_final = prep_bw_cell(
                        bw_inputs,
                        dtype=tf.float32,
                        sequence_length=flat_x_len,
                        scope="bw")
                    bw_outputs = tf.reverse_sequence(bw_outputs,
                                                     flat_x_len,
                                                     batch_dim=1,
                                                     seq_dim=0)
                    current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                    output = tf.transpose(current_inputs, [1, 0, 2])
            else:
                # Separate variables (scope "h1") and fresh cells for the context.
                with tf.variable_scope("h1"):
                    fw_inputs = tf.transpose(
                        flat_xx,
                        [1, 0, 2])  # [time_len, batch_size, input_size]
                    bw_inputs = tf.reverse_sequence(fw_inputs,
                                                    flat_x_len,
                                                    batch_dim=1,
                                                    seq_dim=0)
                    # fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                    # bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                    prep_fw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                    prep_bw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                    fw_outputs, fw_final = prep_fw_cell(
                        fw_inputs,
                        dtype=tf.float32,
                        sequence_length=flat_x_len,
                        scope="fw")
                    bw_outputs, bw_final = prep_bw_cell(
                        bw_inputs,
                        dtype=tf.float32,
                        sequence_length=flat_x_len,
                        scope="bw")
                    bw_outputs = tf.reverse_sequence(bw_outputs,
                                                     flat_x_len,
                                                     batch_dim=1,
                                                     seq_dim=0)
                    current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                    output = tf.transpose(current_inputs, [1, 0, 2])
            # Re-insert a size-1 sentence axis at position 1.
            h = tf.expand_dims(output, 1)  # [N, M, JX, 2d]
        else:
            (fw_u, bw_u), _ = bidirectional_dynamic_rnn(
                d_cell, d_cell, qq, q_len, dtype='float',
                scope='u1')  # [N, J, d], [N, d]
            u = tf.concat([fw_u, bw_u], 2)
            if config.share_lstm_weights:
                # Same scope name 'u1' with reuse on: context shares the
                # question's RNN variables.
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, xx, x_len, dtype='float',
                    scope='u1')  # [N, M, JX, 2d]
                h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, xx, x_len, dtype='float',
                    scope='h1')  # [N, M, JX, 2d]
                h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    # Attention Flow Layer (4th layer on paper)
    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            # Repeat the question encoding and mask once per sentence (M) and
            # fold the sentence axis into the batch axis.
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                           [N * M, JQ, 2 * d])
            q_mask = tf.reshape(
                tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                [N * M, JQ])
            first_cell = AttentionCell(
                cell,
                u,
                size=d,
                mask=q_mask,
                mapper='sim',
                input_keep_prob=self.config.input_keep_prob,
                is_train=self.is_train)
        else:
            p0 = attention_layer(config,
                                 self.is_train,
                                 h,
                                 u,
                                 h_mask=self.x_mask,
                                 u_mask=self.q_mask,
                                 scope="p0",
                                 tensor_dict=self.tensor_dict)
            first_cell = d_cell
        # Keep the attention output; p0 is overwritten by the modeling loop
        # below and tp0 is passed to the decoder later.
        tp0 = p0

        # Modeling layer (5th layer on paper)
        with tf.variable_scope('modeling_layer'):
            if config.use_fused_lstm:
                g1, encoder_state_final = build_fused_bidirectional_rnn(
                    inputs=p0,
                    num_units=config.hidden_size,
                    num_layers=config.num_modeling_layers,
                    inputs_length=flat_x_len,
                    input_keep_prob=config.input_keep_prob,
                    scope='modeling_layer_g')
            else:
                # All but the last modeling layer; each feeds the next.
                for layer_idx in range(config.num_modeling_layers - 1):
                    (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                        first_cell,
                        first_cell,
                        p0,
                        x_len,
                        dtype='float',
                        scope="g_{}".format(layer_idx))  # [N, M, JX, 2d]
                    p0 = tf.concat([fw_g0, bw_g0], 3)
                # Last layer also yields the final states used further down.
                (fw_g1, bw_g1), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(
                    first_cell,
                    first_cell,
                    p0,
                    x_len,
                    dtype='float',
                    scope='g1')  # [N, M, JX, 2d]
                g1 = tf.concat([fw_g1, bw_g1], 3)  # [N, M, JX, 2d]

        # Self match layer
        if config.use_self_match:
            s0 = tf.reshape(g1, [N * M, JX, 2 * d])  # [N * M, JX, 2d]
            x_mask = tf.reshape(self.x_mask,
                                [N * M, JX])  # [N * M, JX]
            if config.use_static_self_match:
                with tf.variable_scope(
                        "StaticSelfMatch"
                ):  # implemented follow r-net section 3.3
                    # Two bias-free linear projections of s0; their pairwise
                    # sum is built by broadcasting over two different axes.
                    W_x_Vj = tf.contrib.layers.fully_connected(  # [N * M, JX, d]
                        s0,
                        int(d / 2),
                        scope='row_first',
                        activation_fn=None,
                        biases_initializer=None)
                    W_x_Vt = tf.contrib.layers.fully_connected(  # [N * M, JX, d]
                        s0,
                        int(d / 2),
                        scope='col_first',
                        activation_fn=None,
                        biases_initializer=None)
                    sum_rc = tf.add(  # [N * M, JX, JX, d]
                        tf.expand_dims(W_x_Vj, 1),
                        tf.expand_dims(W_x_Vt, 2))
                    v = tf.get_variable('second',
                                        shape=[1, 1, 1, int(d / 2)],
                                        dtype=tf.float32)
                    # Scalar score per position pair.
                    Sj = tf.reduce_sum(tf.multiply(v, tf.tanh(sum_rc)),
                                       -1)  # [N * M, JX, JX]
                    Ai = softmax(Sj, mask=tf.expand_dims(
                        x_mask, 1))  # [N * M, JX, JX]
                    Ai = tf.expand_dims(Ai, -1)  # [N * M, JX, JX, 1]
                    Vi = tf.expand_dims(s0, 1)  # [N * M, 1, JX, 2d]
                    # Attention-weighted sum of s0 over axis 2.
                    Ct = tf.reduce_sum(  # [N * M, JX, 2d]
                        tf.multiply(Ai, Vi), axis=2)
                    inputs_Vt_Ct = tf.concat([s0, Ct],
                                             2)  # [N * M, JX, 4d]
                    if config.use_fused_lstm:
                        fw_inputs = tf.transpose(
                            inputs_Vt_Ct,
                            [1, 0,
                             2])  # [time_len, batch_size, input_size]
                        bw_inputs = tf.reverse_sequence(fw_inputs,
                                                        flat_x_len,
                                                        batch_dim=1,
                                                        seq_dim=0)
                        fw_inputs = tf.nn.dropout(fw_inputs,
                                                  config.input_keep_prob)
                        bw_inputs = tf.nn.dropout(bw_inputs,
                                                  config.input_keep_prob)
                        prep_fw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                        prep_bw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                        fw_outputs, fw_s_f = prep_fw_cell(
                            fw_inputs,
                            dtype=tf.float32,
                            sequence_length=flat_x_len,
                            scope="fw")
                        bw_outputs, bw_s_f = prep_bw_cell(
                            bw_inputs,
                            dtype=tf.float32,
                            sequence_length=flat_x_len,
                            scope="bw")
                        # Repack the fused cell's final state pair as an
                        # LSTMStateTuple so the isinstance check below matches.
                        fw_s_f = LSTMStateTuple(c=fw_s_f[0], h=fw_s_f[1])
                        bw_s_f = LSTMStateTuple(c=bw_s_f[0], h=bw_s_f[1])
                        bw_outputs = tf.reverse_sequence(bw_outputs,
                                                         flat_x_len,
                                                         batch_dim=1,
                                                         seq_dim=0)
                        current_inputs = tf.concat(
                            (fw_outputs, bw_outputs), 2)
                        s1 = tf.transpose(current_inputs, [1, 0, 2])
                    else:
                        (fw_s,
                         bw_s), (fw_s_f,
                                 bw_s_f) = bidirectional_dynamic_rnn(
                                     first_cell,
                                     first_cell,
                                     inputs_Vt_Ct,
                                     flat_x_len,
                                     dtype='float',
                                     scope='s')  # [N, M, JX, 2d]
                        s1 = tf.concat([fw_s, bw_s],
                                       2)  # [N * M, JX, 2d], M == 1
            else:
                with tf.variable_scope("DynamicSelfMatch"):
                    first_cell = AttentionCell(cell,
                                               s0,
                                               size=d,
                                               mask=x_mask,
                                               is_train=self.is_train)
                    # NOTE(review): this branch passes x_len ([N, M]) while
                    # the static branch passes flat_x_len for the same
                    # [N * M, ...] input - confirm this is intended.
                    (fw_s, bw_s), (fw_s_f,
                                   bw_s_f) = bidirectional_dynamic_rnn(
                                       first_cell,
                                       first_cell,
                                       s0,
                                       x_len,
                                       dtype='float',
                                       scope='s')  # [N, M, JX, 2d]
                    s1 = tf.concat([fw_s, bw_s],
                                   2)  # [N * M, JX, 2d], M == 1
            g1 = tf.expand_dims(s1, 1)  # [N, M, JX, 2d]

        # prepare for PtrNet
        encoder_output = g1  # [N, M, JX, 2d]
        # Zero out positions where x_mask is False.
        encoder_output = tf.expand_dims(tf.cast(self.x_mask, tf.float32),
                                        -1) * encoder_output  # [N, M, JX, 2d]

        # fw_s_f / bw_s_f exist only when the non-fused modeling layer or the
        # self-match layer ran; otherwise encoder_state_final was already
        # returned by build_fused_bidirectional_rnn above.
        if config.use_self_match or not config.use_fused_lstm:
            if config.GRU:
                encoder_state_final = tf.concat((fw_s_f, bw_s_f),
                                                1,
                                                name='encoder_concat')
            else:
                if isinstance(fw_s_f, LSTMStateTuple):
                    # Concatenate forward/backward c and h separately.
                    encoder_state_c = tf.concat((fw_s_f.c, bw_s_f.c),
                                                1,
                                                name='encoder_concat_c')
                    encoder_state_h = tf.concat((fw_s_f.h, bw_s_f.h),
                                                1,
                                                name='encoder_concat_h')
                    encoder_state_final = LSTMStateTuple(c=encoder_state_c,
                                                         h=encoder_state_h)
                elif isinstance(fw_s_f, tf.Tensor):
                    encoder_state_final = tf.concat((fw_s_f, bw_s_f),
                                                    1,
                                                    name='encoder_concat')
                else:
                    # Unknown state type: logged, and None is passed on to
                    # the decoder below.
                    encoder_state_final = None
                    tf.logging.error("encoder_state_final not set")

        print("encoder_state_final:", encoder_state_final)

        with tf.variable_scope("output"):
            # eos_symbol = config.eos_symbol
            # next_symbol = config.next_symbol

            # NOTE(review): the op returned by tf.assert_equal is not stored
            # or attached to anything here - confirm it is actually executed.
            tf.assert_equal(
                M, 1
            )  # currently dynamic M is not supported, thus we assume M==1
            # Answer placeholders are created here rather than taken from
            # self; the sentence axis is fixed to 1.
            answer_string = tf.placeholder(
                shape=(N, 1, JA + 1), dtype=tf.int32,
                name='answer_string')  # [N, M, JA + 1]
            answer_string_mask = tf.placeholder(
                shape=(N, 1, JA + 1),
                dtype=tf.bool,
                name='answer_string_mask')  # [N, M, JA + 1]
            answer_string_length = tf.placeholder(
                shape=(N, 1),
                dtype=tf.int32,
                name='answer_string_length',
            )  # [N, M]
            self.tensor_dict['answer_string'] = answer_string
            self.tensor_dict['answer_string_mask'] = answer_string_mask
            self.tensor_dict['answer_string_length'] = answer_string_length
            self.answer_string = answer_string
            self.answer_string_mask = answer_string_mask
            self.answer_string_length = answer_string_length

            answer_string_flattened = tf.reshape(answer_string,
                                                 [N * M, JA + 1])
            self.answer_string_flattened = answer_string_flattened  # [N * M, JA+1]
            print("answer_string_flattened:", answer_string_flattened)

            answer_string_length_flattened = tf.reshape(
                answer_string_length, [N * M])
            self.answer_string_length_flattened = answer_string_length_flattened  # [N * M]
            print("answer_string_length_flattened:",
                  answer_string_length_flattened)

            # Decoder cell is twice as wide as the encoder cells (2 * d).
            decoder_cell = GRUCell(2 * d) if config.GRU else BasicLSTMCell(
                2 * d, state_is_tuple=True)

            with tf.variable_scope("Decoder"):
                decoder_train_logits = ptr_decoder(
                    decoder_cell,
                    tf.reshape(tp0,
                               [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                    tf.reshape(encoder_output,
                               [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                    flat_x_len,
                    encoder_final_state=encoder_state_final,
                    max_encoder_length=config.sent_size_th,
                    decoder_output_length=
                    answer_string_length_flattened,  # [N * M]
                    batch_size=N,  # N * M (M=1)
                    attention_proj_dim=self.config.decoder_proj_dim,
                    scope='ptr_decoder'
                )  # [batch_size, dec_len*, enc_seq_len + 1]

                self.decoder_train_logits = decoder_train_logits
                print("decoder_train_logits:", decoder_train_logits)
                self.decoder_train_softmax = tf.nn.softmax(
                    self.decoder_train_logits)
                # Greedy prediction: argmax over the last (encoder position) axis.
                self.decoder_inference = tf.argmax(
                    decoder_train_logits, axis=2,
                    name='decoder_inference')  # [N, JA + 1]

        # Constant -1 tensors; no span prediction is computed in this method.
        self.yp = tf.ones([N, M, JX], dtype=tf.int32) * -1
        self.yp2 = tf.ones([N, M, JX], dtype=tf.int32) * -1
def cbhg(inputs,
         input_lengths,
         activation=tf.nn.relu,
         speaker_embd=None,
         is_training=True,
         K=16,
         c=(128, 128),
         gru_units=128,
         num_highways=4,
         scope="cbhg"):
    """Build a CBHG block: conv bank, max-pool, projections, highways, bi-GRU.

    Args:
        inputs: input sequence tensor; it is added to the projection output
            as a residual, so its last dimension must match ``c[-1]``
            (presumably [N, T, E] - TODO confirm against callers).
        input_lengths: per-example sequence lengths forwarded to
            ``tf.nn.bidirectional_dynamic_rnn`` (may be None).
        activation: activation used by the conv bank and by every projection
            layer except the last one, which is linear.
        speaker_embd: optional speaker embedding. When given, a projected copy
            is concatenated to the highway input at every highway layer and
            another projection is used as the initial state of both GRUs.
        is_training: forwarded to the convolution helpers.
        K: number of filter sets in the convolution bank.
        c: output sizes of the projection convolutions, in order.
        gru_units: units per GRU direction.
        num_highways: number of highway layers.
        scope: variable scope name for the whole block.

    Returns:
        Forward and backward GRU outputs concatenated on the last axis.
    """
    with tf.variable_scope(scope):
        conv_bank = conv1d_banks(inputs,
                                 K=K,
                                 activation=activation,
                                 is_training=is_training)  # (N, T_x, K*E/2)

        # Maxpooling (stride 1, 'same' padding keeps the time length):
        conv_proj = tf.layers.max_pooling1d(conv_bank,
                                            pool_size=2,
                                            strides=1,
                                            padding='same')

        # Projection layers. Each layer consumes the previous result
        # (starting with the max-pooled bank). Feeding `conv_bank` here, as
        # the code used to, silently discarded the max-pool output and
        # bypassed any earlier projection layer.
        for i, layer_size in enumerate(c[:-1]):
            conv_proj = conv1d(conv_proj, 3, layer_size, activation,
                               is_training, 'proj_{}'.format(i + 1))
        # Last projection is linear so the residual sum is unconstrained.
        conv_proj = conv1d(conv_proj, 3, c[-1], None, is_training,
                           'proj_{}'.format(len(c)))

        # Residual connection:
        highway_input = conv_proj + inputs

        # Handle dimensionality mismatch:
        if highway_input.shape[2] != 128:
            highway_input = tf.layers.dense(highway_input, 128)

        # 4-layer HighwayNet:
        h = highway_input
        for i in range(num_highways):
            with tf.variable_scope('highway_' + str(i)):
                # site specific speaker embedding
                if speaker_embd is not None:
                    s = tf.layers.dense(speaker_embd,
                                        h.shape[-1],
                                        activation=tf.nn.softsign)
                    # Broadcast the speaker vector over the time axis.
                    s = tf.tile(tf.expand_dims(s, 1), [1, tf.shape(h)[1], 1])
                    h = tf.concat([h, s], -1)
                h = highwaynet(h)

        # site specific speaker embedding
        if speaker_embd is not None:
            # TODO: what about two different s1, s2 for forwards and backwards
            s = tf.layers.dense(speaker_embd,
                                gru_units,
                                activation=tf.nn.softsign)
        else:
            s = None

        # Bidirectional RNN; the final states are not needed.
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            GRUCell(gru_units),
            GRUCell(gru_units),
            h,
            initial_state_fw=s,
            initial_state_bw=s,
            sequence_length=input_lengths,
            dtype=tf.float32)

        encoded = tf.concat(outputs, axis=2)  # Concat forward and backward
        return encoded
def __init__(self,
             num_items,
             num_embed_units,
             num_units,
             num_layers,
             embed=None,
             learning_rate=1e-4,
             action_num=10,
             learning_rate_decay_factor=0.95,
             max_gradient_norm=5.0,
             use_lstm=True):
    """Build the training graph, the inference graph and the optimizer.

    Args:
        num_items: size of the item vocabulary (output dimension).
        num_embed_units: embedding width, used only when ``embed`` is None.
        num_units: hidden units per RNN layer.
        num_layers: number of stacked RNN layers.
        embed: optional initial value for the embedding matrix; when None the
            matrix is initialised from a truncated normal.
        learning_rate: initial learning rate (stored in a non-trainable
            variable so it can be decayed).
        action_num: number of items returned by the top-k recommendation ops.
        learning_rate_decay_factor: multiplier applied by
            ``self.learning_rate_decay_op``.
        max_gradient_norm: global-norm clipping threshold.
        use_lstm: stack ``LSTMCell`` layers when True, ``GRUCell`` otherwise.

    NOTE(review): the inference branch always builds ``LSTMStateTuple``
    initial states, so it looks usable only with ``use_lstm=True`` - confirm.
    """
    self.epoch = tf.Variable(0, trainable=False, name='agn/epoch')
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)

    # Inputs. `shape=(None)` is plain None (not a 1-tuple), i.e. the shape of
    # sessions_length and reward is left unspecified.
    self.sessions_input = tf.placeholder(tf.int32, shape=(None, None))
    self.rec_lists = tf.placeholder(tf.int32, shape=(None, None, None))
    self.rec_mask = tf.placeholder(tf.float32, shape=(None, None, None))
    self.aims_idx = tf.placeholder(tf.int32, shape=(None, None))
    self.sessions_length = tf.placeholder(tf.int32, shape=(None))
    self.reward = tf.placeholder(tf.float32, shape=(None))

    if embed is None:
        self.embed = tf.get_variable(
            'agn/embed', [num_items, num_embed_units],
            tf.float32,
            initializer=tf.truncated_normal_initializer(0, 1))
    else:
        self.embed = tf.get_variable('agn/embed',
                                     dtype=tf.float32,
                                     initializer=embed)

    batch_size, encoder_length, rec_length = tf.shape(
        self.sessions_input)[0], tf.shape(
            self.sessions_input)[1], tf.shape(self.rec_lists)[2]

    # Reverse cumsum of a one-hot at index (length - 2) yields ones at
    # positions 0 .. length - 2 and zeros afterwards.
    encoder_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.sessions_length - 2, encoder_length),
                  reverse=True,
                  axis=1), [-1, encoder_length])
    # [batch_size, length]
    # Next-item targets: the input shifted left by one, padded with PAD_ID.
    self.sessions_target = tf.concat([
        self.sessions_input[:, 1:],
        tf.ones([batch_size, 1], dtype=tf.int32) * PAD_ID
    ], 1)
    # [batch_size, length, embed_units]
    self.encoder_input = tf.nn.embedding_lookup(self.embed,
                                                self.sessions_input)
    # [batch_size, length, rec_length]
    self.aims = tf.one_hot(self.aims_idx, rec_length)

    if use_lstm:
        cell = MultiRNNCell(
            [LSTMCell(num_units) for _ in range(num_layers)])
    else:
        cell = MultiRNNCell(
            [GRUCell(num_units) for _ in range(num_layers)])

    # Training
    with tf.variable_scope("agn"):
        output_fn, sampled_sequence_loss = output_projection_layer(
            num_units, num_items)
        self.encoder_output, self.encoder_state = dynamic_rnn(
            cell,
            self.encoder_input,
            self.sessions_length,
            dtype=tf.float32,
            scope="encoder")

        # Batch and time indices, tiled so that together with rec_lists they
        # form (batch, step, item) triples for gather_nd.
        tmp_dim_1 = tf.tile(
            tf.reshape(tf.range(batch_size), [batch_size, 1, 1, 1]),
            [1, encoder_length, rec_length, 1])
        tmp_dim_2 = tf.tile(
            tf.reshape(tf.range(encoder_length),
                       [1, encoder_length, 1, 1]),
            [batch_size, 1, rec_length, 1])
        # [batch_size, length, rec_length, 3]
        gather_idx = tf.concat(
            [tmp_dim_1, tmp_dim_2,
             tf.expand_dims(self.rec_lists, 3)], 3)

        # [batch_size, length, num_items], [batch_size*length]
        y_prob, local_loss, total_size = sampled_sequence_loss(
            self.encoder_output, self.sessions_target, encoder_mask)

        # Compute recommendation rank given rec_list
        # [batch_size, length, num_items]
        # The multiplier zeroes the probabilities of item ids 0 and 1.
        y_prob = tf.reshape(y_prob, [batch_size, encoder_length, num_items]) * \
            tf.concat([tf.zeros([batch_size, encoder_length, 2], dtype=tf.float32),
                       tf.ones([batch_size, encoder_length, num_items-2], dtype=tf.float32)], 2)
        # [batch_size, length, rec_len]
        ini_prob = tf.reshape(tf.gather_nd(y_prob, gather_idx),
                              [batch_size, encoder_length, rec_length])
        # [batch_size, length, rec_len]
        mul_prob = ini_prob * self.rec_mask

        # [batch_size, length, action_num]
        _, self.index = tf.nn.top_k(mul_prob, k=action_num)
        # [batch_size, length, metric_num]
        _, self.metric_index = tf.nn.top_k(mul_prob, k=(FLAGS.metric + 1))

        # Reward-weighted loss, normalised by total_size.
        self.loss = tf.reduce_sum(
            tf.reshape(self.reward, [-1]) * local_loss) / total_size

    # Inference
    with tf.variable_scope("agn", reuse=True):
        # Externally fed RNN state, indexed [layer, (c, h), batch, units].
        # The leading dimension follows num_layers instead of a fixed 2, so
        # the initial state always matches the MultiRNNCell built above.
        self.lstm_state = tf.placeholder(tf.float32,
                                         shape=(num_layers, 2, None,
                                                num_units))
        self.ini_state = tuple(
            tf.contrib.rnn.LSTMStateTuple(self.lstm_state[layer, 0, :, :],
                                          self.lstm_state[layer, 1, :, :])
            for layer in range(num_layers))
        # [batch_size, length, num_units]
        self.encoder_output_predict, self.encoder_state_predict = dynamic_rnn(
            cell,
            self.encoder_input,
            self.sessions_length,
            initial_state=self.ini_state,
            dtype=tf.float32,
            scope="encoder")

        # [batch_size, num_units]
        self.final_output_predict = tf.reshape(
            self.encoder_output_predict[:, -1, :], [-1, num_units])
        # [batch_size, num_items]
        self.rec_logits = output_fn(self.final_output_predict)
        # [batch_size, action_num]
        # top_k runs on the logits with the first len(_START_VOCAB) columns
        # sliced off; the offset is added back so the indices refer to the
        # full item vocabulary.
        _, self.rec_index = tf.nn.top_k(
            self.rec_logits[:, len(_START_VOCAB):], action_num)
        self.rec_index += len(_START_VOCAB)

        def gumbel_max(inp, alpha, beta):
            """Softmax of Gumbel-perturbed log-probabilities of `inp`."""
            # assert len(tf.shape(inp)) == 2
            # Uniform samples kept away from 0 and 1 so both logs are finite.
            g = tf.random_uniform(tf.shape(inp), 0.0001, 0.9999)
            g = -tf.log(-tf.log(g))
            inp_g = tf.nn.softmax(
                (tf.nn.log_softmax(inp / 1.0) + g * alpha) * beta)
            return inp_g

        # [batch_size, action_num]
        # Randomised variant of rec_index using the perturbed scores.
        _, self.random_rec_index = tf.nn.top_k(
            gumbel_max(self.rec_logits[:, len(_START_VOCAB):], 1, 1),
            action_num)
        self.random_rec_index += len(_START_VOCAB)

    # initialize the training process
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)
    self.params = tf.trainable_variables()

    # Clip by global norm, then apply with Adam.
    gradients = tf.gradients(self.loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = tf.train.AdamOptimizer(
        self.learning_rate).apply_gradients(zip(clipped_gradients,
                                                self.params),
                                            global_step=self.global_step)

    self.saver = tf.train.Saver(tf.global_variables(),
                                write_version=tf.train.SaverDef.V2,
                                max_to_keep=100,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
def initialize(self, txt_targets_A, txt_lenth_A, txt_targets_B, txt_lenth_B,
               mel_targets, image_targets):
    """Build the multi-modal graph: encoders, transformer, classifier, memory, decoders.

    Args:
        txt_targets_A: symbol-id tensor for text stream A (looked up in the
            text embedding table).
        txt_lenth_A: sequence lengths for text stream A, passed to
            ``encoder_cbhg``.
        txt_targets_B: symbol-id tensor for text stream B.
        txt_lenth_B: sequence lengths for text stream B, passed to
            ``encoder_cbhg``.
        mel_targets: mel-spectrogram targets; also the source of the dynamic
            batch size, and ``is_training`` is True when it is not None.
        image_targets: image targets; only stored on ``self`` here.

    Sets ``self.e_*`` (encoder outputs), ``self.z_*`` (modality transformer
    outputs), ``self.c_logit_*`` (modality classifier logits),
    ``self.tokens``, ``self.u_*`` (memory fusion outputs), ``self.fake_img``,
    ``self.fake_mel`` and copies of some inputs.
    """
    is_training = mel_targets is not None
    batch_size = tf.shape(mel_targets)[0]
    hp = self._hparams

    # Embeddings for text
    embedding_table = tf.get_variable(
        'text_embedding', [len(symbols), hp.embed_depth],
        dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.5))
    embedded_txt_inputs_A = tf.nn.embedding_lookup(
        embedding_table, txt_targets_A)  # [N, T_in, 128]
    # Both text streams share one table (the name was misspelled
    # `embedding_tabel` here, which raised NameError).
    embedded_txt_inputs_B = tf.nn.embedding_lookup(embedding_table,
                                                   txt_targets_B)

    #------------------------ Encoder Scope----------------------------------------------
    # 'e space': outputs from Modality Encoders

    # Text Encoder (AUTO_REUSE: stream B reuses the variables built for A)
    with tf.variable_scope('E_text', reuse=tf.AUTO_REUSE):
        prenet_outputs_A = prenet(embedded_txt_inputs_A,
                                  is_training)  # [N, T_in, 128]
        prenet_outputs_B = prenet(embedded_txt_inputs_B,
                                  is_training)  # [N, T_in, 128]
        # The length arguments are this method's txt_lenth_* parameters; the
        # previous names input_lengths_A / input_lengths_B were undefined.
        cbhg_outputs_A = encoder_cbhg(prenet_outputs_A, txt_lenth_A,
                                      is_training)
        cbhg_outputs_B = encoder_cbhg(prenet_outputs_B, txt_lenth_B,
                                      is_training)
        txt_encoder_outputs_A = text_encoder(cbhg_outputs_A, is_training)
        txt_encoder_outputs_B = text_encoder(cbhg_outputs_B, is_training)
        self.e_txt_A = txt_encoder_outputs_A
        self.e_txt_B = txt_encoder_outputs_B

    # Speech Encoder
    with tf.variable_scope('E_speech', reuse=tf.AUTO_REUSE):
        speech_outputs = reference_encoder(
            mel_targets,
            filters=hp.reference_filters,
            kernel_size=(3, 3),
            strides=(2, 2),
            encoder_cell=GRUCell(hp.reference_depth),
            is_training=is_training)  # [N, 256]
        self.e_speech = speech_outputs

    # Image Encoder
    # NOTE(review): image_targets is not passed to image_encoder - confirm
    # how the encoder obtains its input.
    with tf.variable_scope('E_image', reuse=tf.AUTO_REUSE):
        img_outputs = image_encoder(is_training=is_training,
                                    norm='batch',
                                    image_size=128)
        self.e_img = img_outputs

    #-------------------------Universal Computing Body------------------------------------
    # Modality Transformer T (one set of variables applied to every modality)
    with tf.variable_scope('T', reuse=tf.AUTO_REUSE):
        # 'z space': output from Modality Transformer
        self.z_img = modality_transformer(self.e_img,
                                          is_training=is_training)
        self.z_txt_A = modality_transformer(self.e_txt_A,
                                            is_training=is_training)
        self.z_txt_B = modality_transformer(self.e_txt_B,
                                            is_training=is_training)
        self.z_speech = modality_transformer(self.e_speech,
                                             is_training=is_training)

    # Modality Classifier C
    with tf.variable_scope('C', reuse=tf.AUTO_REUSE):
        self.c_logit_img = modality_classifier(self.z_img,
                                               is_training=is_training)
        c_logit_txt_A = modality_classifier(self.z_txt_A,
                                            is_training=is_training)
        c_logit_txt_B = modality_classifier(self.z_txt_B,
                                            is_training=is_training)
        # The two text streams contribute a single summed logit.
        self.c_logit_txt = c_logit_txt_A + c_logit_txt_B
        self.c_logit_speech = modality_classifier(self.z_speech,
                                                  is_training=is_training)

    # Memory Fusion Module M
    # (was `tf.AUDO_REUSE`, which does not exist and raised AttributeError)
    with tf.variable_scope('M', reuse=tf.AUTO_REUSE):
        # Global tokens
        tokens = tf.get_variable(
            'global_tokens',
            [hp.num_gst, hp.style_embed_depth // hp.num_heads],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        self.tokens = tokens

        # The same tanh-squashed token bank, tiled over the batch, is the
        # memory for every modality, so it is built once.
        tiled_tokens = tf.tanh(
            tf.tile(tf.expand_dims(tokens, axis=0),
                    [batch_size, 1, 1]))  # [N, hp.num_gst, 256/hp.num_heads]

        # Multi-head Attention
        attention_img = MultiheadAttention(
            tf.expand_dims(self.z_img, axis=1),  # [N, 1, 256]
            tiled_tokens,
            num_heads=hp.num_heads,
            num_units=hp.style_att_dim,
            attention_type=hp.style_att_type)

        attention_speech = MultiheadAttention(
            tf.expand_dims(self.z_speech, axis=1),  # [N, 1, 256]
            tiled_tokens,
            num_heads=hp.num_heads,
            num_units=hp.style_att_dim,
            attention_type=hp.style_att_type)

        attention_txt_A = MultiheadAttention(
            tf.expand_dims(self.z_txt_A, axis=1),
            tiled_tokens,
            num_heads=hp.num_heads,
            num_units=hp.style_att_dim,
            attention_type=hp.style_att_type)

        attention_txt_B = MultiheadAttention(
            tf.expand_dims(self.z_txt_B, axis=1),
            tiled_tokens,
            num_heads=hp.num_heads,
            num_units=hp.style_att_dim,
            attention_type=hp.style_att_type)

        output_img = attention_img.multi_head_attention()  # [N, 1, 256]
        output_txt_A = attention_txt_A.multi_head_attention()
        output_txt_B = attention_txt_B.multi_head_attention()
        output_speech = attention_speech.multi_head_attention()

        # 'u space': output from Memory Fusion Module
        self.u_img = output_img
        self.u_speech = output_speech
        self.u_txt_A = output_txt_A
        self.u_txt_B = output_txt_B

    #---------------Decoder Scope---------------------------------------------------------
    # Image Decoder scope
    with tf.variable_scope('D_img'):
        # Uses the local is_training flag like every other call in this
        # method; `self.is_training` is never assigned here.
        fake_img = image_decoder(self.u_img, is_train=is_training)
        self.fake_img = fake_img

    # Speech Decoder scope
    with tf.variable_scope('D_speech'):
        # Attention
        # NOTE(review): `input_lengths` is not defined in this method and no
        # parameter clearly corresponds to the length of `self.u_speech` -
        # supply the right lengths or drop the argument.
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_depth),
            BahdanauAttention(hp.attention_depth,
                              self.u_speech,
                              memory_sequence_length=input_lengths),
            alignment_history=True,
            output_attention=False)  # [N, T_in, 256]

        # Concatenate attention context vector and RNN cell output.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            OutputProjectionWrapper(concat_cell, hp.rnn_depth),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)),
            ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1))
        ],
                                    state_is_tuple=True)  # [N, T_in, 256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        # NOTE(review): `inputs` is not defined in this method, and `helper`
        # is only assigned on the training path - the inference path needs
        # its own helper before dynamic_decode can run.
        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp)

        (decoder_outputs,
         _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
             BasicDecoder(output_cell, helper, decoder_init_state),
             maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry
        fake_mel = tf.reshape(decoder_outputs,
                              [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        self.fake_mel = fake_mel

    self.txt_targets_A = txt_targets_A
    # The parameter is spelled `txt_lenth_B`; `txt_lengths_B` was undefined.
    self.txt_lengths_B = txt_lenth_B
    self.mel_targets = mel_targets
    self.image_targets = image_targets
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None,
               pml_targets=None, gta=False, locked_alignments=None):
    '''Build the Tacotron graph inside the 'inference' variable scope.

    Stores "inputs", "input_lengths", "mel_outputs", "linear_outputs",
    "alignments", "mel_targets", "linear_targets", "attention_mechanism" and
    "attention_cell" on the instance, then logs the layer dimensions.

    Args:
      inputs: tensor of symbol IDs; it is looked up in the embedding table and
        its first dimension is used as the batch size.
      input_lengths: sequence lengths handed to the encoder CBHG.
      mel_targets: targets handed to the training helper. Only used in
        training mode.
      linear_targets: training mode is enabled exactly when this is not None.
      pml_targets: accepted for interface compatibility; not read here.
      gta: accepted for interface compatibility; not read here.
      locked_alignments: optional explicit attention alignments forwarded to
        the attention wrapper. A value with fewer than three dimensions gets a
        leading axis prepended.
    '''
    # Give explicit alignments a leading (batch) axis when they lack one, so
    # they end up as (batch_size, encoder_steps, decoder_steps).
    alignment_override = locked_alignments
    if alignment_override is not None and np.ndim(alignment_override) < 3:
        alignment_override = np.expand_dims(alignment_override, 0)

    with tf.variable_scope('inference'):
        training = linear_targets is not None
        num_examples = tf.shape(inputs)[0]
        hparams = self._hparams

        # Embeddings
        table = tf.get_variable(
            'embedding',
            [len(symbols), hparams.embed_depth],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded = tf.nn.embedding_lookup(table, inputs)

        # Encoder: prenet followed by the CBHG module.
        prenet_out = prenet(embedded, training, hparams.prenet_depths)
        encoder_out = encoder_cbhg(prenet_out, input_lengths, training,
                                   hparams.encoder_depth)

        # Attention over the encoder outputs; alignments may be locked.
        attention_mechanism = BahdanauAttention(hparams.attention_depth, encoder_out)
        attention_cell = LockableAttentionWrapper(
            GRUCell(hparams.attention_depth),
            attention_mechanism,
            alignment_history=True,
            locked_alignments=alignment_override,
            output_attention=False,
            name='attention_wrapper')

        # Prenet in front of the attention cell, then concatenate the
        # attention context with the cell output.
        prenet_cell = DecoderPrenetWrapper(attention_cell, training, hparams.prenet_depths)
        concat_cell = ConcatOutputAndAttentionWrapper(prenet_cell)

        # Decoder stack, listed bottom to top.
        decoder_layers = [OutputProjectionWrapper(concat_cell, hparams.decoder_depth)]
        decoder_layers.append(ResidualWrapper(GRUCell(hparams.decoder_depth)))
        decoder_layers.append(ResidualWrapper(GRUCell(hparams.decoder_depth)))
        decoder_cell = MultiRNNCell(decoder_layers, state_is_tuple=True)

        # Each decoder step emits outputs_per_step mel frames at once.
        output_cell = OutputProjectionWrapper(
            decoder_cell, hparams.num_mels * hparams.outputs_per_step)
        initial_state = output_cell.zero_state(batch_size=num_examples, dtype=tf.float32)

        helper = (
            TacoTrainingHelper(inputs, mel_targets, hparams.num_mels, hparams.outputs_per_step)
            if training else
            TacoTestHelper(num_examples, hparams.num_mels, hparams.outputs_per_step))

        decoder = BasicDecoder(output_cell, helper, initial_state)
        (decoder_outputs, _), final_state, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder, maximum_iterations=hparams.max_iters)

        # Unpack the grouped frames so that there is one mel frame per entry.
        mel_outputs = tf.reshape(decoder_outputs, [num_examples, -1, hparams.num_mels])

        # Post-processing CBHG followed by a dense projection to num_freq.
        post_out = post_cbhg(mel_outputs, hparams.num_mels, training, hparams.postnet_depth)
        linear_outputs = tf.layers.dense(post_out, hparams.num_freq)

        # The stacked alignment history has the decoder-step axis first;
        # the transpose moves it to the last position.
        history = final_state[0].alignment_history.stack()
        alignments = tf.transpose(history, [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.attention_mechanism = attention_mechanism
        self.attention_cell = attention_cell

        log('Initialized Tacotron model. Dimensions: ')
        dimension_report = [
            (' embedding: %d', embedded.shape[-1]),
            (' prenet out: %d', prenet_out.shape[-1]),
            (' encoder out: %d', encoder_out.shape[-1]),
            (' attention out: %d', attention_cell.output_size),
            (' concat attn & out: %d', concat_cell.output_size),
            (' decoder cell out: %d', decoder_cell.output_size),
            (' decoder out (%d frames): %d',
             (hparams.outputs_per_step, decoder_outputs.shape[-1])),
            (' decoder out (1 frame): %d', mel_outputs.shape[-1]),
            (' postnet out: %d', post_out.shape[-1]),
            (' linear out: %d', linear_outputs.shape[-1]),
        ]
        for template, value in dimension_report:
            log(template % value)
if __name__ == '__main__': try: from tensorflow.contrib.rnn import LSTMCell, LSTMStateTuple, GRUCell except ImportError: LSTMCell = tf.nn.rnn_cell.LSTMCell LSTMStateTuple = tf.nn.rnn_cell.LSTMStateTuple GRUCell = tf.nn.rnn_cell.GRUCell tf.reset_default_graph() with tf.Session() as session: model = HANClassifierModel( vocab_size=10, embedding_size=5, classes=2, fw_word_cell=GRUCell(10), bw_word_cell=GRUCell(10), fw_sentence_cell=GRUCell(10), bw_sentence_cell=GRUCell(10), word_output_size=10, sentence_output_size=10, max_grad_norm=5.0, dropout_keep_proba=0.5, ) session.run(tf.global_variables_initializer()) fd = { model.is_training: False, model.inputs: [[ [5, 4, 1, 0], [3, 3, 6, 7],
X_test = zero_pad(X_test, SEQUENCE_LENGTH) # Different placeholders batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH]) target_ph = tf.placeholder(tf.float32, [None]) seq_len_ph = tf.placeholder(tf.int32, [None]) keep_prob_ph = tf.placeholder(tf.float32) # Embedding layer embeddings_var = tf.Variable(tf.random_uniform( [vocabulary_size, EMBEDDING_DIM], -1.0, 1.0), trainable=True) batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph) # (Bi-)RNN layer(-s) rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(GRUCell(HIDDEN_SIZE), GRUCell(HIDDEN_SIZE), inputs=batch_embedded, sequence_length=seq_len_ph, dtype=tf.float32) # Attention layer attention_output, alphas = attention(rnn_outputs, ATTENTION_SIZE, return_alphas=True) # Dropout drop = tf.nn.dropout(attention_output, keep_prob_ph) # Fully connected layer W = tf.Variable(tf.truncated_normal(
def presentation_transformer(self, inputs, inputs_actual_length):
    """Encode ``inputs`` with a Bi-GRU, the encoder stack and attention pooling.

    Args:
        inputs: input tensor for the bidirectional dynamic RNN
            (the original comment describes it as
            [batch_size, n_steps, n_input]).
        inputs_actual_length: per-example sequence lengths passed to the RNN
            as ``sequence_length``.

    Returns:
        The encoder-stack output summed over axis 1, weighted by the softmax
        attention scores.
    """
    with tf.variable_scope('presentation_layer', reuse=tf.AUTO_REUSE):
        with tf.name_scope('structure_presentation_layer'):
            # Build the plain and the dropout-wrapped cell for each direction;
            # which pair is used is decided below.
            forward_gru = GRUCell(num_units=self.hidden_num)
            forward_gru_dropout = DropoutWrapper(forward_gru, output_keep_prob=self.dropout)
            backward_gru = GRUCell(num_units=self.hidden_num)
            backward_gru_dropout = DropoutWrapper(backward_gru, output_keep_prob=self.dropout)

            # Dropout is only applied while training and not extracting.
            if not self.is_train or self.is_extract:
                chosen_fw, chosen_bw = forward_gru, backward_gru
            else:
                chosen_fw, chosen_bw = forward_gru_dropout, backward_gru_dropout

            directional_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=chosen_fw,
                cell_bw=chosen_bw,
                inputs=inputs,
                sequence_length=inputs_actual_length,
                dtype=tf.float32)

            # The RNN returns one output sequence per direction; join them on
            # the feature axis, then normalize.
            merged = tf.concat(directional_outputs, axis=2)
            structure_output = self.layer_normalization(merged)

        with tf.name_scope('transformer_layer'):
            transformer_output = self.encoder_stack(structure_output, self.is_train)

        with tf.name_scope('global_attention_layer'):
            w_omega = tf.get_variable(
                name='w_omega',
                shape=[self.hidden_num * 2, self.attention_num],
                initializer=tf.random_normal_initializer())
            b_omega = tf.get_variable(
                name='b_omega',
                shape=[self.attention_num],
                initializer=tf.random_normal_initializer())
            u_omega = tf.get_variable(
                name='u_omega',
                shape=[self.attention_num],
                initializer=tf.random_normal_initializer())

            projected = tf.tensordot(transformer_output, w_omega, axes=1) + b_omega
            hidden = tf.tanh(projected)
            scores = tf.tensordot(hidden, u_omega, axes=1, name='vu')  # (B,T) shape
            weights = tf.nn.softmax(scores, name='alphas')  # (B,T) shape

            # Add a trailing axis to the weights so they broadcast over the
            # feature dimension, then sum over axis 1.
            weighted = transformer_output * tf.expand_dims(weights, -1)
            pooled = tf.reduce_sum(weighted, 1)

        return pooled
# Graph inputs.
input_x = tf.placeholder(tf.int32, [BATCH_SIZE, None])
input_y = tf.placeholder(tf.int32, [BATCH_SIZE, Y_Class])
input_s = tf.placeholder(tf.int32, [BATCH_SIZE, None, SEN_CLASS])
sen_len_ph = tf.placeholder(tf.int32)
keep_prob_ph = tf.placeholder(tf.float32)

# Embedding layer: the initial embedding matrix is read from a pickle file.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# embedding files from a trusted source.
with open(all_path + "emb_array.pkl", "rb") as emd_file:
    emb_array = pickle.load(emd_file)
embeddings = tf.Variable(emb_array, trainable=True)
input_emd = tf.nn.embedding_lookup(embeddings, input_x)  # shape = (B, None, E)

# Bidirectional GRU; forward and backward outputs are joined on the last axis.
(f_out, b_out), _ = bi_rnn(GRUCell(HIDDEN_SIZE),
                           GRUCell(HIDDEN_SIZE),
                           input_emd,
                           sequence_length=length(input_emd),
                           dtype=tf.float32)
gru_out = tf.concat((f_out, b_out), axis=2)

# Alternative: plain RNN
# gru_out, _ = dynamic_rnn(BasicRNNCell(HIDDEN_SIZE), input_emd, sequence_length=length(input_emd), dtype=tf.float32)

# Attention layer
# attention_output, alphas = attentionMulti(gru_out, ATTENTION_SIZE, input_s, BATCH_SIZE, sen_len_ph)
attention_output, w_a, b_omega, u_omega = attention(gru_out, ATTENTION_SIZE)

# Size of the last (embedding) dimension of the looked-up inputs.
hidden_size = input_emd.shape[2].value
def p_cbhg(inputs, input_lengths, is_training, scope, K, projections, depth):
    """Build a CBHG module: conv bank, projections, highway net, Bi-GRU.

    Args:
        inputs: input tensor.
        input_lengths: sequence lengths handed to the bidirectional RNN.
        is_training: flag forwarded to every ``conv1d`` call.
        scope: name of the enclosing variable scope.
        K: the conv bank uses kernel sizes 1..K (inclusive).
        projections: sequence whose first two entries are the channel counts
            of the two projection convolutions.
        depth: must be even; half of it is used as the highway width and as
            the number of units of each GRU direction.

    Returns:
        The forward and backward GRU output sequences concatenated on axis 2.
    """
    with tf.variable_scope(scope):
        # Convolution bank: one 128-channel conv per kernel size, stacked on
        # the channel axis.
        with tf.variable_scope('p_conv_bank'):
            bank = []
            for kernel_size in range(1, K + 1):
                bank.append(
                    conv1d(inputs, kernel_size, 128, tf.nn.relu, is_training,
                           'p_conv1d_%d' % kernel_size))
            conv_outputs = tf.concat(bank, axis=-1)

        # Max pooling with width 2 and stride 1.
        pooled = tf.layers.max_pooling1d(
            conv_outputs, pool_size=2, strides=1, padding='same')

        # Two width-3 projection convolutions; the second has no activation.
        first_proj = conv1d(pooled, 3, projections[0], tf.nn.relu, is_training, 'p_proj_1')
        second_proj = conv1d(first_proj, 3, projections[1], None, is_training, 'p_proj_2')

        # Residual connection back to the block input.
        highway_in = second_proj + inputs

        half_depth = depth // 2
        assert depth == half_depth * 2, 'encoder and postnet depths must be even.'

        # Project to the highway width when the channel count differs.
        if highway_in.shape[2] != half_depth:
            highway_in = tf.layers.dense(highway_in, half_depth)

        # Four stacked highway layers.
        for layer_no in range(1, 5):
            highway_in = highwaynet(highway_in, 'p_highway_%d' % layer_no, half_depth)

        # Bidirectional GRU over the highway output.
        directional_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            GRUCell(half_depth),
            GRUCell(half_depth),
            highway_in,
            sequence_length=input_lengths,
            dtype=tf.float32)

        return tf.concat(directional_outputs, axis=2)
def __init__(self, sequence_length_head, sequence_length_body, num_classes,
             vocab_size_head, vocab_size_body, embedding_size, filter_sizes,
             num_filters, l2_reg_lambda=0.1):
    """Build a two-input (head/body) Bi-GRU + CNN classifier graph.

    Args:
        sequence_length_head: fixed length of the head token sequences.
        sequence_length_body: fixed length of the body token sequences.
        num_classes: width of the label placeholder and of the output layer.
        vocab_size_head: number of rows of the head embedding matrix.
        vocab_size_body: number of rows of the body embedding matrix.
        embedding_size: embedding width; also the number of GRU units.
        filter_sizes: iterable of convolution filter heights.
        num_filters: only used to compute the flatten width
            (num_filters * len(filter_sizes)).
        l2_reg_lambda: weight of the L2 penalty on the output layer.
    """
    # Placeholders for labels, dropout keep probability and both inputs.
    self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
    self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
    self.input_x_head = tf.placeholder(tf.int32, [None, sequence_length_head], name="input_x_head")
    self.input_x_body = tf.placeholder(tf.int32, [None, sequence_length_body], name="input_x_body")

    # Embedding layer. Both embedding matrices are randomly initialised and
    # created with trainable=False, so they are not updated by training.
    self.embeddings_head = tf.Variable(tf.random_uniform(
        [vocab_size_head, embedding_size], -1.0, 1.0), trainable=False)
    self.embedded_chars_head = tf.nn.embedding_lookup(
        self.embeddings_head, self.input_x_head)
    self.embedded_chars_expanded_head = tf.expand_dims(
        self.embedded_chars_head, -1)
    self.embeddings_body = tf.Variable(tf.random_uniform(
        [vocab_size_body, embedding_size], -1.0, 1.0), trainable=False)
    self.embedded_chars_body = tf.nn.embedding_lookup(
        self.embeddings_body, self.input_x_body)
    self.embedded_chars_expanded_body = tf.expand_dims(
        self.embedded_chars_body, -1)

    # 2. Recurrent layer. The attributes are called "lstm_*" but the cells
    # are GRUCells run through bi_rnn. The two directions are concatenated on
    # axis 2 and a trailing channel axis is added for conv2d.
    with tf.variable_scope("lstm-head") as scope:
        self.lstm_out_head, self.lstm_state_head = bi_rnn(
            GRUCell(embedding_size), GRUCell(embedding_size),
            inputs=self.embedded_chars_head, dtype=tf.float32)
        self.lstm_out_merge_head = tf.concat(self.lstm_out_head, axis=2)
        self.lstm_out_expanded_head = tf.expand_dims(
            self.lstm_out_merge_head, -1)
        # Debug output of the static shape.
        print(self.lstm_out_expanded_head.shape)
    with tf.variable_scope("lstm-body") as scope:
        self.lstm_out_body, self.lstm_state_body = bi_rnn(
            GRUCell(embedding_size), GRUCell(embedding_size),
            inputs=self.embedded_chars_body, dtype=tf.float32)
        self.lstm_out_merge_body = tf.concat(self.lstm_out_body, axis=2)
        self.lstm_out_expanded_body = tf.expand_dims(
            self.lstm_out_merge_body, -1)
        # Debug output of the static shape.
        print(self.lstm_out_expanded_body.shape)

    # Head branch: one convolution (256 output channels) + max-pool per
    # filter size. The filter spans the full 2 * embedding_size width.
    self.pooled_outputs_head = []
    for i, filter_size in enumerate(filter_sizes):
        with tf.name_scope("conv-maxpool-head-%s" % filter_size):
            # Convolution Layer
            filter_shape = [filter_size, embedding_size * 2, 1, 256]
            W_head = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W_head")
            b_head = tf.Variable(tf.constant(0.1, shape=[256]), name="b_head")
            conv_head = tf.nn.conv2d(self.lstm_out_expanded_head, W_head,
                                     strides=[1, 1, 1, 1], padding="VALID", name="conv")
            # Apply nonlinearity
            h_head = tf.nn.relu(tf.nn.bias_add(conv_head, b_head), name="relu_head")
            # Max-pool over every position the VALID convolution produced.
            pooled_head = tf.nn.max_pool(
                h_head,
                ksize=[1, sequence_length_head - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1], padding='VALID', name="pool")
            self.pooled_outputs_head.append(pooled_head)

    # Body branch: same structure with 1024 output channels.
    self.pooled_outputs_body = []
    for i, filter_size in enumerate(filter_sizes):
        with tf.name_scope("conv-maxpool-body-%s" % filter_size):
            # Convolution Layer
            filter_shape = [filter_size, embedding_size * 2, 1, 1024]
            W_body = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W_body")
            b_body = tf.Variable(tf.constant(0.1, shape=[1024]), name="b_body")
            conv_body = tf.nn.conv2d(self.lstm_out_expanded_body, W_body,
                                     strides=[1, 1, 1, 1], padding="VALID", name="conv")
            # Apply nonlinearity
            h_body = tf.nn.relu(tf.nn.bias_add(conv_body, b_body), name="relu_body")
            # Max-pool over every position the VALID convolution produced.
            pooled_body = tf.nn.max_pool(
                h_body,
                ksize=[1, sequence_length_body - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1], padding='VALID', name="pool")
            self.pooled_outputs_body.append(pooled_body)

    l2_loss = tf.constant(0.0)

    # NOTE(review): both arguments here are Python lists of tensors, so each
    # list is presumably stacked into one tensor before the concat on the last
    # axis (256 + 1024 = 1280 channels) -- confirm the resulting rank.
    pooled_outputs = tf.concat(
        [self.pooled_outputs_head, self.pooled_outputs_body], -1, name='preconcat')
    print(pooled_outputs.shape)
    # NOTE(review): the flatten width comes from num_filters, while W_fc1
    # below hard-codes 1280 input features. The two only agree when
    # num_filters * len(filter_sizes) == 1280 -- verify against callers.
    num_filters_total = num_filters * len(filter_sizes)
    self.h_pool = tf.concat(pooled_outputs, 3, name='concat')
    self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

    # Three fully connected ReLU layers.
    # NOTE(review): the second and third layers reuse the names "W_fc1" and
    # "b_fc1"; this looks like a copy-paste slip. Renaming would change the
    # variable names stored in existing checkpoints.
    W_fc1 = tf.Variable(tf.truncated_normal([1280, 1024], stddev=0.1), name="W_fc1")
    b_fc1 = tf.Variable(tf.constant(0.1, shape=[1024]), name="b_fc1")
    h_fc1 = tf.nn.relu(tf.matmul(self.h_pool_flat, W_fc1) + b_fc1)
    W_fc2 = tf.Variable(tf.truncated_normal([1024, 1024], stddev=0.1), name="W_fc1")
    b_fc2 = tf.Variable(tf.constant(0.1, shape=[1024]), name="b_fc1")
    h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2)
    W_fc3 = tf.Variable(tf.truncated_normal([1024, 1024], stddev=0.1), name="W_fc1")
    b_fc3 = tf.Variable(tf.constant(0.1, shape=[1024]), name="b_fc1")
    h_fc3 = tf.nn.relu(tf.matmul(h_fc2, W_fc3) + b_fc3)

    # Add dropout
    with tf.name_scope("dropout"):
        self.h_drop = tf.nn.dropout(h_fc3, self.dropout_keep_prob)

    # Final (unnormalized) scores and predictions. Only the output layer's
    # weights and bias contribute to the L2 penalty.
    with tf.name_scope("output"):
        self.W = tf.get_variable(
            "W", shape=[1024, num_classes],
            initializer=tf.contrib.layers.xavier_initializer())
        self.b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
        l2_loss += tf.nn.l2_loss(self.W)
        l2_loss += tf.nn.l2_loss(self.b)
        self.scores = tf.nn.xw_plus_b(self.h_drop, self.W, self.b, name="scores")
        self.probabilities = tf.nn.softmax(self.scores)
        self.predictions = tf.argmax(self.scores, 1, name="predictions")

    # Mean cross-entropy loss plus the weighted L2 penalty.
    with tf.name_scope("loss"):
        print(self.scores.shape)
        losses = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.scores, labels=self.input_y)
        self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

    # Accuracy
    with tf.name_scope("accuracy"):
        # NOTE(review): print() does no %-formatting, so this emits the
        # literal "%d/%d" followed by the two tensor objects.
        print("%d/%d", self.predictions, self.input_y)
        correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
# Graph inputs: token-id batch, float targets and per-example lengths.
with tf.name_scope('Inputs'):
    batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH], name='batch_ph')
    target_ph = tf.placeholder(tf.float32, [None], name='target_ph')
    seq_len_ph = tf.placeholder(tf.int32, [None], name='seq_len_ph')

# Embedding layer: trainable matrix initialised uniformly in [-1, 1).
with tf.name_scope('Embedding_layer'):
    embeddings_var = tf.Variable(tf.random_uniform(
        [vocabulary_size, EMBEDDING_DIM], -1.0, 1.0), trainable=True)
    tf.summary.histogram('embeddings_var', embeddings_var)
    batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)

# (Bi-)RNN layer(-s): one GRU per direction, bounded by seq_len_ph.
rnn_outputs, _ = bi_rnn(GRUCell(HIDDEN_SIZE), GRUCell(HIDDEN_SIZE),
                        inputs=batch_embedded, sequence_length=seq_len_ph,
                        dtype=tf.float32)
tf.summary.histogram('RNN_outputs', rnn_outputs)
# NOTE(review): the concatenated outputs are not used in the visible code;
# attention below receives rnn_outputs directly -- confirm whether
# rnn_outputs_cat is read later in the script.
rnn_outputs_cat = tf.concat(rnn_outputs, 2)

# Attention layer
with tf.name_scope('Attention_layer'):
    attention_output, alphas = attention(rnn_outputs, ATTENTION_SIZE, return_alphas=True)
    tf.summary.histogram('alphas', alphas)

# Dropout
# NOTE(review): the keep probability is the constant KEEP_PROB rather than a
# placeholder, so the same value applies whenever `drop` is evaluated --
# confirm this is intended for evaluation runs.
drop = tf.nn.dropout(attention_output, KEEP_PROB)
def __init__(self, config, name_scope, forward_only=False, num_samples=512, dtype=tf.float32):
    """Build the bucketed embedding-attention seq2seq generator graph.

    Args:
        config: object providing vocab_size, emb_dim, buckets, learning_rate,
            learning_rate_decay_factor, batch_size, num_layers and
            max_gradient_norm.
        name_scope: substring used to pick this model's variables (matched
            against variable names) for training and for the saver.
        forward_only: Python bool; when true no gradient/update ops are built.
            Note that ``self.forward_only`` is a separate boolean placeholder
            that switches decoding behaviour at run time.
        num_samples: number of sampled classes for the sampled-softmax loss.
            Sampled softmax and the output projection are only set up when
            0 < num_samples < vocab_size.
        dtype: dtype of the learning rate, the projection variables and the
            target-weight placeholders.
    """
    source_vocab_size = config.vocab_size
    target_vocab_size = config.vocab_size
    emb_dim = config.emb_dim

    self.buckets = config.buckets
    self.learning_rate = tf.Variable(float(config.learning_rate), trainable=False, dtype=dtype)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * config.learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)
    self.batch_size = config.batch_size
    self.num_layers = config.num_layers
    self.max_gradient_norm = config.max_gradient_norm
    self.mc_search = tf.placeholder(tf.bool, name="mc_search")
    self.forward_only = tf.placeholder(tf.bool, name="forward_only")
    self.up_reward = tf.placeholder(tf.bool, name="up_reward")
    self.reward_bias = tf.get_variable("reward_bias", [1], dtype=tf.float32)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if num_samples > 0 and num_samples < target_vocab_size:
        w_t = tf.get_variable("proj_w", [target_vocab_size, emb_dim], dtype=dtype)
        w = tf.transpose(w_t)
        b = tf.get_variable("proj_b", [target_vocab_size], dtype=dtype)
        output_projection = (w, b)

        def sampled_loss(inputs, labels):
            labels = tf.reshape(labels, [-1, 1])
            # We need to compute the sampled_softmax_loss using 32bit floats to
            # avoid numerical instabilities.
            local_w_t = tf.cast(w_t, tf.float32)
            local_b = tf.cast(b, tf.float32)
            local_inputs = tf.cast(inputs, tf.float32)
            return tf.cast(
                tf.nn.sampled_softmax_loss(weights=local_w_t,
                                           biases=local_b,
                                           inputs=local_inputs,
                                           labels=labels,
                                           num_sampled=num_samples,
                                           num_classes=target_vocab_size),
                dtype)

        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    # Each layer gets its own GRUCell object. Repeating one object
    # ([cell] * n) makes every layer share the same parameters on
    # TensorFlow >= 1.2, because a cell reuses the variables from its first use.
    single_cell = GRUCell(emb_dim)
    cell = single_cell
    if self.num_layers > 1:
        cell = MultiRNNCell([GRUCell(emb_dim) for _ in range(self.num_layers)])

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return rl_seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            cell,
            num_encoder_symbols=source_vocab_size,
            num_decoder_symbols=target_vocab_size,
            embedding_size=emb_dim,
            output_projection=output_projection,
            feed_previous=do_decode,
            mc_search=self.mc_search,
            dtype=dtype)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in range(self.buckets[-1][0]):  # Last bucket is the biggest one.
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
    for i in range(self.buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(dtype, shape=[None], name="weight{0}".format(i)))
    self.reward = [tf.placeholder(tf.float32, name="reward_%i" % i)
                   for i in range(len(self.buckets))]

    # Our targets are decoder inputs shifted by one.
    targets = [self.decoder_inputs[i + 1] for i in range(len(self.decoder_inputs) - 1)]

    self.outputs, self.losses, self.encoder_state = rl_seq2seq.model_with_buckets(
        self.encoder_inputs, self.decoder_inputs, targets, self.target_weights,
        self.buckets, source_vocab_size, self.batch_size,
        lambda x, y: seq2seq_f(x, y, tf.where(self.forward_only, True, False)),
        output_projection=output_projection,
        softmax_loss_function=softmax_loss_function)

    # When decoding, project the outputs through the output projection.
    # Without a projection (sampled softmax disabled) the outputs are left
    # untouched instead of indexing into None.
    if output_projection is not None:
        for b in range(len(self.buckets)):
            self.outputs[b] = [
                tf.cond(
                    self.forward_only,
                    lambda: tf.matmul(output, output_projection[0]) + output_projection[1],
                    lambda: output
                )
                for output in self.outputs[b]
            ]

    if not forward_only:
        with tf.name_scope("gradient_descent"):
            self.gradient_norms = []
            self.updates = []
            self.aj_losses = []
            # Only variables whose name contains name_scope are trained.
            self.gen_params = [p for p in tf.trainable_variables() if name_scope in p.name]
            # opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            # NOTE(review): Adam is built with its default learning rate, so
            # self.learning_rate and its decay op do not affect training.
            opt = tf.train.AdamOptimizer()
            for b in range(len(self.buckets)):
                # R is only referenced by the commented-out variant below; the
                # op is kept so the graph matches the original.
                R = tf.subtract(self.reward[b], self.reward_bias)
                # self.reward[b] = self.reward[b] - reward_bias
                # With up_reward set, the reward is subtracted from the loss.
                adjusted_loss = tf.cond(self.up_reward,
                                        lambda: tf.subtract(self.losses[b], self.reward[b]),
                                        lambda: self.losses[b])
                # adjusted_loss = tf.cond(self.up_reward,
                #                         lambda: tf.mul(self.losses[b], R),
                #                         lambda: self.losses[b])
                self.aj_losses.append(adjusted_loss)
                gradients = tf.gradients(adjusted_loss, self.gen_params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, self.max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(opt.apply_gradients(
                    zip(clipped_gradients, self.gen_params), global_step=self.global_step))

    # The saver covers every global variable whose name contains name_scope.
    self.gen_variables = [k for k in tf.global_variables() if name_scope in k.name]
    self.saver = tf.train.Saver(self.gen_variables)
def __init__(self, hidden_size, num_layers, name):
    """Store the configuration and create the GRU cell.

    Args:
        hidden_size: number of units of the GRU cell.
        num_layers: stored as ``_num_layers``; not otherwise used here.
        name: stored as ``name``.
    """
    gru = GRUCell(num_units=hidden_size)
    self._cell = gru
    self._hidden_size, self._num_layers = hidden_size, num_layers
    self.name = name
def __init__(self, word_dict, embedding_matrix, d_len, q_len, sess, embedding_dim,
             hidden_size, num_layers, weight_path, use_lstm=False):
    """Initialise the model and build its graph.

    Suffix conventions used in tensor names:
        b ... batch_size
        t ... d_len
        f ... hidden_size*2
        i ... candidate_len

    Args:
        word_dict: stored as ``self.word_dict``.
        embedding_matrix: initial value of the embedding variable; its length
            is stored as ``self.vocab_size``.
        d_len: fixed document length of the ``d_input`` placeholder.
        q_len: fixed question length of the ``q_input`` placeholder.
        sess: stored as ``self.sess``.
        embedding_dim: only used in the start-up log message.
        hidden_size: number of units of every recurrent cell.
        num_layers: number of stacked recurrent layers per direction.
        weight_path: stored as ``self.weight_path``.
        use_lstm: use LSTM cells when true, GRU cells otherwise.
    """
    self.weight_path = weight_path
    self.word_dict = word_dict
    self.vocab_size = len(embedding_matrix)
    self.d_len = d_len
    self.q_len = q_len
    self.sess = sess
    self.A_len = 10  # number of answer candidates per sample
    logging.info("Embedding matrix shape:%d x %d" % (len(embedding_matrix), embedding_dim))

    def _new_cell():
        """Return a fresh recurrent cell of the configured type."""
        if use_lstm:
            return LSTMCell(num_units=hidden_size)
        return GRUCell(num_units=hidden_size)

    def _new_stack():
        """Return a multi-layer cell whose layers are distinct cell objects."""
        return MultiRNNCell(cells=[_new_cell() for _ in range(num_layers)])

    # Kept as an attribute for callers. The encoders below build their own
    # cells: reusing one cell object for every layer, both directions and both
    # encoders makes them share parameters on TensorFlow >= 1.2, because a
    # cell reuses the variables from its first use.
    self.rnn_cell = _new_cell()
    self.cell_name = "LSTM" if use_lstm else "GRU"

    # Word-embedding matrix.
    with tf.device("/cpu:0"):
        embedding = tf.Variable(initial_value=embedding_matrix,
                                name="embedding_matrix_w",
                                dtype="float32")

    # Model inputs and outputs.
    self.q_input = tf.placeholder(dtype=tf.int32, shape=(None, self.q_len), name="q_input")
    self.d_input = tf.placeholder(dtype=tf.int32, shape=(None, self.d_len), name="d_input")
    self.context_mask_bt = tf.placeholder(dtype=tf.float32, shape=(None, self.d_len),
                                          name="context_mask_bt")
    self.candidates_bi = tf.placeholder(dtype=tf.int32, shape=(None, self.A_len),
                                        name="candidates_bi")
    self.y_true = tf.placeholder(shape=(None, self.A_len), dtype=tf.float32, name="y_true")

    # Per-sample input lengths, computed as the count of non-zero token ids.
    d_lens = tf.reduce_sum(tf.sign(tf.abs(self.d_input)), 1)
    q_lens = tf.reduce_sum(tf.sign(tf.abs(self.q_input)), 1)

    with tf.variable_scope('q_encoder',
                           initializer=tf.contrib.layers.xavier_initializer()):
        # Question encoder.
        # output shape: (None, max_q_length, embedding_dim)
        q_embed = tf.nn.embedding_lookup(embedding, self.q_input)
        outputs, last_states = tf.nn.bidirectional_dynamic_rnn(
            cell_bw=_new_stack(),
            cell_fw=_new_stack(),
            dtype="float32",
            sequence_length=q_lens,
            inputs=q_embed,
            swap_memory=True)
        # q_encoder output shape: (None, hidden_size * 2)
        # Uses the top layer's final state of each direction.
        q_encode = tf.concat([last_states[0][-1], last_states[1][-1]], axis=-1)
        logging.info("q_encode shape {}".format(q_encode.get_shape()))

    with tf.variable_scope('d_encoder',
                           initializer=tf.contrib.layers.xavier_initializer()):
        # Context-document encoder.
        # output shape: (None, max_d_length, embedding_dim)
        d_embed = tf.nn.embedding_lookup(embedding, self.d_input)
        outputs, last_states = tf.nn.bidirectional_dynamic_rnn(
            cell_bw=_new_stack(),
            cell_fw=_new_stack(),
            dtype="float32",
            sequence_length=d_lens,
            inputs=d_embed,
            swap_memory=True)
        # d_encoder output shape: (None, max_d_length, hidden_size * 2)
        d_encode = tf.concat(outputs, axis=-1)
        logging.info("d_encode shape {}".format(d_encode.get_shape()))

    def att_dot(x):
        """Dot-product attention between document states and the question."""
        d_btf, q_bf = x
        res = K.batch_dot(tf.expand_dims(q_bf, -1), d_btf, (1, 2))
        return tf.reshape(res, [-1, self.d_len])

    with tf.variable_scope('merge'):
        mem_attention_pre_soft_bt = att_dot([d_encode, q_encode])
        # NOTE(review): the mask is multiplied into the logits before the
        # softmax, so masked positions get logit 0 rather than zero
        # probability -- confirm this is intended.
        mem_attention_pre_soft_masked_bt = tf.multiply(mem_attention_pre_soft_bt,
                                                       self.context_mask_bt,
                                                       name="attention_mask")
        mem_attention_bt = tf.nn.softmax(logits=mem_attention_pre_soft_masked_bt,
                                         name="softmax_attention")

    # Attention-sum: add up the attention mass of every occurrence of each
    # candidate word in the document.
    def sum_prob_of_word(word_ix, sentence_ixs, sentence_attention_probs):
        word_ixs_in_sentence = tf.where(tf.equal(sentence_ixs, word_ix))
        return tf.reduce_sum(tf.gather(sentence_attention_probs, word_ixs_in_sentence))

    # noinspection PyUnusedLocal
    def sum_probs_single_sentence(prev, cur):
        candidate_indices_i, sentence_ixs_t, sentence_attention_probs_t = cur
        result = tf.scan(
            fn=lambda previous, x: sum_prob_of_word(
                x, sentence_ixs_t, sentence_attention_probs_t),
            elems=[candidate_indices_i],
            initializer=tf.constant(0., dtype="float32"))
        return result

    def sum_probs_batch(candidate_indices_bi, sentence_ixs_bt, sentence_attention_probs_bt):
        result = tf.scan(
            fn=sum_probs_single_sentence,
            elems=[candidate_indices_bi, sentence_ixs_bt, sentence_attention_probs_bt],
            initializer=tf.Variable([0] * self.A_len, dtype="float32"))
        return result

    # Attention-sum output shape: (None, i), i = max_candidate_length = 10
    self.y_hat = sum_probs_batch(self.candidates_bi, self.d_input, mem_attention_bt)

    # Cross-entropy loss: normalise the candidate scores over the last axis.
    output = self.y_hat / tf.reduce_sum(
        self.y_hat,
        reduction_indices=len(self.y_hat.get_shape()) - 1,
        keep_dims=True)
    # manual computation of crossentropy
    epsilon = tf.convert_to_tensor(_EPSILON, output.dtype.base_dtype, name="epsilon")
    output = tf.clip_by_value(output, epsilon, 1. - epsilon)
    self.loss = tf.reduce_mean(
        -tf.reduce_sum(self.y_true * tf.log(output),
                       reduction_indices=len(output.get_shape()) - 1))

    # Number of samples whose arg-max prediction matches the label.
    self.correct_prediction = tf.reduce_sum(
        tf.sign(
            tf.cast(
                tf.equal(tf.argmax(self.y_hat, 1), tf.argmax(self.y_true, 1)),
                "float")))

    # Saver used to serialise the model.
    self.saver = tf.train.Saver()
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None,
               stop_token_targets=None, global_step=None):
    """Build the Tacotron graph inside the 'inference' variable scope.

    Stores inputs, input_lengths, mel_outputs, linear_outputs,
    stop_token_outputs, alignments and the three target tensors on the
    instance.

    Args:
        inputs: tensor of symbol IDs; it is looked up in the embedding table
            and its first dimension is used as the batch size.
        input_lengths: sequence lengths handed to the encoder CBHG.
        mel_targets: targets handed to the training helper.
        linear_targets: training mode is enabled exactly when this is not None.
        stop_token_targets: stored on the instance; not otherwise used here.
        global_step: forwarded to the training helper.
    """
    with tf.variable_scope('inference'):
        training = linear_targets is not None
        num_examples = tf.shape(inputs)[0]
        hparams = self._hparams

        # Symbol embeddings.
        table = tf.get_variable(
            'embedding',
            [len(symbols), hparams.embed_depth],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded = tf.nn.embedding_lookup(table, inputs)

        # Encoder module: prenet followed by the CBHG network.
        prenet_out = prenet(embedded, training, hparams.prenet_depths)
        encoder_out = encoder_cbhg(prenet_out, input_lengths, training,
                                   hparams.encoder_depth)

        # Location-sensitive attention over the encoder outputs.
        attention = LocationSensitiveAttention(hparams.attention_depth, encoder_out)

        # Decoder RNN: two residual GRU layers.
        rnn_stack = MultiRNNCell(
            [ResidualWrapper(GRUCell(hparams.decoder_depth)) for _ in range(2)],
            state_is_tuple=True)

        # Frame projection (num_mels * outputs_per_step values per step) and
        # stop-token projection.
        frame_proj = FrameProjection(hparams.num_mels * hparams.outputs_per_step)
        stop_proj = StopProjection(training, shape=hparams.outputs_per_step)

        # Decoder cell combining attention, RNN stack and both projections.
        decoder_cell = TacotronDecoderWrapper(
            training, attention, rnn_stack, frame_proj, stop_proj)

        # Training uses the training helper; otherwise the test helper is used
        # (per the original comment, it predicts with the stop token).
        helper = (
            TacoTrainingHelper(inputs, mel_targets, hparams.num_mels,
                               hparams.outputs_per_step, global_step)
            if training else
            TacoTestHelper(num_examples, hparams.num_mels, hparams.outputs_per_step))

        # Initial decoder state.
        initial_state = decoder_cell.zero_state(batch_size=num_examples, dtype=tf.float32)

        decoder = CustomDecoder(decoder_cell, helper, initial_state)
        (frames, stop_tokens, _), final_state, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder, maximum_iterations=hparams.max_iters)

        # Unpack the grouped frames: one mel frame / one stop token per entry.
        mel_outputs = tf.reshape(frames, [num_examples, -1, hparams.num_mels])
        stop_token_outputs = tf.reshape(stop_tokens, [num_examples, -1])

        # Post-processing network followed by a dense projection to num_freq.
        post_out = post_cbhg(mel_outputs, hparams.num_mels, training, hparams.postnet_depth)
        linear_outputs = tf.layers.dense(post_out, hparams.num_freq)

        # Alignments are taken from the final decoder state.
        history = final_state.alignment_history.stack()
        alignments = tf.transpose(history, [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.stop_token_outputs = stop_token_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.stop_token_targets = stop_token_targets
def define_sequence_model(self):
    """Build the sequence (RNN) model graph inside ``self.graph``.

    Reads ``self.n_in``, ``self.n_out``, ``self.dropout_rate``,
    ``self.hidden_layer_type``, ``self.hidden_layer_size``,
    ``self.output_type`` and ``self.optimizer``.

    Placeholders and the output tensor are registered in graph collections
    ("utt_length", "input_layer", "is_training_drop", "is_training_batch",
    "output_layer"). When ``self.optimizer == "adam"`` an ``AdamOptimizer``
    instance is stored in ``self.training_op``.

    The body runs under both Python 2 and Python 3: the former ``print``
    statement and ``xrange`` call were replaced by equivalents that produce
    the same output.
    """
    # Fixed seed for NumPy's global random generator.
    np.random.seed(12345)
    layer_list = []
    use_dropout = self.dropout_rate != 0.0

    with self.graph.as_default() as g:
        utt_length = tf.placeholder(tf.int32, shape=(None))
        g.add_to_collection(name="utt_length", value=utt_length)

        with tf.name_scope("input"):
            input_layer = tf.placeholder(dtype=tf.float32,
                                         shape=(None, None, self.n_in),
                                         name="input_layer")
            if use_dropout:
                # Single formatted string keeps the printed text identical
                # on Python 2 and Python 3.
                print("Using dropout to avoid overfitting and the dropout "
                      "rate is {}".format(self.dropout_rate))
                is_training_drop = tf.placeholder(dtype=tf.bool,
                                                  shape=(),
                                                  name="is_training_drop")
                input_layer_drop = dropout(input_layer,
                                           self.dropout_rate,
                                           is_training=is_training_drop)
                layer_list.append(input_layer_drop)
                g.add_to_collection(name="is_training_drop",
                                    value=is_training_drop)
            else:
                layer_list.append(input_layer)
        g.add_to_collection("input_layer", layer_list[0])

        with tf.name_scope("hidden_layer"):
            basic_cell = []
            if "tanh" in self.hidden_layer_type:
                # Batch-norm settings are only needed by "tanh" layers.
                is_training_batch = tf.placeholder(dtype=tf.bool,
                                                   shape=(),
                                                   name="is_training_batch")
                bn_params = {
                    "is_training": is_training_batch,
                    "decay": 0.99,
                    "updates_collections": None
                }
                g.add_to_collection("is_training_batch", is_training_batch)

            for i, layer_type in enumerate(self.hidden_layer_type):
                layer_size = self.hidden_layer_size[i]
                if use_dropout:
                    if layer_type == "tanh":
                        new_layer = fully_connected(
                            layer_list[-1],
                            layer_size,
                            activation_fn=tf.nn.tanh,
                            normalizer_fn=batch_norm,
                            normalizer_params=bn_params)
                        new_layer_drop = dropout(
                            new_layer,
                            self.dropout_rate,
                            is_training=is_training_drop)
                        layer_list.append(new_layer_drop)
                    if layer_type == "lstm":
                        basic_cell.append(
                            MyDropoutWrapper(
                                BasicLSTMCell(num_units=layer_size),
                                self.dropout_rate,
                                self.dropout_rate,
                                is_training=is_training_drop))
                    if layer_type == "gru":
                        basic_cell.append(
                            MyDropoutWrapper(
                                GRUCell(num_units=layer_size),
                                self.dropout_rate,
                                self.dropout_rate,
                                is_training=is_training_drop))
                else:
                    if layer_type == "tanh":
                        new_layer = fully_connected(
                            layer_list[-1],
                            layer_size,
                            activation_fn=tf.nn.tanh,
                            normalizer_fn=batch_norm,
                            normalizer_params=bn_params)
                        layer_list.append(new_layer)
                    # Without dropout the recurrent layers use the
                    # layer-normalised cell variants.
                    if layer_type == "lstm":
                        basic_cell.append(
                            LayerNormBasicLSTMCell(num_units=layer_size))
                    if layer_type == "gru":
                        basic_cell.append(
                            LayerNormGRUCell(num_units=layer_size))

            # All recurrent cells are stacked and run after the
            # feed-forward layers collected in layer_list.
            multi_cell = MultiRNNCell(basic_cell)
            rnn_outputs, rnn_states = tf.nn.dynamic_rnn(
                multi_cell,
                layer_list[-1],
                dtype=tf.float32,
                sequence_length=utt_length)
            layer_list.append(rnn_outputs)

        with tf.name_scope("output_layer"):
            if self.output_type == "linear":
                output_layer = tf.layers.dense(rnn_outputs, self.n_out)
            # NOTE(review): output_layer is only bound for the "linear"
            # output type; any other value fails on the next statement
            # with a NameError, exactly as before this change.
            g.add_to_collection(name="output_layer", value=output_layer)

        with tf.name_scope("training_op"):
            if self.optimizer == "adam":
                # Stores the optimizer object itself; no minimize() call
                # is made here.
                self.training_op = tf.train.AdamOptimizer()
def __init__(self, num_items, num_embed_units, num_units, num_layers,
             vocab=None, embed=None, learning_rate=5e-4,
             learning_rate_decay_factor=0.95, max_gradient_norm=5.0,
             use_lstm=True):
    """Build the environment model graph: encoder, losses and updates.

    Args:
        num_items: number of rows of the item embedding when ``embed`` is
            not supplied.
        num_embed_units: embedding width.
        num_units: hidden size of every recurrent layer.
        num_layers: number of stacked recurrent layers.
        vocab: not used in this constructor.
        embed: optional initial value for the embedding table.
        learning_rate: initial value of the learning-rate variable.
        learning_rate_decay_factor: multiplier applied by
            ``learning_rate_decay_op``.
        max_gradient_norm: global-norm clipping threshold.
        use_lstm: choose ``LSTMCell`` (True) or ``GRUCell`` (False).

    Reads the module-level ``FLAGS.metric`` and ``FLAGS.action_num``.
    """
    # Epoch counter and the op that increments it.
    self.epoch = tf.Variable(0, trainable=False, name='env/epoch')
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)

    # Graph inputs.
    self.sessions_input = tf.placeholder(tf.int32, shape=(None, None))
    self.rec_lists = tf.placeholder(tf.int32, shape=(None, None, None))
    self.rec_mask = tf.placeholder(tf.float32, shape=(None, None, None))
    self.aims_idx = tf.placeholder(tf.int32, shape=(None, None))
    self.sessions_length = tf.placeholder(tf.int32, shape=None)
    self.purchase = tf.placeholder(tf.int32, shape=(None, None))

    # Item embedding: initialised from `embed` when given, random otherwise.
    if embed is not None:
        self.embed = tf.get_variable('env/embed',
                                     dtype=tf.float32,
                                     initializer=embed)
    else:
        self.embed = tf.get_variable(
            'env/embed', [num_items, num_embed_units],
            tf.float32,
            initializer=tf.initializers.truncated_normal(0, 1))

    batch_size = tf.shape(self.sessions_input)[0]
    encoder_length = tf.shape(self.sessions_input)[1]
    rec_length = tf.shape(self.rec_lists)[2]

    # Reverse cumulative sum of a one-hot at (length - 2): ones up to and
    # including that position, zeros afterwards.
    last_step_one_hot = tf.one_hot(self.sessions_length - 2, encoder_length)
    encoder_mask = tf.reshape(
        tf.cumsum(last_step_one_hot, reverse=True, axis=1),
        [-1, encoder_length])

    # batch * len * unit (per the original note)
    self.encoder_input = tf.nn.embedding_lookup(self.embed,
                                                self.sessions_input)
    self.aims = tf.one_hot(self.aims_idx, rec_length)

    layer_cls = LSTMCell if use_lstm else GRUCell
    cell = MultiRNNCell([layer_cls(num_units) for _ in range(num_layers)])

    # Training
    with tf.variable_scope("env"):
        # [batch_size, length, num_units]
        encoder_output, _ = dynamic_rnn(cell,
                                        self.encoder_input,
                                        self.sessions_length,
                                        dtype=tf.float32,
                                        scope="encoder")

        # [batch_size, length, embed_units]
        preference = tf.layers.dense(encoder_output,
                                     num_embed_units,
                                     name="pref_output")

        # [batch_size, length, rec_length, embed_units]
        gathered = tf.gather_nd(self.embed,
                                tf.expand_dims(self.rec_lists, 3))
        self.candidate = tf.reshape(
            gathered,
            [batch_size, encoder_length, rec_length, num_embed_units])

        # [batch_size, length, rec_length]
        pref_x_candidate = tf.multiply(tf.expand_dims(preference, 2),
                                       self.candidate)
        logits = tf.reduce_mean(pref_x_candidate, 3)
        mul_prob = tf.nn.softmax(logits) * self.rec_mask

        # Renormalise after masking. [batch_size, length, rec_length]
        prob_total = tf.expand_dims(tf.reduce_sum(mul_prob, 2), 2)
        self.norm_prob = mul_prob / (prob_total + 1e-20)

        # [batch_size, length, metric_num]
        _, self.argmax_index = tf.nn.top_k(self.norm_prob,
                                           k=FLAGS.metric + 1)

        # Masked cross-entropy against the one-hot aims.
        predict_xent = tf.reduce_sum(
            -self.aims * tf.log(self.norm_prob + 1e-20), 2)
        local_predict_loss = predict_xent * encoder_mask
        self.predict_loss = (tf.reduce_sum(local_predict_loss) /
                             tf.reduce_sum(encoder_mask))

        # [batch_size, length, embed_units]
        aim_embed = tf.reduce_sum(
            tf.expand_dims(self.aims, 3) * self.candidate, 2)

        # Purchase head; gradients are stopped at both of its inputs.
        # [batch_size, length, 2]
        purchase_hidden = tf.layers.dense(tf.stop_gradient(encoder_output),
                                          num_units,
                                          name="purchase_layer")
        purchase_aim = tf.layers.dense(tf.stop_gradient(aim_embed),
                                       num_units,
                                       name="purchase_aim")
        purchase_logits = tf.layers.dense(
            tf.multiply(purchase_hidden, purchase_aim),
            2,
            name="purchase_projection")
        self.purchase_prob = tf.nn.softmax(purchase_logits)

        # Cross-entropy, masked and weighted by (purchase + 1) ** 5.3.
        purchase_xent = tf.reduce_sum(
            -tf.one_hot(self.purchase, 2) *
            tf.log(self.purchase_prob + 1e-20), 2)
        masked_purchase_xent = purchase_xent * encoder_mask
        purchase_weight = tf.pow(
            tf.cast(self.purchase, tf.float32) + 1, 5.3)
        local_purchase_loss = masked_purchase_xent * purchase_weight
        self.purchase_loss = (tf.reduce_sum(local_purchase_loss) /
                              tf.reduce_sum(encoder_mask))

        self.decoder_loss = self.predict_loss + self.purchase_loss

        # Externally supplied per-step score weighting both local losses.
        self.score = tf.placeholder(tf.float32, (None, None))
        weighted_loss = self.score * (local_predict_loss +
                                      local_purchase_loss)
        self.score_loss = (tf.reduce_sum(weighted_loss) /
                           tf.reduce_sum(encoder_mask))

    # Inference (reuses the variables created above)
    with tf.variable_scope("env", reuse=True):
        # Preference computed from the last encoder step only.
        last_step_pref = tf.layers.dense(encoder_output[:, -1, :],
                                         num_embed_units,
                                         name="pref_output")
        inf_preference = tf.expand_dims(last_step_pref, 1)

        # [batch_size, 1, rec_length, embed_units]
        inf_gathered = tf.gather_nd(self.embed,
                                    tf.expand_dims(self.rec_lists, 3))
        self.inf_candidate = tf.reshape(
            inf_gathered, [batch_size, 1, rec_length, num_embed_units])

        # [batch_size, 1, rec_length]
        inf_pref_x_candidate = tf.multiply(
            tf.expand_dims(inf_preference, 2), self.inf_candidate)
        inf_logits = tf.reduce_mean(inf_pref_x_candidate, 3)
        inf_mul_prob = tf.nn.softmax(inf_logits) * self.rec_mask
        inf_prob_total = tf.expand_dims(tf.reduce_sum(inf_mul_prob, 2), 2)
        self.inf_norm_prob = inf_mul_prob / (inf_prob_total + 1e-20)

        # [batch_size, 1, metric_num]
        _, self.inf_argmax_index = tf.nn.top_k(self.inf_norm_prob,
                                               k=FLAGS.metric)

        def gumbel_max(inp, alpha, beta):
            """Perturb log-softmax(inp) with Gumbel noise and re-softmax."""
            # assert len(tf.shape(inp)) == 2
            noise = tf.random_uniform(tf.shape(inp), 0.0001, 0.9999)
            noise = -tf.log(-tf.log(noise))
            perturbed = (tf.nn.log_softmax(inp / 1.0) + noise * alpha) * beta
            return tf.nn.softmax(perturbed)

        # Randomised top-k selection. [batch_size, action_num] per the
        # original note.
        sampled_prob = gumbel_max(tf.log(self.inf_norm_prob + 1e-12), 1, 1)
        _, self.inf_random_index = tf.nn.top_k(sampled_prob,
                                               k=FLAGS.action_num)

        # Embedding of the top-1 candidate, selected via a one-hot mask.
        top1_one_hot = tf.reshape(
            tf.one_hot(self.inf_argmax_index[:, :, 0], rec_length),
            [batch_size, 1, rec_length, 1])
        inf_aim_embed = tf.reduce_sum(
            tf.cast(top1_one_hot, tf.float32) * self.inf_candidate, 2)

        # [batch_size, 1, 2] per the original note.
        inf_purchase_hidden = tf.layers.dense(
            tf.stop_gradient(encoder_output),
            num_units,
            name="purchase_layer")
        inf_purchase_aim = tf.layers.dense(tf.stop_gradient(inf_aim_embed),
                                           num_units,
                                           name="purchase_aim")
        inf_purchase_logits = tf.layers.dense(
            tf.multiply(inf_purchase_hidden, inf_purchase_aim),
            2,
            name="purchase_projection")
        self.inf_purchase_prob = tf.nn.softmax(inf_purchase_logits)

    # Learning rate with a multiplicative decay op.
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)

    self.global_step = tf.Variable(0, trainable=False)
    opt = tf.train.AdamOptimizer(self.learning_rate)
    self.params = tf.trainable_variables()

    # For pretraining
    gradients = tf.gradients(self.decoder_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    # For adversarial training
    score_gradients = tf.gradients(self.score_loss, self.params)
    score_clipped_gradients, self.score_gradient_norm = \
        tf.clip_by_global_norm(score_gradients, max_gradient_norm)
    self.score_update = opt.apply_gradients(
        zip(score_clipped_gradients, self.params),
        global_step=self.global_step)

    self.saver = tf.train.Saver(tf.global_variables(),
                                write_version=tf.train.SaverDef.V2,
                                max_to_keep=10,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
def Tensor_Generate(self):
    """Build the whole graph and store the run lists on ``self``.

    Creates the optional conv pre-net, the recurrent decoder selected by
    ``model_Parameters.Hidden_Type``, the semantic output layer, the masked
    loss with Noam learning-rate decay and ADAM updates, and the ops that
    save, back up, zero and restore the persistent RNN state. Finally runs
    the global variable initializer in ``self.tf_Session``.

    Sets ``training_Tensor_List``, ``test_Mode_Turn_On_Tensor_List``,
    ``test_Mode_Turn_Off_Tensor_List``, ``test_Tensor_List`` and
    ``hidden_Plot_Tensor_List``.

    Fix: the SCRN branch of the test-mode zeroing op used
    ``LSTMStateTuple(s=..., h=...)``; it now builds a ``SCRNStateTuple``
    like every other SCRN branch in this method.
    """
    # Original author's note: 16-bit is probably unnecessary because
    # TF 1.x does not support it well.
    if pattern_Parameters.Pattern_Use_Bit == 16:
        float_Bit_Type = tf.float16
    elif pattern_Parameters.Pattern_Use_Bit == 32:
        float_Bit_Type = tf.float32
    else:
        assert False, "Pattern_Use_Bit must be 16 or 32"

    # All patterns are fed through these placeholders.
    placeholder_Dict = self.pattern_Feeder.placeholder_Dict

    state_Shape = (model_Parameters.Batch_Size, model_Parameters.Hidden_Size)

    with tf.variable_scope('EARS') as scope:  # Variable name managing.
        # Batch size of the current pattern.
        batch_Size = tf.shape(placeholder_Dict["Acoustic"])[0]
        # The input is the acoustic pattern.
        input_Activation = placeholder_Dict["Acoustic"]

        # Convolution settings taken from the hyper parameters.
        conv_Parameters = enumerate(
            zip(model_Parameters.Prenet_Conv.Channels,
                model_Parameters.Prenet_Conv.Kernel_Sizes,
                model_Parameters.Prenet_Conv.Strides))

        # The conv pre-net is built only when Prenet_Conv.Use is set.
        if model_Parameters.Prenet_Conv.Use:
            for conv_Index, (channel, kernel_Size, stride) in conv_Parameters:
                with tf.variable_scope('Prenet_Conv_{}'.format(conv_Index)):
                    input_Activation = tf.layers.conv1d(
                        inputs=input_Activation,
                        filters=channel,
                        kernel_size=kernel_Size,
                        strides=stride,
                        padding='same',
                        activation=tf.nn.relu)
                    # Batch normalization for regularization.
                    input_Activation = tf.layers.batch_normalization(
                        inputs=input_Activation,
                        training=placeholder_Dict["Is_Training"])
                    if model_Parameters.Prenet_Conv.Dropout_Rate is not None:
                        # Dropout for regularization.
                        input_Activation = tf.layers.dropout(
                            input_Activation,
                            rate=model_Parameters.Prenet_Conv.Dropout_Rate,
                            training=placeholder_Dict["Is_Training"])

        # This model uses only the training helper (ground truth); the
        # helper decides the RNN calculation rule at each time step.
        # NOTE(review): the helper reads the raw "Acoustic" placeholder,
        # not input_Activation, so the pre-net output above is not
        # consumed here - confirm whether that is intended.
        helper = TrainingHelper(
            inputs=placeholder_Dict["Acoustic"],
            sequence_length=placeholder_Dict["Length"])

        # RNN: four hidden types are selectable. previous_RNN_State keeps
        # the state between runs ("no reset"); it starts as zeros.
        if model_Parameters.Hidden_Type in ["LSTM", 'ZoneoutLSTM']:
            if model_Parameters.Hidden_Type == "LSTM":
                rnn_Cell = LSTMCell(model_Parameters.Hidden_Size)
            elif model_Parameters.Hidden_Type == "ZoneoutLSTM":
                rnn_Cell = ZoneoutLSTMCell(
                    num_units=model_Parameters.Hidden_Size,
                    is_training=placeholder_Dict["Is_Training"],
                    cell_zoneout_rate=model_Parameters.Zoneout_Rate,
                    output_zoneout_rate=model_Parameters.Zoneout_Rate)
            # LSTM and ZoneoutLSTM need two states (c, h).
            previous_RNN_State = tf.Variable(
                initial_value=LSTMStateTuple(
                    c=tf.zeros(shape=state_Shape),
                    h=tf.zeros(shape=state_Shape)),
                trainable=False,
                dtype=float_Bit_Type)
            decoder_Initial_State = LSTMStateTuple(
                c=previous_RNN_State[0][:batch_Size],
                h=previous_RNN_State[1][:batch_Size])
        elif model_Parameters.Hidden_Type == "SCRN":
            rnn_Cell = SCRNCell(model_Parameters.Hidden_Size)
            # SCRN needs two states (s, h).
            previous_RNN_State = tf.Variable(
                initial_value=SCRNStateTuple(
                    s=tf.zeros(shape=state_Shape),
                    h=tf.zeros(shape=state_Shape)),
                trainable=False,
                dtype=float_Bit_Type)
            decoder_Initial_State = SCRNStateTuple(
                s=previous_RNN_State[0][:batch_Size],
                h=previous_RNN_State[1][:batch_Size])
        elif model_Parameters.Hidden_Type in ["GRU", "BPTT"]:
            if model_Parameters.Hidden_Type == "GRU":
                rnn_Cell = GRUCell(model_Parameters.Hidden_Size)
            elif model_Parameters.Hidden_Type == "BPTT":
                rnn_Cell = BasicRNNCell(model_Parameters.Hidden_Size)
            # A single state tensor is enough for these cells.
            previous_RNN_State = tf.Variable(
                initial_value=tf.zeros(shape=state_Shape),
                trainable=False,
                dtype=float_Bit_Type)
            decoder_Initial_State = previous_RNN_State[:batch_Size]

        # The decoder runs the RNN following the helper's rule.
        decoder = BasicDecoder(
            cell=rnn_Cell,
            helper=helper,
            initial_state=decoder_Initial_State)

        outputs, final_State, _ = dynamic_decode(
            decoder=decoder,
            output_time_major=False,
            impute_finished=True)

        hidden_Activation = outputs.rnn_output

        # Semantic projection (hidden_size -> semantic_size).
        semantic_Logits = tf.layers.dense(
            inputs=hidden_Activation,
            units=self.pattern_Feeder.semantic_Size,
            use_bias=True,
            name="semantic_Logits")

    # Back-propagation.
    with tf.variable_scope('training_Loss') as scope:
        # Zero-padded positions are masked by the pattern length so that
        # they cannot affect the weight update.
        loss_Mask = tf.sequence_mask(placeholder_Dict["Length"],
                                     dtype=tf.float32)

        # Error between the target and the output.
        loss_Calculation = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=placeholder_Dict["Semantic"],
            logits=semantic_Logits)
        loss_Calculation = tf.reduce_mean(loss_Calculation, axis=-1)
        loss_Calculation *= loss_Mask
        loss = tf.reduce_sum(loss_Calculation)

        if model_Parameters.Weight_Regularization.Use:
            # L2 penalty on every trainable variable whose name contains
            # none of the excluded keywords (case-insensitive).
            except_Keywords = \
                model_Parameters.Weight_Regularization.Except_Keywords
            loss += model_Parameters.Weight_Regularization.Rate * \
                tf.reduce_sum([
                    tf.nn.l2_loss(variable)
                    for variable in tf.get_collection(
                        tf.GraphKeys.TRAINABLE_VARIABLES)
                    if not any(
                        keyword.lower() in variable.name.lower()
                        for keyword in except_Keywords)
                ])

        # Display-only value; it does not take part in training.
        loss_Display = tf.reduce_sum(loss_Calculation, axis=0) / \
            tf.math.count_nonzero(loss_Calculation, axis=0, dtype=tf.float32)

        # Counts trained batches (not epochs); used by the learning-rate
        # decay below.
        global_Step = tf.Variable(0, name='global_Step', trainable=False)

        # Noam decay of the learning rate.
        step = tf.cast(global_Step + 1, dtype=float_Bit_Type)
        warmup_Steps = 4000.0
        learning_Rate = model_Parameters.Learning_Rate * \
            warmup_Steps**0.5 * \
            tf.minimum(step * warmup_Steps**-1.5, step**-0.5)

        # Weight update with the ADAM optimizer.
        optimizer = tf.train.AdamOptimizer(learning_Rate)
        gradients, variables = zip(*optimizer.compute_gradients(loss))
        # Global-norm clipping keeps an overly large gradient from
        # blowing up the weight update.
        clipped_Gradients, global_Norm = tf.clip_by_global_norm(
            gradients, 1.0)
        optimize = optimizer.apply_gradients(
            zip(clipped_Gradients, variables),
            global_step=global_Step)

        # "No reset": write the final state of the current batch rows back
        # into the storage variable; rows beyond batch_Size are kept.
        if model_Parameters.Hidden_Type in ["LSTM", 'ZoneoutLSTM']:
            rnn_State_Assign = tf.assign(
                ref=previous_RNN_State,
                value=LSTMStateTuple(
                    c=tf.concat([
                        final_State[0][:batch_Size],
                        previous_RNN_State[0][batch_Size:]
                    ], axis=0),
                    h=tf.concat([
                        final_State[1][:batch_Size],
                        previous_RNN_State[1][batch_Size:]
                    ], axis=0)))
        elif model_Parameters.Hidden_Type == "SCRN":
            rnn_State_Assign = tf.assign(
                ref=previous_RNN_State,
                value=SCRNStateTuple(
                    s=tf.concat([
                        final_State[0][:batch_Size],
                        previous_RNN_State[0][batch_Size:]
                    ], axis=0),
                    h=tf.concat([
                        final_State[1][:batch_Size],
                        previous_RNN_State[1][batch_Size:]
                    ], axis=0)))
        elif model_Parameters.Hidden_Type in ["GRU", "BPTT"]:
            rnn_State_Assign = tf.assign(
                ref=previous_RNN_State,
                value=tf.concat([
                    final_State[:batch_Size],
                    previous_RNN_State[batch_Size:]
                ], axis=0))

    with tf.variable_scope('test') as scope:
        # In test the previous hidden state can be zeroed on request, so
        # the saved values are backed up first.
        if model_Parameters.Hidden_Type in ["LSTM", 'ZoneoutLSTM']:
            backup_RNN_State = tf.Variable(
                initial_value=LSTMStateTuple(
                    c=tf.zeros(shape=state_Shape),
                    h=tf.zeros(shape=state_Shape)),
                trainable=False,
                dtype=float_Bit_Type)
        elif model_Parameters.Hidden_Type == "SCRN":
            backup_RNN_State = tf.Variable(
                initial_value=SCRNStateTuple(
                    s=tf.zeros(shape=state_Shape),
                    h=tf.zeros(shape=state_Shape)),
                trainable=False,
                dtype=float_Bit_Type)
        elif model_Parameters.Hidden_Type in ["GRU", "BPTT"]:
            backup_RNN_State = tf.Variable(
                initial_value=tf.zeros(shape=state_Shape),
                trainable=False,
                dtype=float_Bit_Type)

        backup_RNN_State_Assign = tf.assign(ref=backup_RNN_State,
                                            value=previous_RNN_State)

        # Zeroing only runs after the backup has been taken.
        with tf.control_dependencies([backup_RNN_State_Assign]):
            if model_Parameters.Hidden_Type in ["LSTM", 'ZoneoutLSTM']:
                zero_RNN_State_Assign = tf.assign(
                    ref=previous_RNN_State,
                    value=LSTMStateTuple(
                        c=tf.zeros(shape=state_Shape, dtype=float_Bit_Type),
                        h=tf.zeros(shape=state_Shape, dtype=float_Bit_Type)))
            elif model_Parameters.Hidden_Type == "SCRN":
                # Was LSTMStateTuple(s=..., h=...), whose `s` keyword does
                # not match the c/h keywords that tuple takes elsewhere here.
                zero_RNN_State_Assign = tf.assign(
                    ref=previous_RNN_State,
                    value=SCRNStateTuple(
                        s=tf.zeros(shape=state_Shape, dtype=float_Bit_Type),
                        h=tf.zeros(shape=state_Shape, dtype=float_Bit_Type)))
            elif model_Parameters.Hidden_Type in ["GRU", "BPTT"]:
                zero_RNN_State_Assign = tf.assign(
                    ref=previous_RNN_State,
                    value=tf.zeros(shape=state_Shape, dtype=float_Bit_Type))

        restore_RNN_State_Assign = tf.assign(ref=previous_RNN_State,
                                             value=backup_RNN_State)

        semantic_Activation = tf.nn.sigmoid(semantic_Logits)

    # Tensors fetched while training.
    self.training_Tensor_List = [
        global_Step, learning_Rate, loss_Display, optimize, rnn_State_Assign
    ]
    # Back up the hidden state, then set the stored state to zeros.
    self.test_Mode_Turn_On_Tensor_List = [
        backup_RNN_State_Assign, zero_RNN_State_Assign
    ]
    # Restore the hidden state.
    self.test_Mode_Turn_Off_Tensor_List = [restore_RNN_State_Assign]
    # In test only the semantic activation is needed.
    self.test_Tensor_List = [global_Step, semantic_Activation]
    # In hidden analysis only the hidden activation is needed.
    self.hidden_Plot_Tensor_List = [
        tf.transpose(hidden_Activation, perm=[0, 2, 1])
    ]

    # Initialize the weights in the session.
    self.tf_Session.run(tf.global_variables_initializer())
# Training-loop settings.
n_iterations = 10000
batch_size = 50

# Input sequences and their per-step regression targets.
X = tf.placeholder(tf.float32, shape=[None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, shape=[None, n_steps, n_outputs])

# Original notes (translated): at every time step the RNN emits an output
# vector (size 100 per the original note), but a single output value is
# what is actually needed. The simplest solution would be to wrap the cell
# in an OutputProjectionWrapper. A faster trick is used below instead:
# flatten the RNN outputs, apply one fully connected layer without
# activation, then reshape back to sequences.

# One ReLU-activated GRU cell per layer, stacked into a single cell.
layers = [
    GRUCell(num_units=n_neurous, activation=tf.nn.relu)
    for _ in range(n_layers)
]
multi_layer_cell = MultiRNNCell(cells=layers)

rnn_outputs, states = tf.nn.dynamic_rnn(
    cell=multi_layer_cell,
    inputs=X,
    dtype=tf.float32,
)

stacked_rnn_outputs = tf.reshape(rnn_outputs, shape=[-1, n_neurous])
stacked_outputs = fully_connected(
    stacked_rnn_outputs,
    n_outputs,
    activation_fn=None,
)
outputs = tf.reshape(stacked_outputs, shape=[-1, n_steps, n_outputs])

# Mean squared error minimised with Adam.
loss = tf.reduce_mean(tf.square(outputs - y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
):
    """Build the (optionally multi-speaker) Tacotron graph.

    Training mode is selected when ``linear_targets`` is given. With more
    than one speaker, ``hp.model_type`` decides how the speaker identity is
    injected: 'deepvoice' derives initial states for the encoder, attention
    and decoder RNNs; 'simple' passes the speaker embedding to the wrappers
    and concatenates it to the post-net output.

    Args:
        inputs: character ids; the size of the first dimension is the
            batch size.
        input_lengths: forwarded to the encoder CBHG.
        num_speakers: speaker conditioning is built when this is > 1.
        speaker_id: ids used for the speaker lookups.
        mel_targets: forwarded to ``TacoTrainingHelper`` when training.
        linear_targets: its presence switches on training mode.
        loss_coeff: only stored on ``self``.
        rnn_decoder_test_mode: forwarded to ``TacoTrainingHelper``.
        is_randomly_initialized: only stored on ``self``.

    Raises:
        Exception: for an unknown multi-speaker model type or attention
            type, or when a deepvoice initial state has the wrong shape.
    """
    self.is_randomly_initialized = is_randomly_initialized
    is_training = linear_targets is not None

    with tf.variable_scope('inference'):
        hp = self._hparams
        batch_size = tf.shape(inputs)[0]

        # Character embeddings. [N, T_in, embedding_size]
        char_embed_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        char_embedded_inputs = tf.nn.embedding_lookup(char_embed_table,
                                                      inputs)

        # Speaker-dependent initial states stay None unless the
        # multi-speaker 'deepvoice' branch below fills them in.
        before_highway = None
        encoder_rnn_init_state = None
        attention_rnn_init_state = None
        decoder_rnn_init_states = None

        self.num_speakers = num_speakers
        if self.num_speakers > 1:
            # NOTE(review): with speaker_embedding_size == 1 and
            # model_type == 'simple', speaker_embed is never assigned and
            # its first use below raises NameError.
            if hp.speaker_embedding_size != 1:
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                # [N, T_in, speaker_embedding_size] per the original note.
                speaker_embed = tf.nn.embedding_lookup(speaker_embed_table,
                                                       speaker_id)

            if hp.model_type == 'deepvoice':
                if hp.speaker_embedding_size == 1:
                    # One dedicated lookup per initial state.
                    before_highway = get_embed(speaker_id,
                                               self.num_speakers,
                                               hp.enc_prenet_sizes[-1],
                                               "before_highway")
                    encoder_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_rnn_size * 2, "encoder_rnn_init_state")
                    attention_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.attention_state_size,
                        "attention_rnn_init_state")
                    decoder_rnn_init_states = [
                        get_embed(
                            speaker_id, self.num_speakers, hp.dec_rnn_size,
                            "decoder_rnn_init_states{}".format(idx + 1))
                        for idx in range(hp.dec_layer_num)
                    ]
                else:
                    # Softsign dense projections of the shared embedding.
                    def deep_dense(x, dim):
                        return tf.layers.dense(x, dim,
                                               activation=tf.nn.softsign)

                    before_highway = deep_dense(speaker_embed,
                                                hp.enc_prenet_sizes[-1])
                    encoder_rnn_init_state = deep_dense(
                        speaker_embed, hp.enc_rnn_size * 2)
                    attention_rnn_init_state = deep_dense(
                        speaker_embed, hp.attention_state_size)
                    decoder_rnn_init_states = [
                        deep_dense(speaker_embed, hp.dec_rnn_size)
                        for _ in range(hp.dec_layer_num)
                    ]

                # deepvoice does not use speaker_embed directly
                speaker_embed = None
            elif hp.model_type == 'simple':
                # The initial states keep their None defaults.
                pass
            else:
                raise Exception(
                    " [!] Unkown multi-speaker model type: {}".format(
                        hp.model_type))
        else:
            speaker_embed = None

        ##############
        # Encoder
        ##############

        # [N, T_in, enc_prenet_sizes[-1]]
        prenet_outputs = prenet(char_embedded_inputs,
                                is_training,
                                hp.enc_prenet_sizes,
                                hp.dropout_prob,
                                scope='prenet')

        encoder_outputs = cbhg(
            prenet_outputs,
            input_lengths,
            is_training,
            hp.enc_bank_size,
            hp.enc_bank_channel_size,
            hp.enc_maxpool_width,
            hp.enc_highway_depth,
            hp.enc_rnn_size,
            hp.enc_proj_sizes,
            hp.enc_proj_width,
            scope="encoder_cbhg",
            before_highway=before_highway,
            encoder_rnn_init_state=encoder_rnn_init_state)

        ##############
        # Attention
        ##############

        # Placeholders for manual control of the attention.
        self.is_manual_attention = tf.placeholder(
            tf.bool,
            shape=(),
            name='is_manual_attention',
        )
        self.manual_alignments = tf.placeholder(
            tf.float32,
            shape=[None, None, None],
            name="manual_alignments",
        )

        dec_prenet_outputs = DecoderPrenetWrapper(
            GRUCell(hp.attention_state_size), speaker_embed, is_training,
            hp.dec_prenet_sizes, hp.dropout_prob)

        attention_type = hp.attention_type
        if attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs)
        elif attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(hp.attention_size,
                                                    encoder_outputs,
                                                    normalize=True)
        elif attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(hp.attention_size,
                                                 encoder_outputs,
                                                 scale=True)
        elif attention_type == 'luong':
            attention_mechanism = LuongAttention(hp.attention_size,
                                                 encoder_outputs)
        elif attention_type == 'bah':
            attention_mechanism = BahdanauAttention(hp.attention_size,
                                                    encoder_outputs)
        elif attention_type.startswith('ntm2'):
            # The shift width is the text after the last '-' in the name.
            shift_width = int(attention_type.split('-')[-1])
            attention_mechanism = NTMAttention2(hp.attention_size,
                                                encoder_outputs,
                                                shift_width=shift_width)
        else:
            raise Exception(" [!] Unkown attention type: {}".format(
                hp.attention_type))

        attention_cell = AttentionWrapper(
            dec_prenet_outputs,
            attention_mechanism,
            self.is_manual_attention,
            self.manual_alignments,
            initial_cell_state=attention_rnn_init_state,
            alignment_history=True,
            output_attention=False)

        # Concatenate the attention context vector and the RNN cell output
        # (a 512D vector per the original note).
        # [N, T_in, attention_size+attention_state_size]
        concat_cell = ConcatOutputAndAttentionWrapper(
            attention_cell, embed_to_concat=speaker_embed)

        # Decoder (layers specified bottom to top): the projected
        # attention cell followed by dec_layer_num residual GRU layers.
        cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)]
        cells += [
            ResidualWrapper(GRUCell(hp.dec_rnn_size))
            for _ in range(hp.dec_layer_num)
        ]

        # [N, T_in, 256]
        decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

        # Project onto r mel spectrograms (predict r outputs per RNN step).
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.num_mels * hp.reduction_factor)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        if hp.model_type == "deepvoice":
            # decoder_init_state[0] : AttentionWrapperState
            # = cell_state + attention + time + alignments + alignment_history
            # decoder_init_state[0][0] = attention_rnn_init_state
            # (already applied), so only slots 1.. are replaced here.
            init_states = list(decoder_init_state)
            for slot, speaker_state in enumerate(decoder_rnn_init_states, 1):
                shape1 = init_states[slot].get_shape().as_list()
                shape2 = speaker_state.get_shape().as_list()
                if shape1 != shape2:
                    raise Exception(
                        " [!] Shape {} and {} should be equal".format(
                            shape1, shape2))
                init_states[slot] = speaker_state
            decoder_init_state = tuple(init_states)

        if not is_training:
            helper = TacoTestHelper(batch_size, hp.num_mels,
                                    hp.reduction_factor)
        else:
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                        hp.reduction_factor,
                                        rnn_decoder_test_mode)

        decoder = BasicDecoder(output_cell, helper, decoder_init_state)
        (decoder_outputs, _), final_decoder_state, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                decoder, maximum_iterations=hp.max_iters)

        # [N, T_out, M]
        mel_outputs = tf.reshape(decoder_outputs,
                                 [batch_size, -1, hp.num_mels])

        # Post-processing CBHG. [N, T_out, 256]
        post_outputs = cbhg(mel_outputs,
                            None,
                            is_training,
                            hp.post_bank_size,
                            hp.post_bank_channel_size,
                            hp.post_maxpool_width,
                            hp.post_highway_depth,
                            hp.post_rnn_size,
                            hp.post_proj_sizes,
                            hp.post_proj_width,
                            scope='post_cbhg')

        if speaker_embed is not None and hp.model_type == 'simple':
            # Repeat the speaker embedding along time and prepend it to
            # the post-net features. [N, T_out, 256 + alpha]
            expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
            tiled_speaker_embedding = tf.tile(
                expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])
            post_outputs = tf.concat(
                [tiled_speaker_embedding, post_outputs], axis=-1)

        # [N, T_out, F]
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)

        # Grab alignments from the final decoder state.
        alignment_stack = final_decoder_state[0].alignment_history.stack()
        alignments = tf.transpose(alignment_stack, [1, 2, 0])

        self.inputs = inputs
        self.speaker_id = speaker_id
        self.input_lengths = input_lengths
        self.loss_coeff = loss_coeff
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.final_decoder_state = final_decoder_state

        # Summary of the model dimensions.
        log('=' * 40)
        log(' model_type: %s' % hp.model_type)
        log('=' * 40)

        log('Initialized Tacotron model. Dimensions: ')
        log(' embedding: %d' % char_embedded_inputs.shape[-1])
        if speaker_embed is None:
            log(' speaker embedding: None')
        else:
            log(' speaker embedding: %d' % speaker_embed.shape[-1])
        log(' prenet out: %d' % prenet_outputs.shape[-1])
        log(' encoder out: %d' % encoder_outputs.shape[-1])
        log(' attention out: %d' % attention_cell.output_size)
        log(' concat attn & out: %d' % concat_cell.output_size)
        log(' decoder cell out: %d' % decoder_cell.output_size)
        log(' decoder out (%d frames): %d' %
            (hp.reduction_factor, decoder_outputs.shape[-1]))
        log(' decoder out (1 frame): %d' % mel_outputs.shape[-1])
        log(' postnet out: %d' % post_outputs.shape[-1])
        log(' linear out: %d' % linear_outputs.shape[-1])
# Start from an empty default graph.
tf.reset_default_graph()

# Random input of shape (2, 4, 5), printed twice as in the original script.
x = np.random.randn(2, 4, 5)
print(x)
# x[1, 1:] = 0
print(x)
seq_lengths = [4, 4]

# Build one LSTM cell and one GRU cell to compare the states they return.
cell = BasicLSTMCell(num_units=3, state_is_tuple=True)
gru = GRUCell(3)

outputs, last_states = tf.nn.dynamic_rnn(
    cell,
    x,
    sequence_length=seq_lengths,
    dtype=tf.float64,
)
gruoutput, grulast_states = tf.nn.dynamic_rnn(
    gru,
    x,
    sequence_length=seq_lengths,
    dtype=tf.float64,
)

sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
def initialize(self, inputs, input_lengths, mel_targets=None, mel_lengths=None, linear_targets=None):
    '''Builds the Tacotron graph under the 'inference' variable scope.

    Sets the "inputs", "input_lengths", "mel_outputs", "linear_outputs",
    "alignments", "mel_targets", "mel_lengths" and "linear_targets" fields,
    plus "mu" and "log_var" when hp.use_vae is set.

    Training mode is selected by passing linear_targets: it is enabled exactly
    when linear_targets is not None.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training. Also passed to VAE as its inputs when
        hp.use_vae is set.
      mel_lengths: passed to VAE as input_lengths when hp.use_vae is set; otherwise only
        stored on the instance. (Presumably an int32 Tensor with shape [N] holding the
        length of each mel target -- TODO confirm against VAE.)
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        # The presence of linear targets is what switches the graph to training mode.
        is_training = linear_targets is not None
        # Dynamic batch size, read from the inputs tensor at run time.
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embed_depth],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        # [N, T_in, embed_depth=256]
        embedded_inputs = tf.nn.embedding_lookup(
            embedding_table, inputs)

        # Encoder
        # [N, T_in, prenet_depths[-1]=128]
        prenet_outputs = prenet(
            embedded_inputs, is_training, hp.prenet_depths)
        # [N, T_in, encoder_depth=256]
        encoder_outputs = encoder_cbhg(
            prenet_outputs, input_lengths, is_training,
            hp.encoder_depth)

        if hp.use_vae:
            # NOTE(review): mel_targets and mel_lengths default to None but are
            # passed to VAE unconditionally here -- confirm that callers always
            # supply them when hp.use_vae is set.
            style_embeddings, mu, log_var = VAE(inputs=mel_targets,
                                                input_lengths=mel_lengths,
                                                filters=hp.filters,
                                                kernel_size=(3, 3),
                                                strides=(2, 2),
                                                num_units=hp.vae_dim,
                                                is_training=is_training,
                                                scope='vae')
            # Exposed on the instance for use outside this method.
            self.mu = mu
            self.log_var = log_var
            # Project the style embedding to encoder_depth, add a time axis and
            # repeat it along that axis so it can be added to every encoder step.
            style_embeddings = tf.layers.dense(style_embeddings, hp.encoder_depth)
            style_embeddings = tf.expand_dims(style_embeddings, axis=1)
            # [N, T_in, 256]
            style_embeddings = tf.tile(
                style_embeddings, [1, shape_list(encoder_outputs)[1], 1])
            encoder_outputs = encoder_outputs + style_embeddings

        # Attention
        # [N, T_in, attention_depth=256]
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_depth),
            BahdanauAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            output_attention=False)

        # Apply prenet before concatenation in AttentionWrapper.
        attention_cell = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths)

        # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
        # [N, T_in, 2*attention_depth=512]
        concat_cell = ConcatOutputAndAttentionWrapper(
            attention_cell)

        # Decoder (layers specified bottom to top):
        # [N, T_in, decoder_depth=256]
        decoder_cell = MultiRNNCell(
            [
                OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                ResidualWrapper(GRUCell(hp.decoder_depth)),
                ResidualWrapper(GRUCell(hp.decoder_depth))
            ],
            state_is_tuple=True)

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell, hp.num_mels * hp.outputs_per_step)
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        # The training helper is built from the inputs and mel targets; the test
        # helper only receives the batch size and output dimensions.
        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        # Decoding is capped at hp.max_iters steps.
        # [N, T_out/r, M*r]
        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)

        # Reshape outputs to be one output per entry
        # [N, T_out, M]
        mel_outputs = tf.reshape(
            decoder_outputs, [batch_size, -1, hp.num_mels])

        # Add post-processing CBHG:
        # [N, T_out, postnet_depth=256]
        post_outputs = post_cbhg(
            mel_outputs, hp.num_mels, is_training,
            hp.postnet_depth)
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        # Publish inputs, targets and outputs on the instance.
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.mel_lengths = mel_lengths
        self.linear_targets = linear_targets

        # Log the last dimension of every stage as a quick sanity check.
        log('Initialized Tacotron model. Dimensions: ')
        log(' embedding: %d' % embedded_inputs.shape[-1])
        log(' prenet out: %d' % prenet_outputs.shape[-1])
        log(' encoder out: %d' % encoder_outputs.shape[-1])
        log(' attention out: %d' % attention_cell.output_size)
        log(' concat attn & out: %d' % concat_cell.output_size)
        log(' decoder cell out: %d' % decoder_cell.output_size)
        log(' decoder out (%d frames): %d' %
            (hp.outputs_per_step, decoder_outputs.shape[-1]))
        log(' decoder out (1 frame): %d' % mel_outputs.shape[-1])
        log(' postnet out: %d' % post_outputs.shape[-1])
        log(' linear out: %d' % linear_outputs.shape[-1])
num_samples = tf.shape(inputs)[0] # useful for later # Embedding weights We = np.random.randn(V, embedding_dim).astype(np.float32) #Output params Wo = init_weight(hidden_layer_size, K).astype(np.float32) bo = np.zeros(K).astype(np.float32) #Creating tensorflow variables tfWe = tf.Variable(We) tfWo = tf.Variable(Wo) tfbo = tf.Variable(bo) # Building the RNN unit - Using the GRU RNN rnn_unit = GRUCell(num_units=hidden_layer_size, activation=tf.nn.relu) # Outputs from Embedding Layer x = tf.nn.embedding_lookup(tfWe, inputs) x = tf.unstack(x, sequence_length, 1) #Outputs from RNN Layer outputs, states = get_rnn_output(rnn_unit, x, dtype=tf.float32) outputs = tf.transpose(outputs, (1, 0, 2)) outputs = tf.reshape( outputs, (sequence_length * num_samples, hidden_layer_size)) # NT x M # Building the dense layer logits = tf.matmul(outputs, tfWo) + tfbo # NT x K predictions = tf.argmax(logits, 1) predict_op = tf.reshape(predictions, (num_samples, sequence_length))
def build_policy_raw_rnn(hyper_parms, batch_size):
    """Build the policy RNN with ``tf.nn.raw_rnn`` and record one action per step.

    A stack of GRU or LSTM cells is wrapped in ``PolicyRNNCell`` and unrolled
    with ``tf.nn.raw_rnn``. At every step after the first ``loop_fn`` call an
    action is chosen from the cell output and written to a ``TensorArray``
    that is carried through the loop as the loop state.

    Args:
        hyper_parms: dict read with the following keys:
            'model_input': RNN input tensor; it is passed through
                ``transpose_batch_time`` before being unstacked.
            'policy_rnn_cell_num': number of units of each RNN layer.
            'policy_rnn_type': ``'gru'`` or ``'lstm'``.
            'sequence_len': per-example lengths, cast to int32 and compared
                with the loop time to decide which examples are finished.
            'assigned_seg_act': int32 actions that are replayed when
                'reproduce_policy' is true.
            'policy_rnn_layer_num': number of stacked RNN layers.
            'reproduce_policy': predicate given to ``tf.cond``; when true the
                assigned action is used instead of a sampled one.
            'greedy_policy': predicate given to ``tf.cond``; when true the
                arg-max action overrides the sampled/replayed one.
        batch_size: batch size used for the initial zero state and for the
            all-zero input emitted once every sequence is finished.

    Returns:
        Tuple ``(outputs, sampled_actions)``: the emitted cell outputs and the
        recorded actions, both converted by ``convert_raw_rnn_ta_to_tensor``.

    Raises:
        ValueError: if 'policy_rnn_type' is neither ``'gru'`` nor ``'lstm'``
            (checked while building each layer).
    """
    rnn_inputs = hyper_parms['model_input']
    policy_rnn_cell_num = hyper_parms['policy_rnn_cell_num']
    policy_rnn_type = hyper_parms['policy_rnn_type']
    sequence_length = hyper_parms['sequence_len']
    assigned_seg_act = hyper_parms['assigned_seg_act']
    policy_rnn_layer_num = hyper_parms['policy_rnn_layer_num']
    reproduce_policy = hyper_parms['reproduce_policy']
    greedy_policy = hyper_parms['greedy_policy']

    # One cell per layer, stacked and then wrapped by the policy cell.
    cells = []
    for _ in range(policy_rnn_layer_num):
        if policy_rnn_type == 'gru':
            rnn_cell = GRUCell(policy_rnn_cell_num)
        elif policy_rnn_type == 'lstm':
            rnn_cell = LSTMCell(policy_rnn_cell_num)
        else:
            raise ValueError('RNN type should be LSTM or GRU')
        cells.append(rnn_cell)
    cell = tf.contrib.rnn.MultiRNNCell(cells)
    cell = PolicyRNNCell(cell)

    # raw_rnn reads its inputs step by step, so unstack them into TensorArrays.
    inputs = transpose_batch_time(rnn_inputs)
    inputs_ta = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
    inputs_ta = inputs_ta.unstack(inputs)

    seg_act = transpose_batch_time(assigned_seg_act)
    seg_act_ta = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
    seg_act_ta = seg_act_ta.unstack(seg_act)

    # Empty array that becomes the initial loop state; actions are appended to it.
    loop_state_ta = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)

    def loop_fn(time, cell_output, cell_state, loop_state):
        # raw_rnn makes its first call (time == 0) with cell_output=None.
        is_initial_call = cell_output is None

        if is_initial_call:
            next_cell_state = cell.zero_state(batch_size, tf.float32)
        else:
            next_cell_state = cell_state

        # An example is finished once the loop time reaches its length.
        elements_finished = (time >= tf.cast(sequence_length, tf.int32))
        finished = tf.reduce_all(elements_finished)

        # Choose the action for the step that produced `cell_output`.
        action = None
        if not is_initial_call:
            # NOTE(review): the replayed action is read at index `time` but
            # the result is stored at `time - 1`, the same slot used for the
            # action derived from the previous step's output -- confirm that
            # 'assigned_seg_act' is aligned (or padded) for this.
            sampled_action = tf.cond(
                reproduce_policy,
                lambda: seg_act_ta.read(time),
                lambda: tf.multinomial(
                    tf.log(cell_output), 1, output_dtype=tf.int32))
            action = tf.cond(
                greedy_policy,
                lambda: tf.expand_dims(
                    tf.argmax(cell_output, axis=1, output_type=tf.int32), 1),
                lambda: sampled_action)

        # Feed zeros once every sequence is done; reading past the end of
        # inputs_ta would fail otherwise.
        next_input = tf.cond(
            finished,
            lambda: tf.zeros(
                [batch_size, rnn_inputs.get_shape()[-1]], dtype=tf.float32),
            lambda: inputs_ta.read(time))

        emit_output = cell_output  # None on the initial call

        # Record the action in the loop state. On the initial call there is
        # no action yet, so the empty array is returned as the loop state.
        if is_initial_call:
            next_loop_state = loop_state_ta
        else:
            next_loop_state = loop_state.write(time - 1, action)

        return (elements_finished, next_input, next_cell_state, emit_output,
                next_loop_state)

    outputs_ta, _, final_loop_state_ta = tf.nn.raw_rnn(cell, loop_fn)
    outputs = convert_raw_rnn_ta_to_tensor(outputs_ta)
    sampled_actions = convert_raw_rnn_ta_to_tensor(final_loop_state_ta)
    return outputs, sampled_actions