def bi_gru_layer(
        layer_sizes: List[int],
        input: tf.Variable,
        input_length: tf.Variable,
        dropout_function: Callable[[tf.Variable], tf.Variable] = None,
        parallel_iterations: int = 64
) -> Tuple[tf.Variable, tf.Variable, tf.Variable]:
    """Stack bidirectional GRU layers on top of ``input``.

    One bidirectional GRU is built per entry of ``layer_sizes``, each inside
    its own ``bigru_<i>`` variable scope.  The forward and backward outputs of
    a layer are concatenated along axis 2 and become the next layer's input.

    Args:
        layer_sizes: Number of GRU units (per direction) for each layer.
        input: Input fed to the first layer.
        input_length: Forwarded as ``sequence_length`` to every layer.
        dropout_function: Optional callable applied to the input of every
            layer except the first one.  ``None`` disables it.
        parallel_iterations: Forwarded to ``bidirectional_dynamic_rnn``.

    Returns:
        A tuple ``(outputs, fw_final, bw_final)`` holding the concatenated
        outputs of the last layer and its forward/backward final states.
        When ``layer_sizes`` is empty this is ``(input, None, None)``.
    """
    curr_input, fw_final, bw_final = input, None, None
    for i, layer_size in enumerate(layer_sizes):
        with tf.variable_scope('bigru_{}'.format(i)) as scope:
            # Compare by value: ``i is not 0`` relied on small-int identity.
            if dropout_function is not None and i > 0:
                curr_input = dropout_function(curr_input)
            fw_cell = GRUCell(layer_size)
            bw_cell = GRUCell(layer_size)
            (fw_out, bw_out), (fw_final, bw_final) = bidirectional_dynamic_rnn(
                fw_cell,
                bw_cell,
                inputs=curr_input,
                dtype=tf.float32,
                sequence_length=input_length,
                scope=scope,
                parallel_iterations=parallel_iterations,
                swap_memory=True)
            curr_input = tf.concat([fw_out, bw_out], axis=2)
    return curr_input, fw_final, bw_final
def rnet_matching_layer(layer_size: int,
                        att_size: int,
                        par_vecs: tf.Variable,
                        qu_vecs: tf.Variable,
                        par_num_words: tf.Variable,
                        parallel_iterations: int = 64) -> tf.Variable:
    """Run a bidirectional match-RNN over ``par_vecs`` attending ``qu_vecs``.

    Both directions use a ``MatchRNNCell`` wrapping a ``GRUCell`` of
    ``layer_size`` units; the cells are created under the
    ``alignment_par_qu/{fw,bw}/match_rnn_cell/attention`` variable scopes.

    Args:
        layer_size: Number of units of each wrapped ``GRUCell``.
        att_size: Attention size handed to ``MatchRNNCell``.
        par_vecs: Inputs of the recurrent network.
        qu_vecs: Tensor handed to ``MatchRNNCell`` as the attended memory.
        par_num_words: Forwarded as ``sequence_length``.
        parallel_iterations: Forwarded to ``bidirectional_dynamic_rnn``.

    Returns:
        Forward and backward outputs concatenated along axis 2.
    """
    with tf.variable_scope('alignment_par_qu') as outer_scope:
        with tf.variable_scope('fw/match_rnn_cell/attention'):
            forward_cell = MatchRNNCell(GRUCell(layer_size), qu_vecs, att_size)
        with tf.variable_scope('bw/match_rnn_cell/attention'):
            backward_cell = MatchRNNCell(GRUCell(layer_size), qu_vecs, att_size)

        (forward_out, backward_out), (_, _) = bidirectional_dynamic_rnn(
            forward_cell,
            backward_cell,
            inputs=par_vecs,
            dtype=tf.float32,
            sequence_length=par_num_words,
            scope=outer_scope,
            parallel_iterations=parallel_iterations,
            swap_memory=True)
        return tf.concat([forward_out, backward_out], axis=2)
def rnet_self_matching_layer_real(
        layer_size: int,
        att_size: int,
        par_vecs: tf.Variable,
        par_num_words: tf.Variable,
        parallel_iterations: int = 64) -> tf.Variable:
    """Run a bidirectional self-matching RNN over ``par_vecs``.

    The attention parameters ``WP``, ``WPtilde`` and ``v`` are created once
    under ``alignment_self`` and shared by the forward and backward
    ``MatchRNNCellV2`` cells.  ``par_vecs`` projected through ``WPtilde`` is
    computed once and handed to both cells.

    Args:
        layer_size: Number of units of each wrapped ``GRUCell``; ``WP`` and
            ``WPtilde`` have ``2 * layer_size`` rows.
        att_size: Number of columns of ``WP``/``WPtilde`` and length of ``v``.
        par_vecs: Inputs of the recurrent network (also the attended memory).
        par_num_words: Forwarded as ``sequence_length``.
        parallel_iterations: Forwarded to ``bidirectional_dynamic_rnn``.

    Returns:
        Forward and backward outputs concatenated along axis 2.
    """
    with tf.variable_scope('alignment_self') as outer_scope:
        proj_state = tf.get_variable('WP', [2 * layer_size, att_size])
        proj_memory = tf.get_variable('WPtilde', [2 * layer_size, att_size])
        score_vec = tf.get_variable('v', [att_size])
        # Project the memory once so neither cell recomputes it per step.
        projected_memory = tf.einsum('ijk,kl->ijl', par_vecs, proj_memory)

        with tf.variable_scope('fw/match_rnn_cell/attention'):
            forward_cell = MatchRNNCellV2(
                GRUCell(layer_size), proj_state, score_vec, par_vecs,
                projected_memory)
        with tf.variable_scope('bw/match_rnn_cell/attention'):
            backward_cell = MatchRNNCellV2(
                GRUCell(layer_size), proj_state, score_vec, par_vecs,
                projected_memory)

        (forward_out, backward_out), (_, _) = bidirectional_dynamic_rnn(
            forward_cell,
            backward_cell,
            inputs=par_vecs,
            dtype=tf.float32,
            sequence_length=par_num_words,
            scope=outer_scope,
            parallel_iterations=parallel_iterations,
            swap_memory=True)
        return tf.concat([forward_out, backward_out], axis=2)
def build_model(self, hidden_size=768, dropout_prob=0.1,
                initializer_range=0.02):
    """Build a BiGRU block followed by a feed-forward block.

    The last entry of ``self.all_sequence`` is passed through a bidirectional
    GRU, projected back to ``hidden_size`` and added to the (dropped-out)
    input before layer normalisation.  A two-layer dense block with a second
    residual connection and layer normalisation follows.

    Args:
        hidden_size: Units of each GRU direction and of every dense layer.
            The residual additions presumably require it to equal the last
            dimension of ``self.all_sequence[-1]`` -- TODO confirm.
        dropout_prob: Value passed as second argument to ``dropout``.
        initializer_range: Value passed to ``create_initializer``.

    Returns:
        The output tensor of the second residual + layer-norm step.
    """
    temp = self.all_sequence[-1]
    with tf.variable_scope("lstm"):
        temp = dropout(temp, dropout_prob)
        # Sequence lengths are obtained by summing the mask along axis 1.
        seq_len = tf.reduce_sum(self.sent_mask, axis=1)
        gru_fw = GRUCell(num_units=hidden_size, activation=tf.tanh)
        gru_bw = GRUCell(num_units=hidden_size, activation=tf.tanh)
        outputs, _ = bidirectional_dynamic_rnn(
            gru_fw, gru_bw, temp, sequence_length=seq_len, dtype=tf.float32)
        gru_output = tf.concat(outputs, axis=2)

        # Project the concatenated directions back to ``hidden_size`` so the
        # result can be added to the block input.
        gru_output = tf.layers.dense(
            gru_output,
            units=hidden_size,
            kernel_initializer=create_initializer(initializer_range))
        gru_output = dropout(gru_output, dropout_prob)
        outputs = layer_norm(gru_output + temp)

        in_outputs = tf.layers.dense(
            outputs,
            units=hidden_size,
            activation=tf.tanh,
            kernel_initializer=create_initializer(initializer_range))
        layer_output = tf.layers.dense(
            in_outputs,
            hidden_size,
            kernel_initializer=create_initializer(initializer_range))
        layer_output = dropout(layer_output, dropout_prob)
        layer_output = layer_norm(layer_output + outputs)
    return layer_output
def biGRU(input, input_length, params, dropout=None, layers=None):
    """Run a multi-layer bidirectional GRU with variational dropout.

    Args:
        input: Input tensor of the recurrent network; its last static
            dimension is used as ``input_size`` of the first layer.
        input_length: Forwarded as ``sequence_length``.
        params: Object providing ``units``, ``dropout`` and ``layers``.
        dropout: Dropout rate.  ``None`` falls back to ``params.dropout``;
            an explicit ``0.0`` disables dropout.
        layers: Number of stacked layers; a falsy value falls back to
            ``params.layers``.

    Returns:
        ``(output, states)`` where ``output`` is the forward and backward
        outputs concatenated along the last axis and ``states`` is the state
        pair returned by ``tf.nn.bidirectional_dynamic_rnn``.
    """
    # ``dropout or params.dropout`` would silently replace an explicit 0.0
    # with the configured rate, making it impossible to switch dropout off.
    if dropout is None:
        dropout = params.dropout
    num_layers = layers or params.layers
    keep_prob = 1.0 - dropout
    first_input_size = input.get_shape()[-1]

    def build_stack():
        """Create one direction's stack of dropout-wrapped GRU cells."""
        return MultiRNNCell([
            DropoutWrapper(
                GRUCell(params.units),
                input_keep_prob=keep_prob,
                state_keep_prob=keep_prob,
                variational_recurrent=True,
                dtype=tf.float32,
                # Variational dropout needs the input size up front: the
                # first layer sees the raw input, later ones see GRU outputs.
                input_size=(first_input_size if layer == 0
                            else tf.TensorShape(params.units)))
            for layer in range(num_layers)
        ])

    cell_fw = build_stack()
    cell_bw = build_stack()
    output, states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw,
        cell_bw,
        input,
        sequence_length=input_length,
        dtype=tf.float32)
    output = tf.concat(output, -1)
    return output, states
def _char_embedding_layer(
        embedder: EmbeddingService,
        chars: tf.Variable,
        num_words: tf.Variable,
        num_chars: tf.Variable,
        char_rnn_size: int,
        dropout_function: Callable[[tf.Variable], tf.Variable]) -> tf.Variable:
    """Encode the characters of every word with a bidirectional GRU.

    Character ids are looked up in a trainable table initialised from
    ``embedder.embedding_matrix``.  The batch is unstacked along axis 0 and
    each example is run through the same pair of GRU cells; the forward and
    backward final states are concatenated to form the word representation.

    Args:
        embedder: Provides ``embedding_matrix`` for the character table.
        chars: Character ids; its first dimension must be statically known
            because the batch is unstacked.
        num_words: Not used by this function.
        num_chars: Per-example lengths, unstacked along axis 0 and passed as
            ``sequence_length``.
        char_rnn_size: Number of units of each GRU direction.
        dropout_function: Applied to the looked-up embeddings.

    Returns:
        The per-example encodings stacked along axis 0.
    """
    batch_size = int(chars.get_shape()[0])

    with tf.variable_scope('char_embedding_layer'):
        char_embeddings = tf.get_variable(
            name='char_embeddings',
            trainable=True,
            dtype=tf.float32,
            initializer=tf.constant(embedder.embedding_matrix,
                                    dtype=tf.float32))
        char_raw_embed = dropout_function(
            tf.nn.embedding_lookup(char_embeddings, chars))

        # Unstack instead of reshape because two dimensions are unknown.
        per_example_embeds = tf.unstack(char_raw_embed, batch_size, axis=0)
        per_example_lengths = tf.unstack(num_chars, batch_size, axis=0)

        char_embed_list = []
        with tf.variable_scope('encoding') as scope:
            # The same two cells are reused for every example of the batch.
            fw_cell = GRUCell(char_rnn_size)
            bw_cell = GRUCell(char_rnn_size)
            for example_embed, example_length in zip(per_example_embeds,
                                                     per_example_lengths):
                (_, _), (fw_final, bw_final) = bidirectional_dynamic_rnn(
                    fw_cell,
                    bw_cell,
                    inputs=example_embed,
                    dtype=tf.float32,
                    sequence_length=example_length,
                    scope=scope,
                    parallel_iterations=64,
                    swap_memory=True)
                char_embed_list.append(
                    tf.concat([fw_final, bw_final], axis=1))
        return tf.stack(char_embed_list, axis=0)
def __init__(self, feature_size, max_video_length, num_classes, cell_size,
             use_lstm, learning_rate, learning_rate_decay_factor,
             min_learning_rate, training_steps_per_epoch, max_gradient_norm,
             keep_prob=0.5, is_training=False):
    """Build the graph of an RNN video classifier.

    Frame features are fed to a dynamic RNN (GRU, or LSTM when ``use_lstm``);
    the final state goes through ReLU (and dropout while training) into a
    fully connected classification layer.

    Args:
        feature_size: Size of one frame feature vector.
        max_video_length: Static time dimension of the input placeholder.
        num_classes: Number of classes; ``2`` selects a single-logit
            sigmoid head, anything else a softmax head.
        cell_size: Number of units of the recurrent cell.
        use_lstm: Use ``BasicLSTMCell`` instead of ``GRUCell``.
        learning_rate: Initial learning rate of the exponential decay.
        learning_rate_decay_factor: Decay rate of the exponential decay.
        min_learning_rate: Lower bound applied to the decayed rate.
        training_steps_per_epoch: Decay steps of the exponential decay.
        max_gradient_norm: Global-norm clipping threshold.
        keep_prob: Dropout keep probability applied to the final state
            while training.
        is_training: Build loss/optimiser ops (True) or prediction ops
            (False).
    """
    self.frame_feature_ph = tf.placeholder(
        tf.float32, [None, max_video_length, feature_size])
    self.video_length_ph = tf.placeholder(tf.int32, [None])
    self.video_label_ph = tf.placeholder(tf.int32, [None])

    if is_training:
        self.global_step = tf.Variable(0, trainable=False)
        self.learning_rate = tf.maximum(
            tf.train.exponential_decay(
                learning_rate,
                self.global_step,
                training_steps_per_epoch,
                learning_rate_decay_factor,
                staircase=True),
            min_learning_rate)

    # Make RNN cells
    cell = GRUCell(cell_size)
    if use_lstm:
        cell = BasicLSTMCell(cell_size, state_is_tuple=False)

    # RNN
    with tf.variable_scope('DynamicRNN'):
        _, state = dynamic_rnn(cell=cell,
                               inputs=self.frame_feature_ph,
                               sequence_length=self.video_length_ph,
                               dtype=tf.float32)
    state = tf.nn.relu(state)
    if is_training:
        state = tf.nn.dropout(state, keep_prob=keep_prob)

    if num_classes == 2:
        with tf.variable_scope('Classification'):
            logit = tf.contrib.layers.fully_connected(
                inputs=state, num_outputs=1,
                activation_fn=None)  # [batch_size, 1]
            self.logit = tf.squeeze(logit)  # [batch_size]
        if is_training:
            video_label = tf.cast(x=self.video_label_ph, dtype=tf.float32)
            self.loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=video_label, logits=self.logit))
        else:
            # NOTE(review): this thresholds the raw logit (not the sigmoid
            # probability) at 0.5 and uses the un-squeezed tensor -- confirm
            # that this is intended.
            self.prediction = tf.cast(tf.greater(x=logit, y=0.5), tf.int32)
    else:
        with tf.variable_scope('Classification'):
            self.logits = tf.contrib.layers.fully_connected(
                inputs=state, num_outputs=num_classes,
                activation_fn=None)  # [batch_size, num_classes]
        if is_training:
            self.loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.video_label_ph, logits=self.logits))
        else:
            # The bare name ``logits`` was never defined in this branch.
            self.prediction = tf.argmax(self.logits, 1)

    if is_training:
        params = tf.trainable_variables()
        gradients = tf.gradients(self.loss, params)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients,
                                                      max_gradient_norm)
        self.train_op = tf.train.AdamOptimizer(
            self.learning_rate).apply_gradients(
                zip(clipped_gradients, params),
                global_step=self.global_step)

    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=99999999)
def match_par_qu_layer(self):
    """Align the encoded paragraph with the encoded question.

    A unidirectional match-RNN (``MatchRNNCell`` around a ``GRUCell``) runs
    over ``self.par_encoded`` attending ``self.qu_encoded``.  Its outputs go
    through dropout and are re-encoded by a single bidirectional GRU layer.

    Returns:
        The outputs of the bidirectional GRU layer.
    """
    with tf.variable_scope('alignment_par_qu') as align_scope:
        match_cell = MatchRNNCell(
            GRUCell(self.conf_layer_size),
            self.qu_encoded,
            self.conf_att_size)
        matched, _ = dynamic_rnn(
            match_cell,
            self.par_encoded,
            sequence_length=self.par_num_words,
            parallel_iterations=self.conf_rnn_parallelity,
            scope=align_scope,
            swap_memory=True,
            dtype=tf.float32)

        with tf.variable_scope('encoding'):
            dropped = self.apply_dropout(matched)
            encoded, _, _ = bi_gru_layer(
                [self.conf_layer_size],
                dropped,
                self.par_num_words,
                self.apply_dropout)
    return encoded
def __init__(self, frame_feature_ph, num_classes, cell_size, use_lstm=False):
    """Build a per-frame RNN classifier on top of ``frame_feature_ph``.

    Every RNN output step goes through ReLU and a fully connected layer with
    ``num_classes`` outputs.  ``self.node`` is the arg-max class per step;
    ``self.logit`` is the softmax of the step-averaged scores and
    ``self.prediction`` its arg-max.

    Args:
        frame_feature_ph: Input tensor fed to the RNN.
        num_classes: Number of outputs of the classification layer.
        cell_size: Number of units of the recurrent cell.
        use_lstm: Use ``BasicLSTMCell`` instead of ``GRUCell``.
    """
    self.frame_feature_ph = frame_feature_ph

    rnn_cell = GRUCell(cell_size)
    if use_lstm:
        rnn_cell = BasicLSTMCell(cell_size, state_is_tuple=False)

    with tf.variable_scope('DynamicRNN'):
        step_outputs, _ = dynamic_rnn(
            cell=rnn_cell,
            inputs=self.frame_feature_ph,
            dtype=tf.float32)
    step_outputs = tf.nn.relu(step_outputs)

    with tf.variable_scope('Classification'):
        step_scores = tf.contrib.layers.fully_connected(
            inputs=step_outputs,
            num_outputs=num_classes,
            activation_fn=None)

    step_probs = tf.nn.softmax(step_scores)
    # Sequence-level distribution: average the raw scores over axis 1 first.
    self.logit = tf.nn.softmax(tf.reduce_mean(step_scores, 1))
    self.node = tf.argmax(step_probs, 2)
    self.prediction = tf.argmax(self.logit, 1)

    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=99999999)
def pointer_net(passage, passage_length, question_pool, params, attention_fun,
                dropout):
    """Compute two pointer scores over ``passage``.

    The attention mechanism is queried with ``question_pool`` to obtain
    ``p1``.  A softmax-weighted sum of ``passage`` (using ``p1``) is fed
    through one GRU step whose initial state is ``question_pool``; the new
    state queries the same attention mechanism to obtain ``p2``.

    Args:
        passage: Attention memory.
        passage_length: Passed as ``memory_sequence_length``.
        question_pool: First attention query and initial GRU state.
        params: Provides ``units``; the GRU has ``params.units * 2`` units.
        attention_fun: Factory building the attention mechanism.
        dropout: Currently unused by this function.

    Returns:
        ``(p1, p2)``, the raw attention scores of the two queries
        (``probability_fn`` is ``tf.identity``).
    """
    pointer_attention = attention_fun(
        memory=passage,
        memory_sequence_length=passage_length,
        name="pointer_attention",
        probability_fn=tf.identity,
        score_mask_value=0)

    p1, _ = pointer_attention(question_pool, None)

    # Summarise the passage under the first distribution.
    start_weights = tf.expand_dims(tf.nn.softmax(p1), -1)
    context = tf.reduce_sum(start_weights * passage, 1)

    pointer_gru = GRUCell(params.units * 2, name="pointer_gru")
    _, updated_state = pointer_gru(context, question_pool)

    p2, _ = pointer_attention(updated_state, None)
    return p1, p2
def rnet_matching_layer_unidirectional(
        layer_size: int,
        att_size: int,
        par_vecs: tf.Variable,
        qu_vecs: tf.Variable,
        par_num_words: tf.Variable,
        parallel_iterations: int = 64) -> tf.Variable:
    """Run a forward-only match-RNN over ``par_vecs`` attending ``qu_vecs``.

    The cell is created under ``alignment_par_qu/fw/match_rnn_cell/attention``.

    Args:
        layer_size: Number of units of the wrapped ``GRUCell``.
        att_size: Attention size handed to ``MatchRNNCell``.
        par_vecs: Inputs of the recurrent network.
        qu_vecs: Tensor handed to ``MatchRNNCell`` as the attended memory.
        par_num_words: Forwarded as ``sequence_length``.
        parallel_iterations: Forwarded to ``dynamic_rnn``.

    Returns:
        The outputs of ``dynamic_rnn``.
    """
    with tf.variable_scope('alignment_par_qu') as outer_scope:
        with tf.variable_scope('fw/match_rnn_cell/attention'):
            match_cell = MatchRNNCell(GRUCell(layer_size), qu_vecs, att_size)

        rnn_result = dynamic_rnn(
            match_cell,
            inputs=par_vecs,
            dtype=tf.float32,
            sequence_length=par_num_words,
            scope=outer_scope,
            parallel_iterations=parallel_iterations,
            swap_memory=True)
        matched_outputs, _ = rnn_result
    return matched_outputs
def train(epochs, batch_size):
    """Train a ``HAN_Model`` and report the best validation epoch.

    The model is validated once before training and after every epoch.  The
    epoch with the highest validation accuracy is remembered together with
    its report and printed at the end.

    Relies on module-level names (``allow_soft_placement``,
    ``log_device_placement``, ``vocab_size``, ``classes``, ``args``, the
    ``X_*`` / ``y_*`` / ``*_length_*`` data sets, ``batch_iter`` and
    ``validate``).

    Args:
        epochs: Number of training epochs.
        batch_size: Batch size handed to ``batch_iter`` and ``validate``.
    """
    session_conf = tf.ConfigProto(
        allow_soft_placement=allow_soft_placement,
        log_device_placement=log_device_placement,
        gpu_options=tf.GPUOptions(allow_growth=True))

    best_acc = 0
    best_epoch = 0
    best_report = ''
    gpu_device = 0

    # NOTE(review): this device scope is entered before the new graph becomes
    # the default one, so it may not apply to the ops built below -- confirm.
    with tf.device('/device:GPU:%d' % gpu_device):
        print('Using GPU - ', '/device:GPU:%d' % gpu_device)
        # Using the session as a context manager makes it the default session
        # and guarantees it is closed, even when training raises.
        with tf.Graph().as_default(), \
                tf.Session(config=session_conf) as sess:
            seed = 1227
            kernel_init = tf.glorot_uniform_initializer(seed=seed,
                                                        dtype=tf.float32)
            bias_init = tf.zeros_initializer()
            word_cell = GRUCell(50,
                                name='gru',
                                activation=tf.nn.tanh,
                                kernel_initializer=kernel_init,
                                bias_initializer=bias_init)
            sent_cell = GRUCell(50,
                                name='gru',
                                activation=tf.nn.tanh,
                                kernel_initializer=kernel_init,
                                bias_initializer=bias_init)
            model = HAN_Model(vocab_size=vocab_size,
                              embedding_size=200,
                              classes=classes,
                              word_cell=word_cell,
                              sentence_cell=sent_cell,
                              word_output_size=100,
                              sentence_output_size=100,
                              device=args.device,
                              learning_rate=args.lr,
                              dropout_keep_proba=0.5,
                              scope='HANModel')
            sess.run(
                tf.group(tf.global_variables_initializer(),
                         tf.local_variables_initializer()))

            print("\nEvaluation before training:")
            validate(-1, model, sess, X_val, sent_length_val,
                     word_length_val, y_val, batch_size)

            # Epochs are reported 1-based.
            for epoch in range(1, epochs + 1):
                batches = batch_iter(
                    list(
                        zip(X_train, sent_length_train, word_length_train,
                            y_train)), batch_size)

                print('\nTraining epoch {}'.format(epoch))
                losses = []
                accuracies = []
                # Materialise the batches so tqdm can show a total.
                for batch in tqdm(list(batches)):
                    X_batch, sent_len_batch, word_length_batch, y_batch = zip(
                        *batch)
                    feed_dict = {
                        model.inputs: X_batch,
                        model.sentence_lengths: sent_len_batch,
                        model.word_lengths: word_length_batch,
                        model.labels: y_batch,
                        model.is_training: True,
                    }
                    _, _, loss, accuracy = sess.run([
                        model.train_op, model.global_step, model.loss,
                        model.accuracy
                    ], feed_dict)
                    losses.append(loss)
                    accuracies.append(accuracy)
                print("\t \tEpoch {}:, loss {:g}, Accuracy {:g}".format(
                    epoch, np.average(losses), np.average(accuracies)))

                # Evaluation after epoch
                accuracy, report = validate(epoch, model, sess, X_val,
                                            sent_length_val, word_length_val,
                                            y_val, batch_size)
                if accuracy > best_acc:
                    best_epoch = epoch
                    best_acc = accuracy
                    best_report = report

            print("\n\nBest epoch: {}\nBest test accuracy: {}".format(
                best_epoch, best_acc))
            print("\n\nBest epoch: {}\nBest test report: \n{}".format(
                best_epoch, best_report))
def __init__(self, num_symbols, num_embed_units, num_units, name_scope, sequence_length, start_token, end_token, learning_rate=0.001, learning_rate_decay_factor=0.95, max_gradient_norm=5, num_samples=512, max_length=30):
    """Build the graph of a GRU language model with training and sampling ops.

    The same ``GRUCell`` is used for teacher-forced training and, under the
    reused ``decoder`` variable scope, for free-running generation.  Both
    start from an all-zero state.

    Args:
        num_symbols: Vocabulary size (rows of the embedding table).
        num_embed_units: Embedding dimension.
        num_units: Number of GRU units.
        name_scope: Substring used to select this model's variables from
            ``tf.trainable_variables()`` / ``tf.global_variables()``.
        sequence_length: Stored on the instance; not otherwise used here.
        start_token: Id prepended to the shifted targets and used to start
            generation.
        end_token: Id handed to the inference decoder function.
        learning_rate: Initial value of the learning-rate variable.
        learning_rate_decay_factor: Multiplier applied by
            ``learning_rate_decay_op``.
        max_gradient_norm: Global-norm clipping threshold.
        num_samples: Passed to ``output_projection_layer``.
        max_length: Passed to the inference decoder function.
    """
    # Input: text_id and text_length
    self.sequence_length = sequence_length
    self.responses = tf.placeholder(tf.int32, shape=[None, None])  # (batch, len)
    self.responses_length = tf.placeholder(tf.int32, shape=[None, ])  # batch
    self.end_token = end_token

    # Build the embedding table (index to vector)
    self.embed = tf.get_variable('embed', [num_symbols, num_embed_units], tf.float32)

    # Construct the input and output of GRU
    self.responses_target = self.responses
    batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape(self.responses)[1]
    # Decoder input = targets shifted right by one with start_token in front.
    self.responses_input = tf.concat([tf.ones([batch_size, 1], dtype=tf.int32)*start_token,
        tf.split(self.responses_target, [decoder_len-1, 1], 1)[0]], 1)  # batch*len
    # Reverse cumulative sum of a one-hot at (length - 1): ones up to and
    # including the last valid position, zeros afterwards.
    self.decoder_mask = tf.reshape(tf.cumsum(tf.one_hot(self.responses_length-1,
        decoder_len), reverse=True, axis=1), [-1, decoder_len])  # batch * len
    self.decoder_input = tf.nn.embedding_lookup(self.embed, self.responses_input)

    cell_dec = GRUCell(num_units)
    # A pure language model has no encoder: start from zeros.
    encoder_state = tf.zeros([batch_size, num_units])

    output_fn, sampled_sequence_loss = output_projection_layer(num_units, num_symbols, num_samples)

    # RNN language model
    with variable_scope.variable_scope('decoder'):
        decoder_fn_train = my_simple_decoder_fn.simple_decoder_fn_train(encoder_state)
        self.decoder_output, _, _ = my_seq2seq.dynamic_rnn_decoder(cell_dec, decoder_fn_train,
                self.decoder_input, self.responses_length, scope = "decoder_rnn")
        self.decoder_loss, self.all_decoder_output = my_loss.sequence_loss(self.decoder_output,
                self.responses_target, self.decoder_mask, softmax_loss_function = sampled_sequence_loss)

    # Same scope with reuse=True so generation shares the trained weights.
    with variable_scope.variable_scope('decoder', reuse = True):
        decoder_fn_inference = my_simple_decoder_fn.simple_decoder_fn_inference(output_fn,
                encoder_state, self.embed, start_token, end_token, max_length, num_symbols)
        self.decoder_distribution, _, _ = my_seq2seq.dynamic_rnn_decoder(cell_dec, decoder_fn_inference, scope = "decoder_rnn")
        # Drop the first two symbols before the arg-max, then shift the index
        # back by 2 so it refers to the full vocabulary again.
        self.generation_index = tf.argmax(tf.split(self.decoder_distribution,
            [2, num_symbols-2], 2)[1], 2) + 2  # for removing UNK
        self.generation = self.generation_index

    self.params = [k for k in tf.trainable_variables() if name_scope in k.name]

    # Initialize the training process
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # Calculate the gradient of parameters
    self.cost = tf.reduce_mean(self.decoder_loss)
    opt = tf.train.AdamOptimizer(self.learning_rate)
    gradients = tf.gradients(self.cost, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params), global_step=self.global_step)

    # Only variables whose name contains name_scope are checkpointed.
    all_variables = [k for k in tf.global_variables() if name_scope in k.name]
    self.saver = tf.train.Saver(all_variables, write_version=tf.train.SaverDef.V2,
            max_to_keep=3, pad_step_number=True, keep_checkpoint_every_n_hours=1.0)
def Model(_abnormal_data, _abnormal_label, _hidden_num, _elem_num, _file_name, _partition):
    """Build the graph of an ensemble recurrent autoencoder.

    ``ensemble_space`` encoders read the input step by step; their final
    states are concatenated (and optionally compressed) into one shared state
    from which ``ensemble_space`` decoders reconstruct the input.

    The construction is driven by module-level settings that are not
    parameters of this function: ``batch_num``, ``ensemble_space``,
    ``cell_type``, ``compress``, ``decode_without_input``, ``reverse`` and
    ``learning_rate``.

    Args:
        _abnormal_data: Array whose ``shape[1]`` and ``shape[2]`` fix the
            time and feature dimensions of the input placeholder.
        _abnormal_label: Not used by this function.
        _hidden_num: Number of units of each encoder cell.
        _elem_num: Output size of the decoder projection.
        _file_name: Forwarded to ``RSLSTMCell``.
        _partition: Forwarded to ``RSLSTMCell``.

    Returns:
        ``(g, p_input, d_dec, loss, optimizer, saver)``: the graph, the input
        placeholder, the dict of decoder outputs keyed ``dec_output_<k>``,
        the loss tensor, the Adam training op and a ``tf.train.Saver``.
    """
    tf.reset_default_graph()
    g = tf.Graph()
    with g.as_default():
        # placeholder list
        p_input = tf.placeholder(tf.float32, shape=(batch_num, _abnormal_data.shape[1], _abnormal_data.shape[2]))
        # One tensor per time step (axis 1 is split and squeezed away).
        p_inputs = [
            tf.squeeze(t, [1]) for t in tf.split(p_input, _abnormal_data.shape[1], 1)
        ]

        # projection_layer = tf.layers.Dense(units=_elem_num, use_bias=True)

        # with tf.device('/device:GPU:0'):
        # Encoder outputs/states, keyed 'enc_output_<j>' / 'enc_state_<j>'.
        d_enc = {}
        with tf.variable_scope('encoder'):
            for j in range(ensemble_space):
                # NOTE(review): only cell_type == 1 assigns all three of
                # pure_enc_cell / residual_enc_cell / enc_cell in this block;
                # the other cell types leave some of the names used below
                # unassigned -- confirm they are supported.
                if cell_type == 0:
                    enc_cell = tf.nn.rnn_cell.BasicRNNCell(_hidden_num)
                if cell_type == 1:
                    pure_enc_cell = LSTMCell(_hidden_num)
                    residual_enc_cell = RLSTMCell(_hidden_num, reuse=tf.AUTO_REUSE)
                    enc_cell = RSLSTMCell(_hidden_num, file_name=_file_name, component=j, partition=_partition, type='enc', reuse=tf.AUTO_REUSE)
                if cell_type == 2:
                    pure_enc_cell = GRUCell(_hidden_num)
                    enc_cell = RSGRUCell(_hidden_num)

                # Member 0 uses the plain cell, member 1 the residual cell,
                # every further member the RS* cell.
                if j == 0:
                    enc_state = pure_enc_cell.zero_state(batch_size=batch_num, dtype=tf.float32)
                    enc_outputs = []
                    for step in range(len(p_inputs)):
                        enc_input = p_inputs[step]
                        enc_output_, enc_state = pure_enc_cell(
                            enc_input, enc_state)
                        enc_outputs.append(enc_output_)
                    d_enc['enc_output_{0}'.format(j)] = enc_outputs
                    d_enc['enc_state_{0}'.format(j)] = enc_state
                elif j == 1:
                    enc_state = residual_enc_cell.zero_state(
                        batch_size=batch_num, dtype=tf.float32)
                    enc_outputs = []
                    for step in range(len(p_inputs)):
                        enc_input = p_inputs[step]
                        enc_output_, enc_state = residual_enc_cell(
                            enc_input, enc_state)
                        enc_outputs.append(enc_output_)
                    d_enc['enc_output_{0}'.format(j)] = enc_outputs
                    d_enc['enc_state_{0}'.format(j)] = enc_state
                else:
                    enc_state = enc_cell.zero_state(batch_size=batch_num, dtype=tf.float32)
                    enc_outputs = []
                    for step in range(len(p_inputs)):
                        enc_input = p_inputs[step]
                        enc_output_, enc_state = enc_cell(enc_input, enc_state)
                        enc_outputs.append(enc_output_)
                    d_enc['enc_output_{0}'.format(j)] = enc_outputs
                    d_enc['enc_state_{0}'.format(j)] = enc_state

        # Concatenate the final states of all encoders along axis 1.
        # Reads the .c / .h attributes of each state.
        shared_state_c = tf.concat([
            d_enc['enc_state_{0}'.format(j)].c for j in range(ensemble_space)
        ], axis=1)
        shared_state_h = tf.concat([
            d_enc['enc_state_{0}'.format(j)].h for j in range(ensemble_space)
        ], axis=1)

        if compress:
            # The same Dense layer object is applied to both c and h.
            compress_state = tf.layers.Dense(units=_hidden_num, activation=tf.tanh, use_bias=True)
            shared_state_c = compress_state(shared_state_c)
            shared_state_h = compress_state(shared_state_h)

        shared_state = LSTMStateTuple(shared_state_c, shared_state_h)

        # with tf.device('/device:GPU:1'):
        # Decoder outputs, keyed 'dec_output_<k>'.
        d_dec = {}
        with tf.variable_scope('decoder') as vs:
            dec_weight_ = tf.Variable(tf.truncated_normal(
                [_hidden_num * ensemble_space, _elem_num], dtype=tf.float32), name="dec_weight")
            dec_bias_ = tf.Variable(tf.constant(0.1, shape=[_elem_num], dtype=tf.float32), name="dec_bias")

            if decode_without_input:
                # Decoders are fed all-zero inputs at every step.
                for k in range(ensemble_space):
                    if cell_type == 0:
                        dec_cell = tf.nn.rnn_cell.BasicRNNCell(_hidden_num)
                    if cell_type == 1:
                        if compress:
                            pure_dec_cell = LSTMCell(_hidden_num)
                            residual_dec_cell = RLSTMCell(_hidden_num)
                            dec_cell = RSLSTMCell(_hidden_num, file_name=_file_name, component=k, partition=_partition, type='dec', reuse=tf.AUTO_REUSE)
                        else:
                            pure_dec_cell = LSTMCell(_hidden_num * ensemble_space)
                            residual_dec_cell = RLSTMCell(_hidden_num * ensemble_space)
                            dec_cell = RSLSTMCell(_hidden_num * ensemble_space, file_name=_file_name, component=k, partition=_partition, type='dec', reuse=tf.AUTO_REUSE)
                    if cell_type == 2:
                        if compress:
                            pure_dec_cell = GRUCell(_hidden_num)
                            dec_cell = RSGRUCell(_hidden_num)
                        else:
                            pure_dec_cell = GRUCell(_hidden_num * ensemble_space)
                            dec_cell = RSGRUCell(_hidden_num * ensemble_space)

                    if k == 0:
                        dec_inputs = [
                            tf.zeros(tf.shape(p_inputs[0]), dtype=tf.float32) for _ in range(len(p_inputs))
                        ]
                        dec_outputs, dec_state = tf.contrib.rnn.static_rnn(
                            pure_dec_cell, dec_inputs, initial_state=shared_state, dtype=tf.float32)
                    elif k == 1:
                        dec_inputs = [
                            tf.zeros(tf.shape(p_inputs[0]), dtype=tf.float32) for _ in range(len(p_inputs))
                        ]
                        dec_outputs, dec_state = tf.contrib.rnn.static_rnn(
                            residual_dec_cell, dec_inputs, initial_state=shared_state, dtype=tf.float32)
                    else:
                        dec_inputs = [
                            tf.zeros(tf.shape(p_inputs[0]), dtype=tf.float32) for _ in range(len(p_inputs))
                        ]
                        dec_outputs, dec_state = tf.contrib.rnn.static_rnn(
                            dec_cell, dec_inputs, initial_state=shared_state, dtype=tf.float32)

                    if reverse:
                        dec_outputs = dec_outputs[::-1]
                    # Stack the per-step outputs and swap the first two axes.
                    dec_output_ = tf.transpose(tf.stack(dec_outputs), [1, 0, 2])
                    # NOTE(review): dec_weight_ is reassigned to its tiled
                    # version inside the loop, so from the second iteration
                    # on the already-tiled tensor is expanded and tiled
                    # again -- verify for ensemble_space > 1.
                    dec_weight_ = tf.tile(tf.expand_dims(dec_weight_, 0), [batch_num, 1, 1])
                    d_dec['dec_output_{0}'.format(k)] = tf.matmul(
                        dec_output_, dec_weight_) + dec_bias_
                    # NOTE(review): here the stored value is a tensor, so
                    # [::-1] reverses its first axis -- confirm intent.
                    if reverse:
                        d_dec['dec_output_{0}'.format(k)] = d_dec[
                            'dec_output_{0}'.format(k)][::-1]
            else:
                # Each decoder is fed its own previous projected output.
                for k in range(ensemble_space):
                    if cell_type == 0:
                        dec_cell = tf.nn.rnn_cell.BasicRNNCell(_hidden_num)
                    if cell_type == 1:
                        if compress:
                            pure_dec_cell = LSTMCell(_hidden_num)
                            residual_dec_cell = RLSTMCell(_hidden_num, reuse=tf.AUTO_REUSE)
                            dec_cell = RSLSTMCell(_hidden_num, file_name=_file_name, component=k, partition=_partition, type='dec', reuse=tf.AUTO_REUSE)
                        else:
                            pure_dec_cell = LSTMCell(_hidden_num * ensemble_space)
                            residual_dec_cell = RLSTMCell(_hidden_num * ensemble_space, reuse=tf.AUTO_REUSE)
                            dec_cell = RSLSTMCell(_hidden_num * ensemble_space, file_name=_file_name, component=k, partition=_partition, type='dec', reuse=tf.AUTO_REUSE)
                    if cell_type == 2:
                        if compress:
                            pure_dec_cell = GRUCell(_hidden_num)
                            dec_cell = RSGRUCell(_hidden_num)
                        else:
                            pure_dec_cell = GRUCell(_hidden_num * ensemble_space)
                            dec_cell = RSGRUCell(_hidden_num * ensemble_space)

                    if k == 0:
                        dec_state = shared_state
                        dec_input_ = tf.zeros(tf.shape(p_inputs[0]), dtype=tf.float32)
                        dec_outputs = []
                        for step in range(len(p_inputs)):
                            # Share the cell variables across time steps.
                            if step > 0:
                                vs.reuse_variables()
                            dec_input_, dec_state = pure_dec_cell(
                                dec_input_, dec_state)
                            dec_input_ = tf.matmul(dec_input_, dec_weight_) + dec_bias_
                            dec_outputs.append(dec_input_)
                    elif k == 1:
                        dec_state = shared_state
                        dec_input_ = tf.zeros(tf.shape(p_inputs[0]), dtype=tf.float32)
                        dec_outputs = []
                        for step in range(len(p_inputs)):
                            if step > 0:
                                vs.reuse_variables()
                            dec_input_, dec_state = residual_dec_cell(
                                dec_input_, dec_state)
                            dec_input_ = tf.matmul(dec_input_, dec_weight_) + dec_bias_
                            dec_outputs.append(dec_input_)
                    else:
                        dec_state = shared_state
                        dec_input_ = tf.zeros(tf.shape(p_inputs[0]), dtype=tf.float32)
                        dec_outputs = []
                        for step in range(len(p_inputs)):
                            if step > 0:
                                vs.reuse_variables()
                            dec_input_, dec_state = dec_cell(
                                dec_input_, dec_state)
                            dec_input_ = tf.matmul(dec_input_, dec_weight_) + dec_bias_
                            dec_outputs.append(dec_input_)

                    # In this branch the stored value is the Python list of
                    # per-step outputs, so [::-1] reverses the step order.
                    d_dec['dec_output_{0}'.format(k)] = dec_outputs
                    if reverse:
                        d_dec['dec_output_{0}'.format(k)] = d_dec[
                            'dec_output_{0}'.format(k)][::-1]

        # Sum the reconstruction residuals of all ensemble members.
        # NOTE(review): only element [0] of each decoder output is compared
        # with the full p_input -- confirm this is intended.
        sum_of_difference = 0
        for i in range(ensemble_space):
            sum_of_difference += d_dec['dec_output_{0}'.format(i)][0] - p_input
        loss = tf.reduce_mean(tf.square(sum_of_difference))
        optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(loss)

        # Add ops to save and restore all the variables.
        saver = tf.train.Saver()
        return g, p_input, d_dec, loss, optimizer, saver
def _build_decoder(self):
    """Build the attention decoder and set ``self.logits``/``self.sample_id``.

    A ``GRUCell`` is wrapped in an ``EAttentionWrapper`` with three Bahdanau
    attention mechanisms over ``self.word_outputs``, ``self.uttn_outputs``
    and ``self.encoder_outputs``.  Decoding starts from the wrapper's zero
    state.

    Unless ``self.mode`` is ``ModelMode.infer`` the decoder is teacher-forced
    with ``self.target_input`` and ``self.logits`` holds the RNN outputs.  In
    inference mode greedy decoding runs for at most
    ``self.config.infer_max_len`` steps and ``self.logits`` is a
    ``tf.no_op()`` placeholder.
    """
    with tf.variable_scope("dialog_decoder"):
        with tf.variable_scope("decoder_output_projection"):
            # Fully connected layer; number of units = vocabulary size.
            output_layer = layers_core.Dense(
                self.config.vocab_size,
                use_bias=False,
                name="output_projection")

        with tf.variable_scope("decoder_rnn"):
            attn_mech = tc_seq2seq.BahdanauAttention(
                self.config.dec_hidden_size, self.word_outputs, None)
            attn_mech1 = tc_seq2seq.BahdanauAttention(
                self.config.dec_hidden_size, self.uttn_outputs, None)
            attn_mech2 = tc_seq2seq.BahdanauAttention(
                self.config.dec_hidden_size, self.encoder_outputs, None)
            # Each attribute reports its own mechanism; all three used to
            # read ``attn_mech`` (copy-paste slip).
            self.att1 = attn_mech.batch_size
            self.att2 = attn_mech1.batch_size
            self.att3 = attn_mech2.batch_size

            dec_cell = GRUCell(self.config.dec_hidden_size)
            dec_cell = EAttentionWrapper(
                dec_cell, [attn_mech, attn_mech1, attn_mech2],
                attention_layer_size=[
                    self.config.dec_hidden_size,
                    self.config.dec_hidden_size,
                    self.config.dec_hidden_size
                ])
            dec_init_state = dec_cell.zero_state(
                batch_size=self.batch_size, dtype=tf.float32)

            # Training or Eval
            if self.mode != ModelMode.infer:
                # Not inferring: feed the ground-truth tokens step by step.
                resp_emb_inp = tf.nn.embedding_lookup(
                    self.decoder_embeddings, self.target_input)
                helper = tc_seq2seq.TrainingHelper(resp_emb_inp,
                                                   self.target_length)
                decoder = tc_seq2seq.BasicDecoder(
                    cell=dec_cell,
                    helper=helper,
                    initial_state=dec_init_state,
                    output_layer=output_layer)
                dec_outputs, dec_state, _ = tc_seq2seq.dynamic_decode(
                    decoder)
                sample_id = dec_outputs.sample_id
                logits = dec_outputs.rnn_output
            else:
                start_tokens = tf.fill([self.batch_size],
                                       self.config.sos_idx)
                end_token = self.config.eos_idx
                maximum_iterations = tf.to_int32(self.config.infer_max_len)
                helper = tc_seq2seq.GreedyEmbeddingHelper(
                    self.decoder_embeddings,
                    start_tokens=start_tokens,
                    end_token=tf.constant(end_token, dtype=tf.int32))
                decoder = tc_seq2seq.BasicDecoder(
                    cell=dec_cell,
                    helper=helper,
                    initial_state=dec_init_state,
                    output_layer=output_layer)
                dec_outputs, dec_state, _ = tc_seq2seq.dynamic_decode(
                    decoder, maximum_iterations=maximum_iterations)
                # No logits are exposed in inference mode.
                logits = tf.no_op()
                sample_id = dec_outputs.sample_id

        self.logits = logits
        self.sample_id = sample_id
def __init__(self,
             num_symbols,
             num_embed_units,
             num_units,
             vocab=None,
             embed=None,
             name_scope=None,
             learning_rate=0.0001,
             learning_rate_decay_factor=0.95,
             max_gradient_norm=5,
             l2_lambda=0.2):
    """Build the graph of a post/response relevance discriminator.

    Posts, real responses and generated responses are encoded by
    bidirectional GRUs (real and generated responses share one encoder).  A
    bilinear score between the post and response encodings is concatenated
    with both encodings and mapped through two dense layers to a sigmoid
    score.  The loss pushes real pairs towards 1 and generated pairs towards
    0 (least-squares GAN style) with L2 regularisation of the scoring
    weights.

    Args:
        num_symbols: Vocabulary size.
        num_embed_units: Embedding dimension (used when ``embed`` is None).
        num_units: Number of units of every GRU direction.
        vocab: Vocabulary symbols stored in the ``symbols`` variable and
            used as keys of the string-to-index table.  Presumably required
            despite the ``None`` default -- confirm against callers.
        embed: Optional pre-trained embedding matrix; ``None`` initialises
            the table randomly.
        name_scope: Substring selecting this model's variables by name.
            ``None`` selects all variables.
        learning_rate: Initial value of the learning-rate variable.
        learning_rate_decay_factor: Multiplier applied by
            ``learning_rate_decay_op``.
        max_gradient_norm: Global-norm clipping threshold.
        l2_lambda: Weight of the L2 regularisation term.
    """

    def in_scope(variable):
        """Tell whether ``variable`` belongs to this model."""
        # ``None in str`` raises TypeError, so treat None as "no filter".
        return name_scope is None or name_scope in variable.name

    self.posts = tf.placeholder(tf.string, shape=[None, None])  # batch * len
    self.posts_length = tf.placeholder(tf.int32, shape=[None])  # batch
    self.responses = tf.placeholder(tf.string,
                                    shape=[None, None])  # batch * len
    self.responses_length = tf.placeholder(tf.int32, shape=[None])  # batch
    self.generation = tf.placeholder(tf.string,
                                     shape=[None, None])  # batch * len
    self.generation_length = tf.placeholder(tf.int32, shape=[None])  # batch

    # Build the vocab table (string to index).
    self.symbols = tf.Variable(vocab, trainable=False, name="symbols")
    self.symbol2index = HashTable(
        KeyValueTensorInitializer(
            self.symbols,
            tf.Variable(np.arange(num_symbols, dtype=np.int32), False)),
        default_value=UNK_ID,
        name="symbol2index")

    # Build the embedding table (index to vector).
    if embed is None:
        # Initialize the embedding randomly.
        self.embed = tf.get_variable('embed', [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        # Initialize the embedding with pre-trained word vectors.
        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)

    self.posts_input = self.symbol2index.lookup(self.posts)
    self.posts_input_embed = tf.nn.embedding_lookup(
        self.embed, self.posts_input)  # batch * utter_len * embed_unit
    self.responses_input = self.symbol2index.lookup(self.responses)
    self.responses_input_embed = tf.nn.embedding_lookup(
        self.embed, self.responses_input)  # batch * utter_len * embed_unit
    self.generation_input = self.symbol2index.lookup(self.generation)
    self.generation_input_embed = tf.nn.embedding_lookup(
        self.embed, self.generation_input)  # batch * utter_len * embed_unit

    # Bidirectional GRU cells for the post and response encoders.
    cell_fw_post = GRUCell(num_units)
    cell_bw_post = GRUCell(num_units)
    cell_fw_resp = GRUCell(num_units)
    cell_bw_resp = GRUCell(num_units)

    # Encode the post sequence.
    with variable_scope.variable_scope("post_encoder"):
        posts_state, posts_final_state = tf.nn.bidirectional_dynamic_rnn(
            cell_fw_post,
            cell_bw_post,
            self.posts_input_embed,
            self.posts_length,
            dtype=tf.float32)
        posts_final_state_bid = tf.concat(
            posts_final_state, 1)  # batch_size * (2 * num_units)

    # Encode the real response sequence.
    with variable_scope.variable_scope("resp_encoder"):
        responses_state, responses_final_state = \
            tf.nn.bidirectional_dynamic_rnn(
                cell_fw_resp,
                cell_bw_resp,
                self.responses_input_embed,
                self.responses_length,
                dtype=tf.float32)
        responses_final_state_bid = tf.concat(responses_final_state, 1)

    # Encode the generated response with the same (reused) encoder.
    with variable_scope.variable_scope("resp_encoder", reuse=True):
        generation_state, generation_final_state = \
            tf.nn.bidirectional_dynamic_rnn(
                cell_fw_resp,
                cell_bw_resp,
                self.generation_input_embed,
                self.generation_length,
                dtype=tf.float32)
        generation_final_state_bid = tf.concat(generation_final_state, 1)

    # Relevance score between post and real response.
    with variable_scope.variable_scope("calibration"):
        self.W = tf.get_variable('W', [2 * num_units, 2 * num_units],
                                 tf.float32)
        vec_post = tf.reshape(posts_final_state_bid,
                              [-1, 1, 2 * num_units])
        vec_resp = tf.reshape(responses_final_state_bid,
                              [-1, 2 * num_units, 1])
        # Bilinear form post^T W resp, one scalar per example.
        attn_score_true = tf.einsum(
            'aij,ajk->aik', tf.einsum('aij,jk->aik', vec_post, self.W),
            vec_resp)
        attn_score_true = tf.reshape(attn_score_true, [-1, 1])
        fc_true_input = tf.concat([
            posts_final_state_bid, responses_final_state_bid,
            attn_score_true
        ], 1)
        self.output_fc_W = tf.get_variable("output_fc_W",
                                           [4 * num_units + 1, num_units],
                                           tf.float32)
        self.output_fc_b = tf.get_variable("output_fc_b", [num_units],
                                           tf.float32)
        fc_true = tf.nn.tanh(
            tf.nn.xw_plus_b(fc_true_input, self.output_fc_W,
                            self.output_fc_b))
        self.output_W = tf.get_variable("output_W", [num_units, 1],
                                        tf.float32)
        self.output_b = tf.get_variable("output_b", [1], tf.float32)
        self.cost_true = tf.nn.sigmoid(
            tf.nn.xw_plus_b(fc_true, self.output_W, self.output_b))

    # Relevance score between post and generated response (same weights).
    with variable_scope.variable_scope("calibration", reuse=True):
        vec_gen = tf.reshape(generation_final_state_bid,
                             [-1, 2 * num_units, 1])
        attn_score_false = tf.einsum(
            'aij,ajk->aik', tf.einsum('aij,jk->aik', vec_post, self.W),
            vec_gen)
        attn_score_false = tf.reshape(attn_score_false, [-1, 1])
        fc_false_input = tf.concat([
            posts_final_state_bid, generation_final_state_bid,
            attn_score_false
        ], 1)
        fc_false = tf.nn.tanh(
            tf.nn.xw_plus_b(fc_false_input, self.output_fc_W,
                            self.output_fc_b))
        self.cost_false = tf.nn.sigmoid(
            tf.nn.xw_plus_b(fc_false, self.output_W, self.output_b))

    self.PR_cost = tf.reduce_mean(
        tf.reduce_sum(tf.square(self.cost_true - 1.0), axis=1))
    self.PG_cost = tf.reduce_mean(
        tf.reduce_sum(tf.square(self.cost_false), axis=1))

    # Use a loss similar to least-squares GAN.
    self.cost = self.PR_cost / 2.0 + self.PG_cost / 2.0 + l2_lambda * (
        tf.nn.l2_loss(self.output_fc_W) + tf.nn.l2_loss(self.output_fc_b) +
        tf.nn.l2_loss(self.output_W) + tf.nn.l2_loss(self.output_b) +
        tf.nn.l2_loss(self.W))

    # Graph construction finished: collect this model's parameters.
    self.params = [k for k in tf.trainable_variables() if in_scope(k)]

    # Initialize the training process.
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)
    self.adv_global_step = tf.Variable(0, trainable=False)

    # Calculate the gradient of parameters.
    opt = tf.train.AdamOptimizer(self.learning_rate)
    gradients = tf.gradients(self.cost, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    # Score of the generated response, one value per example.
    self.reward = tf.reduce_sum(self.cost_false, axis=1)  # batch

    all_variables = [k for k in tf.global_variables() if in_scope(k)]
    self.saver = tf.train.Saver(all_variables,
                                write_version=tf.train.SaverDef.V2,
                                max_to_keep=5,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
    self.adv_saver = tf.train.Saver(all_variables,
                                    write_version=tf.train.SaverDef.V2,
                                    max_to_keep=5,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)
def attention_decoder(inputs,
                      memory,
                      num_units=None,
                      batch_size=1,
                      inputs_length=None,
                      n_mels=80,
                      reduction=1,
                      default_max_iters=200,
                      is_training=True,
                      scope='attention_decoder',
                      reuse=None):
    """Decode mel frames with an attention-equipped GRU stack.

    :param inputs: A 3d tensor with shape of [N, T', C']. Decoder inputs.
    :param memory: A 3d tensor with shape of [N, T, C]. Outputs of encoder
        network; this is what the Bahdanau attention attends over.
    :param num_units: An int. Attention size. When None, the last static
        dimension of `inputs` is used.
    :param batch_size: An int. Batch size used for the zero initial state
        and for the inference helper.
    :param inputs_length: Passed as `sequence_length` to the training
        helper; not used when `is_training` is False.
    :param n_mels: An int. Number of Mel banks to generate.
    :param reduction: An int. Reduction factor. Paper => 2, 3, 5.
    :param default_max_iters: Decoding step limit applied only when
        `is_training` is False.
    :param is_training: running mode.
    :param scope: Optional scope for `variable_scope`.
    :param reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.
    :return: A tuple `(decoder_outputs, final_decoder_state)` as produced by
        `dynamic_decode`; the outputs are annotated [N, T_out/r, M*r] in
        the original source.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Training decodes until the helper finishes; inference is capped.
        step_limit = None if is_training else default_max_iters

        if num_units is None:
            num_units = inputs.get_shape().as_list()[-1]

        # Base GRU wrapped with Bahdanau attention over the encoder memory.
        base_gru = tf.nn.rnn_cell.GRUCell(num_units)
        attention_mechanism = BahdanauAttention(num_units, memory)
        attended_gru = AttentionWrapper(base_gru,
                                        attention_mechanism,
                                        alignment_history=True)

        # Cell output and attention context are concatenated, projected back
        # to num_units, then fed through two residual GRU layers.
        context_cell = ConcatOutputAndAttentionWrapper(attended_gru)
        stacked_layers = [
            OutputProjectionWrapper(context_cell, num_units),
            ResidualWrapper(GRUCell(num_units)),
            ResidualWrapper(GRUCell(num_units)),
        ]
        stacked_cell = MultiRNNCell(stacked_layers, state_is_tuple=True)

        # Each decoder step predicts `reduction` mel frames at once.
        mel_cell = OutputProjectionWrapper(stacked_cell, n_mels * reduction)
        initial_state = mel_cell.zero_state(batch_size=batch_size,
                                            dtype=tf.float32)

        if is_training:
            step_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=inputs,
                sequence_length=inputs_length,
                time_major=False)
        else:
            step_helper = TacotronInferenceHelper(batch_size, n_mels,
                                                  reduction)

        mel_decoder = BasicDecoder(mel_cell, step_helper, initial_state)
        decoded, final_decoder_state, _ = dynamic_decode(
            mel_decoder, maximum_iterations=step_limit)
        decoder_outputs, _ = decoded

    return decoder_outputs, final_decoder_state
def Model(_abnormal_data, _abnormal_label, _hidden_num, _elem_num,
          _file_name, _partition):
    """Build a graph holding an ensemble of recurrent encoders and decoders.

    Every ensemble member encodes the same input placeholder; the members'
    final states are linearly mapped, concatenated (optionally compressed by
    a dense layer) and used as the initial state of every decoder.

    Reads the module-level settings `batch_num`, `ensemble_space`,
    `cell_type`, `compress`, `decode_without_input`, `reverse` and
    `learning_rate`; none of them is defined in this function.

    Args:
        _abnormal_data: object whose `.shape[1]` and `.shape[2]` size the
            input placeholder.
        _abnormal_label: not used by this function.
        _hidden_num: number of units of each recurrent cell.
        _elem_num: units of the output projection layer and sample shape of
            the inference helper.
        _file_name: forwarded to the RKLSTMCell / RSLSTMCell constructors.
        _partition: forwarded to the RKLSTMCell / RSLSTMCell constructors.

    Returns:
        Tuple `(g, p_input, d_dec, loss, optimizer, saver)` where `d_dec`
        maps 'dec_output_<k>' (and, in the decode-without-input branch,
        'dec_state_<k>') to the decoder results of ensemble member k.
    """
    # NOTE(review): nesting below was reconstructed from a source whose
    # whitespace was collapsed; confirm scope membership where it matters.
    tf.reset_default_graph()
    g = tf.Graph()
    with g.as_default():
        # placeholder list
        p_input = tf.placeholder(tf.float32,
                                 shape=(batch_num, _abnormal_data.shape[1],
                                        _abnormal_data.shape[2]))
        # p_inputs = [tf.squeeze(t, [1]) for t in tf.split(p_input, _abnormal_data.shape[1], 1)]

        # Regularizer signature
        l1_regularizer = tf.contrib.layers.l1_regularizer(scale=0.005,
                                                          scope=None)

        # Projection layer
        projection_layer = tf.layers.Dense(units=_elem_num, use_bias=True)

        # with tf.device('/device:GPU:0'):
        d_enc = {}
        with tf.variable_scope('encoder'):
            for j in range(ensemble_space):
                # create RNN cell
                # NOTE(review): cell_type == 0 binds only `enc_cell` and
                # cell_type == 2 never binds `residual_enc_cell`, so the
                # j == 0 / j == 1 branches below would hit an unbound name
                # for those settings; confirm which cell types are
                # supported.
                if cell_type == 0:
                    enc_cell = tf.nn.rnn_cell.BasicRNNCell(_hidden_num)
                if cell_type == 1:
                    pure_enc_cell = LSTMCell(_hidden_num)
                    residual_enc_cell = RLSTMCell(_hidden_num)
                    # enc_cell = RSLSTMCell(_hidden_num, file_name=_file_name, type='enc', partition=_partition,
                    #                       component=j, reuse=tf.AUTO_REUSE)
                    enc_cell = RKLSTMCell(_hidden_num,
                                          file_name=_file_name,
                                          type='enc',
                                          partition=_partition,
                                          component=j,
                                          reuse=tf.AUTO_REUSE)
                if cell_type == 2:
                    pure_enc_cell = GRUCell(_hidden_num)
                    enc_cell = RSGRUCell(_hidden_num)

                # Member 0 uses the plain cell, member 1 the residual cell,
                # all further members the third cell variant.
                if j == 0:
                    d_enc['enc_output_{0}'.format(j)], d_enc[
                        'enc_state_{0}'.format(j)] = tf.nn.dynamic_rnn(
                            pure_enc_cell, p_input, dtype=tf.float32)
                elif j == 1:
                    d_enc['enc_output_{0}'.format(j)], d_enc[
                        'enc_state_{0}'.format(j)] = tf.nn.dynamic_rnn(
                            residual_enc_cell, p_input, dtype=tf.float32)
                else:
                    d_enc['enc_output_{0}'.format(j)], d_enc[
                        'enc_state_{0}'.format(j)] = tf.nn.dynamic_rnn(
                            enc_cell, p_input, dtype=tf.float32)

        # shared_state_c = tf.concat([d_enc['enc_state_{0}'.format(j)].c for j in range(ensemble_space)], axis=1)
        # shared_state_h = tf.concat([d_enc['enc_state_{0}'.format(j)].h for j in range(ensemble_space)], axis=1)
        # One linear map per state part (c and h), shared by all members.
        # The `.c` / `.h` accesses assume LSTM-style state tuples.
        w_c = tf.Variable(tf.zeros([_hidden_num, _hidden_num]))
        b_c = tf.Variable(tf.zeros([_hidden_num]))
        w_h = tf.Variable(tf.zeros([_hidden_num, _hidden_num]))
        b_h = tf.Variable(tf.zeros([_hidden_num]))

        shared_state_c = tf.concat([
            tf.matmul(d_enc['enc_state_{0}'.format(j)].c, w_c) + b_c
            for j in range(ensemble_space)
        ],
                                   axis=1)
        shared_state_h = tf.concat([
            tf.matmul(d_enc['enc_state_{0}'.format(j)].h, w_h) + b_h
            for j in range(ensemble_space)
        ],
                                   axis=1)

        if compress:
            # The same dense layer squeezes both concatenated parts back to
            # _hidden_num units.
            compress_state = tf.layers.Dense(units=_hidden_num,
                                             activation=tf.tanh,
                                             use_bias=True)
            shared_state_c = compress_state(shared_state_c)
            shared_state_h = compress_state(shared_state_h)

        shared_state = LSTMStateTuple(shared_state_c, shared_state_h)

        # with tf.device('/device:GPU:1'):
        d_dec = {}
        with tf.variable_scope('decoder') as vs:
            if decode_without_input:
                # Decoders are driven by an all-zero sequence shaped like
                # the input placeholder.
                dec_input = tf.zeros(
                    [p_input.shape[0], p_input.shape[1], p_input.shape[2]],
                    dtype=tf.float32)
                for k in range(ensemble_space):
                    # create RNN cell
                    if cell_type == 0:
                        dec_cell = tf.nn.rnn_cell.BasicRNNCell(_hidden_num)
                    if cell_type == 1:
                        # Without compression the shared state is
                        # ensemble_space times wider, so the cells are too.
                        if compress:
                            pure_dec_cell = LSTMCell(_hidden_num)
                            residual_dec_cell = RLSTMCell(_hidden_num)
                            dec_cell = RSLSTMCell(_hidden_num,
                                                  file_name=_file_name,
                                                  type='dec',
                                                  partition=_partition,
                                                  component=k,
                                                  reuse=tf.AUTO_REUSE)
                        else:
                            pure_dec_cell = LSTMCell(_hidden_num *
                                                     ensemble_space)
                            residual_dec_cell = RLSTMCell(_hidden_num *
                                                          ensemble_space)
                            dec_cell = RSLSTMCell(_hidden_num *
                                                  ensemble_space,
                                                  file_name=_file_name,
                                                  type='dec',
                                                  partition=_partition,
                                                  component=k,
                                                  reuse=tf.AUTO_REUSE)
                    if cell_type == 2:
                        if compress:
                            pure_dec_cell = GRUCell(_hidden_num)
                            dec_cell = RSGRUCell(_hidden_num)
                        else:
                            pure_dec_cell = GRUCell(_hidden_num *
                                                    ensemble_space)
                            dec_cell = RSGRUCell(_hidden_num *
                                                 ensemble_space)

                    if k == 0:
                        d_dec['dec_output_{0}'.format(k)], d_dec[
                            'dec_state_{0}'.format(k)] = tf.nn.dynamic_rnn(
                                pure_dec_cell,
                                dec_input,
                                initial_state=shared_state,
                                dtype=tf.float32)
                    elif k == 1:
                        d_dec['dec_output_{0}'.format(k)], d_dec[
                            'dec_state_{0}'.format(k)] = tf.nn.dynamic_rnn(
                                residual_dec_cell,
                                dec_input,
                                initial_state=shared_state,
                                dtype=tf.float32)
                    else:
                        d_dec['dec_output_{0}'.format(k)], d_dec[
                            'dec_state_{0}'.format(k)] = tf.nn.dynamic_rnn(
                                dec_cell,
                                dec_input,
                                initial_state=shared_state,
                                dtype=tf.float32)

                    # NOTE(review): `[::-1]` on a tensor reverses its
                    # leading axis; with batch-major outputs that is
                    # presumably the batch axis rather than time. Confirm
                    # the intended axis.
                    if reverse:
                        d_dec['dec_output_{0}'.format(k)] = d_dec[
                            'dec_output_{0}'.format(k)][::-1]
            else:
                # Decoders start from a zero frame and feed each projected
                # output back in as the next input (identity sample_fn).
                dec_input = tf.zeros([p_input.shape[0], p_input.shape[2]],
                                     dtype=tf.float32)
                for k in range(ensemble_space):
                    # create RNN cell
                    if cell_type == 0:
                        dec_cell = tf.nn.rnn_cell.BasicRNNCell(_hidden_num)
                    if cell_type == 1:
                        if compress:
                            pure_dec_cell = LSTMCell(_hidden_num)
                            residual_dec_cell = RLSTMCell(_hidden_num)
                            # dec_cell = RSLSTMCell(_hidden_num, file_name=_file_name, type='dec', partition=_partition,
                            #                       component=k, reuse=tf.AUTO_REUSE)
                            dec_cell = RKLSTMCell(_hidden_num,
                                                  file_name=_file_name,
                                                  type='dec',
                                                  partition=_partition,
                                                  component=k,
                                                  reuse=tf.AUTO_REUSE)
                        else:
                            pure_dec_cell = LSTMCell(_hidden_num *
                                                     ensemble_space)
                            residual_dec_cell = RLSTMCell(_hidden_num *
                                                          ensemble_space)
                            # dec_cell = RSLSTMCell(_hidden_num * ensemble_space, file_name=_file_name, type='dec',
                            #                       partition=_partition, component=k, reuse=tf.AUTO_REUSE)
                            dec_cell = RKLSTMCell(_hidden_num *
                                                  ensemble_space,
                                                  file_name=_file_name,
                                                  type='dec',
                                                  partition=_partition,
                                                  component=k,
                                                  reuse=tf.AUTO_REUSE)
                    if cell_type == 2:
                        if compress:
                            pure_dec_cell = GRUCell(_hidden_num)
                            dec_cell = RSGRUCell(_hidden_num)
                        else:
                            pure_dec_cell = GRUCell(_hidden_num *
                                                    ensemble_space)
                            dec_cell = RSGRUCell(_hidden_num *
                                                 ensemble_space)

                    # end_fn always returns False, so decoding stops only at
                    # maximum_iterations.
                    inference_helper = tf.contrib.seq2seq.InferenceHelper(
                        sample_fn=lambda outputs: outputs,
                        sample_shape=[_elem_num],
                        sample_dtype=tf.float32,
                        start_inputs=dec_input,
                        end_fn=lambda sample_ids: False)

                    if k == 0:
                        inference_decoder = tf.contrib.seq2seq.BasicDecoder(
                            pure_dec_cell,
                            inference_helper,
                            shared_state,
                            output_layer=projection_layer)
                    elif k == 1:
                        inference_decoder = tf.contrib.seq2seq.BasicDecoder(
                            residual_dec_cell,
                            inference_helper,
                            shared_state,
                            output_layer=projection_layer)
                    else:
                        inference_decoder = tf.contrib.seq2seq.BasicDecoder(
                            dec_cell,
                            inference_helper,
                            shared_state,
                            output_layer=projection_layer)

                    d_dec['dec_output_{0}'.format(
                        k)], _, _ = tf.contrib.seq2seq.dynamic_decode(
                            inference_decoder,
                            impute_finished=True,
                            maximum_iterations=p_input.shape[1])

                    # NOTE(review): here the stored value is the first
                    # result of dynamic_decode, not a bare tensor, so
                    # `[::-1]` reverses that container's fields rather than
                    # any time axis. Confirm intent.
                    if reverse:
                        d_dec['dec_output_{0}'.format(k)] = d_dec[
                            'dec_output_{0}'.format(k)][::-1]

        # Loss: mean square of the summed per-member differences.
        # NOTE(review): `[0]` selects the first field of the decode output
        # in the feed-back branch, but the first slice along the leading
        # axis of a plain tensor in the decode-without-input branch. Confirm
        # the latter is intended.
        sum_of_difference = 0
        for i in range(ensemble_space):
            sum_of_difference += d_dec['dec_output_{0}'.format(i)][0] - p_input

        loss = tf.reduce_mean(tf.square(sum_of_difference))
        # L1 penalty on the shared state that seeds every decoder.
        regularization_penalty = tf.contrib.layers.apply_regularization(
            l1_regularizer, [shared_state])
        loss = loss + regularization_penalty
        optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(loss)

        # Add ops to save and restore all the variables.
        saver = tf.train.Saver()
    return g, p_input, d_dec, loss, optimizer, saver
def gru_cell():
    """Create a GRU cell sized by `self.config.hidden_dim`.

    `self` is not a parameter; it is taken from the enclosing scope.
    """
    hidden_units = self.config.hidden_dim
    return GRUCell(hidden_units)
def Model(_j, _abnormal_data, _abnormal_label, _hidden_num, _elem_num,
          _file_name, _partition):
    """Build the graph of one recurrent autoencoder (ensemble member `_j`).

    The input sequence is encoded step by step; the final encoder state
    seeds a decoder that either consumes zeros (`decode_without_input`) or
    feeds its own projected output back in as the next input.

    Reads the module-level settings `batch_num`, `cell_type`,
    `decode_without_input`, `reverse` and `learning_rate`; none of them is
    defined in this function.

    Args:
        _j: ensemble member index. 0 selects the plain cells, 1 the
            residual cells, any other value the third cell variant; it is
            also forwarded as `component` to RSLSTMCell.
        _abnormal_data: object whose `.shape[1]` (number of steps) and
            `.shape[2]` size the input placeholder.
        _abnormal_label: not used by this function.
        _hidden_num: number of units of each recurrent cell.
        _elem_num: number of columns of the decoder projection.
        _file_name: forwarded to the RSLSTMCell constructors.
        _partition: forwarded to the RSLSTMCell constructors.

    Returns:
        Tuple `(g, p_input, dec_outputs, loss, optimizer, saver)`.
        `dec_outputs` is a batch-major tensor (batch, steps, _elem_num) in
        both decoding modes.

    Fix relative to the previous version: in the feed-back decoding mode
    the per-step outputs were left as a Python list, which TensorFlow packs
    time-major (steps, batch, elem), and subtracted from the batch-major
    input. They are now stacked and transposed exactly as the
    decode-without-input mode already did, so the loss compares matching
    elements.
    """
    tf.reset_default_graph()
    g = tf.Graph()
    with g.as_default():
        # Input placeholder and its per-step slices, each (batch, elem).
        p_input = tf.placeholder(tf.float32,
                                 shape=(batch_num, _abnormal_data.shape[1],
                                        _abnormal_data.shape[2]))
        p_inputs = [
            tf.squeeze(t, [1])
            for t in tf.split(p_input, _abnormal_data.shape[1], 1)
        ]
        num_steps = len(p_inputs)

        # create RNN cell
        # NOTE(review): cell_type == 0 binds no pure/residual cells and
        # cell_type == 2 binds no residual cells, so `_j` values that need
        # them raise NameError (unchanged behaviour); confirm which
        # combinations are meant to be supported.
        if cell_type == 0:
            enc_cell = tf.nn.rnn_cell.BasicRNNCell(_hidden_num)
            dec_cell = tf.nn.rnn_cell.BasicRNNCell(_hidden_num)
        if cell_type == 1:
            pure_enc_cell = LSTMCell(_hidden_num)
            pure_dec_cell = LSTMCell(_hidden_num)
            residual_enc_cell = RLSTMCell(_hidden_num)
            residual_dec_cell = RLSTMCell(_hidden_num)
            enc_cell = RSLSTMCell(_hidden_num,
                                  file_name=_file_name,
                                  component=_j,
                                  partition=_partition,
                                  type='enc')
            dec_cell = RSLSTMCell(_hidden_num,
                                  file_name=_file_name,
                                  component=_j,
                                  partition=_partition,
                                  type='dec')
        if cell_type == 2:
            pure_enc_cell = GRUCell(_hidden_num)
            pure_dec_cell = GRUCell(_hidden_num)
            enc_cell = RGRUCell(_hidden_num)
            dec_cell = RGRUCell(_hidden_num)

        with tf.variable_scope("encoder"):
            # Conditional expressions evaluate only the selected name, so an
            # unbound alternative is harmless unless it is actually chosen.
            active_enc_cell = (pure_enc_cell if _j == 0 else
                               residual_enc_cell if _j == 1 else enc_cell)
            enc_state = active_enc_cell.zero_state(batch_size=batch_num,
                                                   dtype=tf.float32)
            # Only the final state is needed; per-step outputs are dropped.
            for enc_input in p_inputs:
                _, enc_state = active_enc_cell(enc_input, enc_state)

        with tf.variable_scope('decoder') as vs:
            dec_weight_ = tf.Variable(tf.truncated_normal(
                [_hidden_num, _elem_num], dtype=tf.float32),
                                      name="dec_weight")
            dec_bias_ = tf.Variable(tf.constant(0.1,
                                                shape=[_elem_num],
                                                dtype=tf.float32),
                                    name="dec_bias")
            active_dec_cell = (pure_dec_cell if _j == 0 else
                               residual_dec_cell if _j == 1 else dec_cell)

            if decode_without_input:
                # Drive the decoder with zeros, then project all steps at
                # once with a batch-tiled weight matrix.
                dec_inputs = [
                    tf.zeros(tf.shape(p_inputs[0]), dtype=tf.float32)
                    for _ in range(num_steps)
                ]
                dec_outputs, _ = tf.contrib.rnn.static_rnn(
                    active_dec_cell,
                    dec_inputs,
                    initial_state=enc_state,
                    dtype=tf.float32)
                if reverse:
                    dec_outputs = dec_outputs[::-1]
                dec_output_ = tf.transpose(tf.stack(dec_outputs), [1, 0, 2])
                dec_weight_ = tf.tile(tf.expand_dims(dec_weight_, 0),
                                      [batch_num, 1, 1])
                dec_outputs = tf.matmul(dec_output_, dec_weight_) + dec_bias_
            else:
                # Feed each projected output back in as the next input.
                dec_state = enc_state
                dec_input_ = tf.zeros(tf.shape(p_inputs[0]),
                                      dtype=tf.float32)
                dec_outputs = []
                for step in range(num_steps):
                    if step > 0:
                        vs.reuse_variables()
                    dec_input_, dec_state = active_dec_cell(
                        dec_input_, dec_state)
                    dec_input_ = tf.matmul(dec_input_,
                                           dec_weight_) + dec_bias_
                    dec_outputs.append(dec_input_)
                if reverse:
                    dec_outputs = dec_outputs[::-1]
                # The list is time-major once stacked; switch to batch-major
                # so it lines up with p_input in the loss below.
                dec_outputs = tf.transpose(tf.stack(dec_outputs), [1, 0, 2])

        loss = tf.reduce_mean(tf.square(p_input - dec_outputs))
        optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(loss)

        # Add ops to save and restore all the variables.
        saver = tf.train.Saver()
    return g, p_input, dec_outputs, loss, optimizer, saver
def model_fn(features,
             labels,
             mode,
             params,
             word_embeddings_np=None,
             char_embeddings_np=None):
    """Estimator model function for span extraction with passage ranking.

    Encodes question and passage, runs a gated attention bi-GRU over the
    passage, predicts start/end logits with a pointer network and, outside
    PREDICT mode, scores each passage for ranking.

    Args:
        features: dict read with the keys 'question_length',
            'passage_length', 'question_words', 'question_chars',
            'passage_words', 'passage_chars', 'question_char_length',
            'passage_char_length', 'partitions' and 'partitions_len'.
        labels: unpacked as `(answer_start, answer_end, passage_rank)`;
            each is used as the label distribution of a softmax
            cross-entropy. Not touched in PREDICT mode.
        mode: a `tf.estimator.ModeKeys` value.
        params: hyper-parameter object (units, attention, dropout,
            vocabulary and embedding sizes, passage counts, r, grad_clip,
            learning_rate, word_vocab_file, ...).
        word_embeddings_np: value fed to the word-embedding placeholder
            through the Scaffold's init_feed_dict (TRAIN mode only).
        char_embeddings_np: same, for the character embeddings.

    Returns:
        An EstimatorSpec for the PREDICT, TRAIN or EVAL mode.
    """
    # NOTE(review): nesting of the device / variable scopes below was
    # reconstructed from a source whose whitespace was collapsed; confirm
    # device placement of the 'attention' and 'pointer' sections.
    attention_fun = partial(BahdanauAttention, num_units=params.units) if params.attention == 'bahdanau' \
        else partial(LuongAttention, num_units=2 * params.units)

    # Dropout is active only while training.
    dropout = params.dropout if mode == tf.estimator.ModeKeys.TRAIN else 0.0
    passage_count = params.passage_count if mode != tf.estimator.ModeKeys.TRAIN \
        else params.train_passage_count

    question_words_length = features['question_length']
    passage_words_length = features['passage_length']

    devices = get_devices()

    with tf.device('/cpu:0'):
        # Embedding tables are initialised from placeholders so the numpy
        # matrices are fed at init time instead of being baked into the
        # graph.
        word_embeddings_placeholder = tf.placeholder(
            shape=[params.vocab_size, params.emb_size], dtype=tf.float32)
        char_embeddings_placeholder = tf.placeholder(
            shape=[params.char_vocab_size, params.char_emb_size],
            dtype=tf.float32)

        # word_embeddings = tf.create_partitioned_variables(shape=[params.vocab_size, params.emb_size],
        #                                                   slicing=[10, 1],
        #                                                   initializer=word_embeddings_placeholder,
        #                                                   trainable=False, name="word_embeddings")
        word_embeddings = tf.Variable(word_embeddings_placeholder,
                                      trainable=False,
                                      name="word_embeddings")
        char_embeddings = tf.Variable(char_embeddings_placeholder,
                                      trainable=False,
                                      name="char_embeddings")

        # noise_shape [rows, 1] drops whole embedding rows at once.
        word_embeddings = tf.nn.dropout(word_embeddings,
                                        1.0 - dropout,
                                        noise_shape=[params.vocab_size, 1])
        char_embeddings = tf.nn.dropout(
            char_embeddings,
            1.0 - dropout,
            noise_shape=[params.char_vocab_size, 1])

        question_words_emb = tf.nn.embedding_lookup(word_embeddings,
                                                    features['question_words'])
        question_chars_emb = tf.nn.embedding_lookup(char_embeddings,
                                                    features['question_chars'])

        passage_words_emb = tf.nn.embedding_lookup(word_embeddings,
                                                   features['passage_words'])
        passage_chars_emb = tf.nn.embedding_lookup(char_embeddings,
                                                   features['passage_chars'])

    with tf.device(next(devices)):
        with tf.variable_scope('question_encoding'):
            question_enc = encoder(question_words_emb,
                                   question_words_length,
                                   question_chars_emb,
                                   features['question_char_length'],
                                   params,
                                   dropout=dropout)

    with tf.device(next(devices)):
        with tf.variable_scope('passage_encoding'):
            passage_enc = encoder(passage_words_emb,
                                  passage_words_length,
                                  passage_chars_emb,
                                  features['passage_char_length'],
                                  params,
                                  dropout=dropout)
        # question_enc = tf.Print(question_enc, [question_enc], summarize=1000)

        with tf.variable_scope('attention'):
            # Both directions share one attention mechanism over the
            # question encoding.
            attention = attention_fun(
                memory=question_enc,
                memory_sequence_length=question_words_length)

            cell_fw = GatedAttentionWrapper(
                attention,
                DropoutWrapper(
                    GRUCell(params.units, name="attention_gru"),
                    # output_keep_prob=1.0 - dropout,
                    input_keep_prob=1.0 - dropout,
                    # state_keep_prob=1.0 - dropout,
                    variational_recurrent=True,
                    input_size=4 * params.units,
                    dtype=tf.float32),
                dropout=0)

            cell_bw = GatedAttentionWrapper(
                attention,
                DropoutWrapper(
                    GRUCell(params.units, name="attention_gru"),
                    # output_keep_prob=1.0 - dropout,
                    input_keep_prob=1.0 - dropout,
                    # state_keep_prob=1.0 - dropout
                    variational_recurrent=True,
                    input_size=4 * params.units,
                    dtype=tf.float32),
                dropout=0)

            passage_repr, _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                passage_enc,
                passage_words_length,
                dtype=tf.float32)
            # Join forward and backward outputs along the feature axis.
            passage_repr = tf.concat(passage_repr, -1)

        with tf.variable_scope('pointer'):
            question_att = attention_fun(
                memory=question_enc,
                memory_sequence_length=question_words_length,
                name="question_align")

            # A learned vector, tiled to one row per batch element, is the
            # attention query that pools the question encoding.
            pool_param = tf.get_variable('pool_param',
                                         shape=(question_att._num_units, ),
                                         initializer=tf.initializers.ones)
            pool_param = tf.reshape(
                tf.tile(pool_param, [tf.shape(question_enc)[0]]),
                (-1, question_att._num_units))

            question_alignments, _ = question_att(pool_param, None)
            question_pool = tf.reduce_sum(
                tf.expand_dims(question_alignments, -1) * question_enc, 1)

            logits1, logits2 = pointer_net(passage_repr,
                                           passage_words_length,
                                           question_pool,
                                           params,
                                           attention_fun=attention_fun,
                                           dropout=dropout)

        # Outer product of start and end probabilities; the band keeps only
        # pairs with 0 <= end - start <= 15.
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, 15)
        p1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        p2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {'start': p1, 'end': p2}

        export_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }

        return tf.estimator.EstimatorSpec(mode,
                                          predictions=predictions,
                                          export_outputs=export_outputs)

    with tf.variable_scope('passage_ranking'):
        W_g = Dense(params.units, activation=tf.tanh, use_bias=False)
        v_g = Dense(1, use_bias=False)

        # The memory/query layers are created once and handed to every
        # per-passage attention so all passages share the same weights.
        memory_layer = Dense(params.units,
                             name="memory_layer",
                             use_bias=False,
                             dtype=tf.float32)
        query_layer = Dense(params.units,
                            name="query_layer",
                            use_bias=False,
                            dtype=tf.float32)
        g = []

        for i in range(passage_count):
            # Select the positions of passage i, split them per example
            # using 'partitions_len' and pad each piece to a fixed length.
            passage_mask = tf.boolean_mask(
                passage_repr, tf.equal(features['partitions'], i))
            passage_i = tf.split(passage_mask, features['partitions_len'][:, i])
            passage_i = [
                pad_to_shape_2d(
                    p, (tf.Dimension(params.passage_max_len), p.shape[1]))
                for p in passage_i
            ]
            passage_i = tf.stack(passage_i)

            passage_alignment, _ = ReusableBahdanauAttention(
                params.units,
                passage_i,
                features['partitions_len'][:, i],
                memory_layer=memory_layer,
                query_layer=query_layer,
                name="passage_align")(question_pool, None)

            passage_pool = tf.reduce_sum(
                tf.expand_dims(passage_alignment, -1) * passage_i, 1)
            g_i = v_g(W_g(tf.concat([question_pool, passage_pool], -1)))

            # g_i = tf.Print(g_i, [passage_mask, passage_i], message='is_nan_{}'.format(i), summarize=1000)
            g.append(g_i)

        # One ranking logit per passage, concatenated along the last axis.
        g = tf.concat(g, -1)

    answer_start, answer_end, passage_rank = labels

    loss1 = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits1, labels=tf.stop_gradient(answer_start))
    loss2 = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits2, labels=tf.stop_gradient(answer_end))

    loss3 = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=g, labels=tf.stop_gradient(passage_rank))

    # loss1 = tf.Print(loss1, [tf.argmax(answer_start, -1), tf.argmax(answer_end, -1),
    #                          tf.reduce_mean(loss1), tf.reduce_mean(loss2), tf.reduce_mean(loss3)], message="loss")

    # params.r weighs the span loss against the ranking loss; at r >= 1 the
    # ranking loss is left out entirely.
    loss = (params.r * tf.reduce_mean(loss1 + loss2) + (1 - params.r) * tf.reduce_mean(loss3)) \
        if params.r < 1 else tf.reduce_mean(loss1 + loss2)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdadeltaOptimizer(
            learning_rate=params.learning_rate, epsilon=1e-6)
        global_step = tf.train.get_or_create_global_step()

        # Clip by global norm before applying the update.
        grads = optimizer.compute_gradients(loss)
        gradients, variables = zip(*grads)
        capped_grads, _ = tf.clip_by_global_norm(gradients, params.grad_clip)
        train_op = optimizer.apply_gradients(zip(capped_grads, variables),
                                             global_step=global_step)

        # The Scaffold feeds the numpy embedding matrices into the
        # placeholders that initialise the embedding variables.
        return EstimatorSpec(
            mode,
            loss=loss,
            train_op=train_op,
            scaffold=tf.train.Scaffold(
                init_feed_dict={
                    word_embeddings_placeholder: word_embeddings_np,
                    char_embeddings_placeholder: char_embeddings_np
                }),
        )

    if mode == tf.estimator.ModeKeys.EVAL:
        # Table built from the word vocabulary file and handed to the
        # metric helper.
        table = lookup_ops.index_to_string_table_from_file(
            params.word_vocab_file, value_column_index=0, delimiter=" ")
        return EstimatorSpec(mode,
                             loss=loss,
                             eval_metric_ops={
                                 'rouge-l':
                                 extraction_metric(p1, p2,
                                                   tf.argmax(answer_start, -1),
                                                   tf.argmax(answer_end, -1),
                                                   features['passage_words'],
                                                   params, table),
                                 'f1':
                                 extraction_metric(p1,
                                                   p2,
                                                   tf.argmax(answer_start, -1),
                                                   tf.argmax(answer_end, -1),
                                                   features['passage_words'],
                                                   params,
                                                   table,
                                                   metric='f1')
                             })
def single_cell():
    """Return a new GRU cell with `rnnHiddenSize` units."""
    cell = GRUCell(rnnHiddenSize)
    return cell
def __init__(self,
             num_emb,
             batch_size,
             emb_dim,
             hidden_dim,
             sequence_length,
             l2_reg_lambda=0):
    """Build the rewarder graph: embedding -> GRU -> multi-width CNN -> score.

    Args:
        num_emb: vocabulary size (rows of the embedding matrix).
        batch_size: stored on the instance; not used while building.
        emb_dim: embedding width.
        hidden_dim: GRU units, also the width seen by the convolutions.
        sequence_length: fixed token length of `input_x`.
        l2_reg_lambda: weight of the L2 penalty on the output layer.
    """
    # Convolution bank: one entry per filter width.
    self.filter_sizes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20]
    self.num_filters = [
        100, 200, 200, 200, 200, 100, 100, 100, 100, 100, 160, 160
    ]
    self.vocab_size = num_emb
    self.batch_size = batch_size
    self.embedding_size = emb_dim
    self.hidden_dim = hidden_dim
    self.sequence_length = sequence_length
    self.r_params = []
    self.grad_clip = 5.0

    # Graph inputs.
    self.input_x = tf.placeholder(tf.int32, [None, self.sequence_length],
                                  name="input_x")
    self.input_y = tf.placeholder(tf.float32, [None, ], name="input_y")
    self.dis_learning_rate = tf.placeholder(tf.float32, name="lr")
    self.dropout_keep_prob = tf.placeholder(tf.float32, name="drop_rate")

    self.l2_loss = tf.constant(0.0)

    with tf.variable_scope('rewarder'):
        # Token embedding, kept on the CPU.
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            init_embedding = tf.random_uniform(
                [self.vocab_size, self.embedding_size], -1.0, 1.0)
            self.W = tf.Variable(init_embedding, name="W")
            # (batch_size, sequence_length, embedding_size)
            self.embedded_chars = tf.nn.embedding_lookup(
                self.W, self.input_x)

        # Contextualise the embeddings with a GRU.
        cell_enc = GRUCell(self.hidden_dim)
        # batch_size, sequence_length, hidden_dim
        encoder_output, _ = tf.nn.dynamic_rnn(cell_enc,
                                              self.embedded_chars,
                                              dtype=tf.float32)
        # Trailing channel axis so conv2d can consume the sequence.
        self.embedded_chars_expanded = tf.expand_dims(encoder_output, -1)

        # One conv + max-pool-over-time branch per filter width.
        pooled_outputs = []
        for width, channels in zip(self.filter_sizes, self.num_filters):
            with tf.name_scope("conv-maxpool-%s" % width):
                kernel_shape = [width, self.hidden_dim, 1, channels]
                conv_kernel = tf.Variable(
                    tf.truncated_normal(kernel_shape, stddev=0.1), name="W")
                conv_bias = tf.Variable(
                    tf.constant(0.1, shape=[channels]), name="b")
                conv = tf.nn.conv2d(self.embedded_chars_expanded,
                                    conv_kernel,
                                    strides=[1, 1, 1, 1],
                                    padding="VALID",
                                    name="conv")
                activated = tf.nn.relu(tf.nn.bias_add(conv, conv_bias),
                                       name="relu")
                # The pooling window spans every valid conv position.
                window = [1, self.sequence_length - width + 1, 1, 1]
                pooled_outputs.append(
                    tf.nn.max_pool(activated,
                                   ksize=window,
                                   strides=[1, 1, 1, 1],
                                   padding='VALID',
                                   name="pool"))

        # Flatten all branches into one feature vector per example.
        num_filters_total = sum(self.num_filters)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        with tf.name_scope("highway"):
            flat_width = self.h_pool_flat.get_shape()[1]
            self.h_highway = highway(self.h_pool_flat, flat_width, 1, 0)

        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_highway,
                                        self.dropout_keep_prob)

        # Sigmoid score per example; only this layer is L2-regularised.
        with tf.name_scope("output"):
            out_weights = tf.Variable(
                tf.truncated_normal([num_filters_total, 1], stddev=0.1),
                name="W")
            out_bias = tf.Variable(tf.constant(0.1, shape=[1]), name="b")
            self.l2_loss = self.l2_loss + tf.nn.l2_loss(out_weights)
            self.l2_loss = self.l2_loss + tf.nn.l2_loss(out_bias)
            # batch_size
            self.scores = tf.nn.sigmoid(
                tf.nn.xw_plus_b(self.h_drop,
                                out_weights,
                                out_bias,
                                name="scores"))

        # Least-squares loss against the supplied labels.
        with tf.name_scope("loss"):
            self.labels = tf.reshape(self.input_y, [-1, 1])
            residual = self.scores - self.labels
            losses = tf.reduce_sum(residual * residual, 1)
            self.loss = tf.reduce_mean(
                losses) + l2_reg_lambda * self.l2_loss

    # Train only the variables created under the 'rewarder' scope.
    self.params = []
    for candidate in tf.trainable_variables():
        if 'rewarder' in candidate.name:
            self.params.append(candidate)

    d_optimizer = tf.train.AdamOptimizer(self.dis_learning_rate)
    grads_and_vars = d_optimizer.compute_gradients(self.loss,
                                                   self.params,
                                                   aggregation_method=2)
    self.train_op = d_optimizer.apply_gradients(grads_and_vars)
def __init__(self,
             num_symbols,
             num_embed_units,
             num_units,
             is_train,
             vocab=None,
             content_pos=None,
             rhetoric_pos=None,
             embed=None,
             learning_rate=0.1,
             learning_rate_decay_factor=0.9995,
             max_gradient_norm=5.0,
             max_length=30,
             latent_size=128,
             use_lstm=False,
             num_classes=3,
             full_kl_step=80000,
             mem_slot_num=4,
             mem_size=128):
    """Build a conditional-VAE sentence rewriting graph.

    A bidirectional encoder reads the original sentence (and, with shared
    weights, the rewritten one); recognition and prior networks produce a
    Gaussian latent; a classifier predicts the class from the latent; an
    attention decoder generates the rewritten sentence.

    Args:
        num_symbols: vocabulary size.
        num_embed_units: embedding width.
        num_units: units per encoder direction; the decoder uses twice that.
        is_train: build the training graph (loss and update op) when true,
            otherwise the inference graph (`self.generation`).
        vocab: vocabulary strings stored in `self.symbols` when training.
        content_pos: vocabulary indices summed into `self.content_mask`.
        rhetoric_pos: vocabulary indices summed into `self.rhetoric_mask`.
        embed: optional initial value for the embedding table.
        learning_rate: initial learning rate of the momentum optimizer.
        learning_rate_decay_factor: factor applied by
            `self.learning_rate_decay_op`.
        max_gradient_norm: global-norm clipping threshold.
        max_length: forwarded to the inference decoder function.
        latent_size: size of the Gaussian latent.
        use_lstm: use LSTM cells instead of GRU cells.
        num_classes: number of classes for labels and the classifier.
        full_kl_step: value of `global_t` at which the KL weight reaches 1.
        mem_slot_num: number of topic-memory slots.
        mem_size: width of each topic-memory slot.

    Fix relative to the previous version: the initial topic memory was
    created with `tf.zeros(shape=[None, ...])`, which cannot be built
    because `tf.zeros` needs a concrete or tensor-valued shape. The batch
    dimension is now taken dynamically from `self.ori_sents`.
    """
    # Graph inputs. `shape=(None)` is plain None, i.e. unspecified shape.
    self.ori_sents = tf.placeholder(tf.string, shape=(None, None))
    self.ori_sents_length = tf.placeholder(tf.int32, shape=(None))
    self.rep_sents = tf.placeholder(tf.string, shape=(None, None))
    self.rep_sents_length = tf.placeholder(tf.int32, shape=(None))
    self.labels = tf.placeholder(tf.float32, shape=(None, num_classes))
    self.use_prior = tf.placeholder(tf.bool)
    self.global_t = tf.placeholder(tf.int32)

    # Vocabulary-sized indicator vectors for the two word groups.
    self.content_mask = tf.reduce_sum(
        tf.one_hot(content_pos, num_symbols, 1.0, 0.0), axis=0)
    self.rhetoric_mask = tf.reduce_sum(
        tf.one_hot(rhetoric_pos, num_symbols, 1.0, 0.0), axis=0)

    # Zero-initialised topic memory, one set of slots per batch row.
    topic_memory = tf.zeros(
        [tf.shape(self.ori_sents)[0], mem_slot_num, mem_size],
        dtype=tf.float32,
        name="topic_memory")
    # NOTE(review): this variable is created but never read in this
    # method; it is kept so the set of graph variables is unchanged.
    # Presumably `update_memory` is meant to use it; confirm.
    w_topic_memory = tf.get_variable(
        name="w_topic_memory",
        dtype=tf.float32,
        initializer=tf.random_uniform([mem_size, mem_size], -0.1, 0.1))

    # build the vocab table (string to index)
    if is_train:
        self.symbols = tf.Variable(vocab, trainable=False, name="symbols")
    else:
        # Placeholder strings of the right count when no vocab is given.
        self.symbols = tf.Variable(np.array(['.'] * num_symbols),
                                   name="symbols")
    self.symbol2index = HashTable(KeyValueTensorInitializer(
        self.symbols,
        tf.Variable(np.arange(num_symbols, dtype=np.int32), False)),
                                  default_value=UNK_ID,
                                  name="symbol2index")

    self.ori_sents_input = self.symbol2index.lookup(self.ori_sents)
    self.rep_sents_target = self.symbol2index.lookup(self.rep_sents)

    batch_size = tf.shape(self.rep_sents)[0]
    decoder_len = tf.shape(self.rep_sents)[1]
    # Decoder input = GO token followed by the target minus its last step.
    self.rep_sents_input = tf.concat([
        tf.ones([batch_size, 1], dtype=tf.int32) * GO_ID,
        tf.split(self.rep_sents_target, [decoder_len - 1, 1], 1)[0]
    ], 1)
    # 1.0 for every position before each sequence's length, 0.0 after.
    self.decoder_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.rep_sents_length - 1, decoder_len),
                  reverse=True,
                  axis=1), [-1, decoder_len])

    # build the embedding table (index to vector)
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable('embed',
                                     [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        # initialize the embedding by pre-trained word vectors
        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)
    self.pattern_embed = tf.get_variable('pattern_embed',
                                         [num_classes, num_embed_units],
                                         tf.float32)

    self.encoder_input = tf.nn.embedding_lookup(self.embed,
                                                self.ori_sents_input)
    self.decoder_input = tf.nn.embedding_lookup(self.embed,
                                                self.rep_sents_input)

    if use_lstm:
        cell_fw = LSTMCell(num_units)
        cell_bw = LSTMCell(num_units)
        cell_dec = LSTMCell(2 * num_units)
    else:
        cell_fw = GRUCell(num_units)
        cell_bw = GRUCell(num_units)
        cell_dec = GRUCell(2 * num_units)

    # origin sentence encoder
    with variable_scope.variable_scope("encoder"):
        encoder_output, encoder_state = tf.nn.bidirectional_dynamic_rnn(
            cell_fw,
            cell_bw,
            self.encoder_input,
            self.ori_sents_length,
            dtype=tf.float32)
    post_sum_state = tf.concat(encoder_state, 1)
    encoder_output = tf.concat(encoder_output, 2)

    # response sentence encoder (same weights as the encoder above)
    with variable_scope.variable_scope("encoder", reuse=True):
        _, decoder_last_state = tf.nn.bidirectional_dynamic_rnn(
            cell_fw,
            cell_bw,
            self.decoder_input,
            self.rep_sents_length,
            dtype=tf.float32)
    response_sum_state = tf.concat(decoder_last_state, 1)

    # recognition network: q(z | original, rewritten)
    with variable_scope.variable_scope("recog_net"):
        recog_input = tf.concat([post_sum_state, response_sum_state], 1)
        recog_mulogvar = tf.contrib.layers.fully_connected(
            recog_input, latent_size * 2, activation_fn=None, scope="muvar")
        recog_mu, recog_logvar = tf.split(recog_mulogvar, 2, axis=1)

    # prior network: p(z | original)
    with variable_scope.variable_scope("prior_net"):
        prior_fc1 = tf.contrib.layers.fully_connected(
            post_sum_state,
            latent_size * 2,
            activation_fn=tf.tanh,
            scope="fc1")
        prior_mulogvar = tf.contrib.layers.fully_connected(
            prior_fc1, latent_size * 2, activation_fn=None, scope="muvar")
        prior_mu, prior_logvar = tf.split(prior_mulogvar, 2, axis=1)

    # `use_prior` switches between sampling from the prior and from the
    # recognition network.
    latent_sample = tf.cond(
        self.use_prior, lambda: sample_gaussian(prior_mu, prior_logvar),
        lambda: sample_gaussian(recog_mu, recog_logvar))

    # classifier
    with variable_scope.variable_scope("classifier"):
        classifier_input = latent_sample
        pattern_fc1 = tf.contrib.layers.fully_connected(
            classifier_input,
            latent_size,
            activation_fn=tf.tanh,
            scope="pattern_fc1")
        self.pattern_logits = tf.contrib.layers.fully_connected(
            pattern_fc1,
            num_classes,
            activation_fn=None,
            scope="pattern_logits")

    self.label_embedding = tf.matmul(self.labels, self.pattern_embed)

    output_fn, my_sequence_loss = output_projection_layer(
        2 * num_units, num_symbols, latent_size, num_embed_units,
        self.content_mask, self.rhetoric_mask)

    (attention_keys, attention_values, attention_score_fn,
     attention_construct_fn) = my_attention_decoder_fn.prepare_attention(
         encoder_output, 'luong', 2 * num_units)

    # Initial decoder state computed from encoder summary, label and latent.
    with variable_scope.variable_scope("dec_start"):
        temp_start = tf.concat(
            [post_sum_state, self.label_embedding, latent_sample], 1)
        dec_fc1 = tf.contrib.layers.fully_connected(temp_start,
                                                    2 * num_units,
                                                    activation_fn=tf.tanh,
                                                    scope="dec_start_fc1")
        dec_fc2 = tf.contrib.layers.fully_connected(dec_fc1,
                                                    2 * num_units,
                                                    activation_fn=None,
                                                    scope="dec_start_fc2")

    # Both modes condition the decoder on label, latent and topic memory.
    # NOTE(review): the concat along axis 1 needs `update_memory` (defined
    # elsewhere on this class) to return a rank-2 tensor; confirm.
    topic_memory = self.update_memory(topic_memory, encoder_output)
    extra_info = tf.concat(
        [self.label_embedding, latent_sample, topic_memory], 1)

    if is_train:
        # rnn decoder
        decoder_fn_train = my_attention_decoder_fn.attention_decoder_fn_train(
            dec_fc2, attention_keys, attention_values, attention_score_fn,
            attention_construct_fn, extra_info)
        self.decoder_output, _, _ = my_seq2seq.dynamic_rnn_decoder(
            cell_dec,
            decoder_fn_train,
            self.decoder_input,
            self.rep_sents_length,
            scope="decoder")

        # calculate the loss
        self.decoder_loss = my_loss.sequence_loss(
            logits=self.decoder_output,
            targets=self.rep_sents_target,
            weights=self.decoder_mask,
            extra_information=latent_sample,
            label_embedding=self.label_embedding,
            softmax_loss_function=my_sequence_loss)
        temp_klloss = tf.reduce_mean(
            gaussian_kld(recog_mu, recog_logvar, prior_mu, prior_logvar))
        # KL weight grows linearly with global_t and is capped at 1.
        self.kl_weight = tf.minimum(
            tf.to_float(self.global_t) / full_kl_step, 1.0)
        self.klloss = self.kl_weight * temp_klloss
        temp_labels = tf.argmax(self.labels, 1)
        self.classifierloss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.pattern_logits, labels=temp_labels))
        self.loss = self.decoder_loss + self.klloss + self.classifierloss

        # building graph finished and get all parameters
        self.params = tf.trainable_variables()

        # initialize the training process
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # calculate the gradient of parameters
        opt = tf.train.MomentumOptimizer(self.learning_rate, 0.9)
        gradients = tf.gradients(self.loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients,
                                              self.params),
                                          global_step=self.global_step)
    else:
        # rnn decoder
        decoder_fn_inference = my_attention_decoder_fn.attention_decoder_fn_inference(
            output_fn, dec_fc2, attention_keys, attention_values,
            attention_score_fn, attention_construct_fn, self.embed, GO_ID,
            EOS_ID, max_length, num_symbols, extra_info)
        self.decoder_distribution, _, _ = my_seq2seq.dynamic_rnn_decoder(
            cell_dec, decoder_fn_inference, scope="decoder")
        # Drop the first two vocabulary columns before the argmax, then add
        # 2 to map back to vocabulary indices (for removing UNK).
        self.generation_index = tf.argmax(
            tf.split(self.decoder_distribution, [2, num_symbols - 2],
                     2)[1], 2) + 2
        self.generation = tf.nn.embedding_lookup(self.symbols,
                                                 self.generation_index)

        self.params = tf.trainable_variables()

    self.saver = tf.train.Saver(tf.global_variables(),
                                write_version=tf.train.SaverDef.V2,
                                max_to_keep=3,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
def __init__(self,
             num_symbols,
             num_embed_units,
             num_units,
             num_layers,
             vocab=None,
             embed=None,
             name_scope=None,
             learning_rate=0.001,
             learning_rate_decay_factor=0.95,
             max_gradient_norm=5,
             num_samples=512,
             max_length=30):
    """Build the attention seq2seq graph: placeholders, encoder, decoder, training ops.

    Args:
        num_symbols: Vocabulary size. Sizes the symbol->index table, the
            randomly initialised embedding matrix and the split applied to
            the decoder distribution at inference time.
        num_embed_units: Embedding width, used only when ``embed`` is None.
        num_units: Number of units of every GRU cell.
        num_layers: Number of stacked GRU cells in the encoder and in the
            decoder.
        vocab: Initial value of the non-trainable ``symbols`` variable (the
            keys of the symbol->index table). Despite the ``None`` default
            it is handed straight to ``tf.Variable``.
        embed: Optional initializer for the ``embed`` variable; when None
            the variable is created with shape
            ``[num_symbols, num_embed_units]``.
        name_scope: Substring a variable name must contain for the variable
            to be trained and saved by this model. ``None`` selects every
            variable.
        learning_rate: Initial value of the ``learning_rate`` variable.
        learning_rate_decay_factor: Multiplier applied each time
            ``learning_rate_decay_op`` is run.
        max_gradient_norm: Global-norm threshold for gradient clipping.
        num_samples: Forwarded to ``output_projection_layer``
            (presumably the sampled-softmax sample count -- TODO confirm).
        max_length: Forwarded to ``attention_decoder_fn_inference``
            (presumably the maximum decoding length -- TODO confirm).
    """
    # NOTE(review): the original indentation of this method was lost; the
    # extent of the two ``with variable_scope('decoder')`` blocks below was
    # reconstructed from the statements -- confirm against the repository.

    # Input placeholders.
    self.posts = tf.placeholder(tf.string, shape=[None, None])  # batch * len
    self.posts_length = tf.placeholder(tf.int32, shape=[None])  # batch
    self.responses = tf.placeholder(tf.string, shape=[None, None])  # batch * len
    self.responses_length = tf.placeholder(tf.int32, shape=[None])  # batch
    self.weight = tf.placeholder(tf.float32, shape=[None])  # batch

    # Vocabulary table (string -> index); unknown strings map to UNK_ID.
    self.symbols = tf.Variable(vocab, trainable=False, name="symbols")
    symbol_ids = tf.Variable(np.arange(num_symbols, dtype=np.int32),
                             trainable=False)
    self.symbol2index = HashTable(
        KeyValueTensorInitializer(self.symbols, symbol_ids),
        default_value=UNK_ID,
        name="symbol2index")

    # Embedding table (index -> vector).
    if embed is None:
        # Initialise the embedding randomly.
        self.embed = tf.get_variable(
            'embed', [num_symbols, num_embed_units], tf.float32)
    else:
        # Initialise the embedding from pre-trained word vectors.
        self.embed = tf.get_variable(
            'embed', dtype=tf.float32, initializer=embed)

    self.posts_input = self.symbol2index.lookup(
        self.posts)  # batch * utter_len
    self.encoder_input = tf.nn.embedding_lookup(
        self.embed, self.posts_input)  # batch * utter_len * embed_unit

    self.responses_target = self.symbol2index.lookup(
        self.responses)  # batch, len
    batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape(
        self.responses)[1]
    # Decoder input is the target shifted right by one step: a GO_ID column
    # is prepended and the last target column is dropped.
    go_column = tf.ones([batch_size, 1], dtype=tf.int32) * GO_ID
    target_without_last = tf.split(
        self.responses_target, [decoder_len - 1, 1], 1)[0]
    self.responses_input = tf.concat(
        [go_column, target_without_last], 1)  # batch, len
    # A one-hot at index (length - 1) followed by a reversed cumulative sum
    # yields 1 for every position up to that index and 0 after it.
    self.decoder_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.responses_length - 1, decoder_len),
                  reverse=True,
                  axis=1),
        [-1, decoder_len])  # batch, len
    self.decoder_input = tf.nn.embedding_lookup(self.embed,
                                                self.responses_input)

    # Multi-layer GRU cells for encoder and decoder.
    cell_enc = MultiRNNCell(
        [GRUCell(num_units) for _ in range(num_layers)])
    cell_dec = MultiRNNCell(
        [GRUCell(num_units) for _ in range(num_layers)])

    # Encode the post sequence.
    encoder_output, encoder_state = tf.nn.dynamic_rnn(cell_enc,
                                                      self.encoder_input,
                                                      self.posts_length,
                                                      dtype=tf.float32,
                                                      scope="encoder")

    output_fn, sampled_sequence_loss = output_projection_layer(
        num_units, num_symbols, num_samples)
    (attention_keys, attention_values, attention_score_fn,
     attention_construct_fn) = my_attention_decoder_fn.prepare_attention(
         encoder_output, 'bahdanau', num_units)

    # Decode the response sequence (training).
    with variable_scope.variable_scope('decoder'):
        decoder_fn_train = my_attention_decoder_fn.attention_decoder_fn_train(
            encoder_state, attention_keys, attention_values,
            attention_score_fn, attention_construct_fn)
        self.decoder_output, _, _ = my_seq2seq.dynamic_rnn_decoder(
            cell_dec,
            decoder_fn_train,
            self.decoder_input,
            self.responses_length,
            scope='decoder_rnn')
        self.decoder_loss = my_loss.sequence_loss(
            self.decoder_output,
            self.responses_target,
            self.decoder_mask,
            softmax_loss_function=sampled_sequence_loss)
        self.weighted_decoder_loss = self.decoder_loss * self.weight

    # Second attention set, requested with reuse=True, for inference.
    (attention_keys_infer, attention_values_infer,
     attention_score_fn_infer, attention_construct_fn_infer) = \
        my_attention_decoder_fn.prepare_attention(
            encoder_output, 'bahdanau', num_units, reuse=True)

    # Decode the response sequence (inference); the scope is re-entered with
    # reuse=True and the same 'decoder_rnn' sub-scope as the training decoder.
    with variable_scope.variable_scope('decoder', reuse=True):
        decoder_fn_inference = my_attention_decoder_fn.attention_decoder_fn_inference(
            output_fn, encoder_state, attention_keys_infer,
            attention_values_infer, attention_score_fn_infer,
            attention_construct_fn_infer, self.embed, GO_ID, EOS_ID,
            max_length, num_symbols)
        self.decoder_distribution, _, _ = my_seq2seq.dynamic_rnn_decoder(
            cell_dec, decoder_fn_inference, scope='decoder_rnn')
        # Drop the first two symbol ids before the argmax, then add 2 back
        # so the result indexes the full vocabulary (for removing UNK).
        self.generation_index = tf.argmax(
            tf.split(self.decoder_distribution, [2, num_symbols - 2],
                     2)[1], 2) + 2
        self.generation = tf.nn.embedding_lookup(self.symbols,
                                                 self.generation_index)

    def _in_scope(variable):
        # ``None`` means "no filter": ``None in variable.name`` would raise
        # TypeError, which made the default argument value unusable.
        return name_scope is None or name_scope in variable.name

    self.params = [v for v in tf.trainable_variables() if _in_scope(v)]

    # Initialise the training process.
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)
    self.adv_global_step = tf.Variable(0, trainable=False)

    # Gradients of the weighted cost, clipped by global norm.
    self.cost = tf.reduce_mean(self.weighted_decoder_loss)
    self.unweighted_cost = tf.reduce_mean(self.decoder_loss)
    opt = tf.train.AdamOptimizer(self.learning_rate)
    gradients = tf.gradients(self.cost, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    # Two savers over the same variable set, configured identically.
    all_variables = [v for v in tf.global_variables() if _in_scope(v)]
    self.saver = tf.train.Saver(all_variables,
                                write_version=tf.train.SaverDef.V2,
                                max_to_keep=5,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
    self.adv_saver = tf.train.Saver(all_variables,
                                    write_version=tf.train.SaverDef.V2,
                                    max_to_keep=5,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)
def cell_fn():
    """Return a new GRUCell built from the surrounding scope's RNN_nodes value."""
    cell = GRUCell(RNN_nodes)
    return cell