def recurrent_neural_network(train_input):
    """Build an LSTM classifier that reads each sequence's last valid step.

    Depends on module-level names defined outside this function:
    ``n_hidden``, ``n_classes``, ``seq_max_len``, ``rnn_size`` and ``seqlen``.

    Args:
        train_input: Input tensor; it is unstacked along axis 1 into
            ``seq_max_len`` per-step tensors.

    Returns:
        The result of ``selected_outputs x weights + biases``, one row per
        sequence in the batch.
    """
    projection = {
        'weights': tf.Variable(tf.random_normal([n_hidden, n_classes])),
        'biases': tf.Variable(tf.random_normal([n_classes])),
    }

    # static_rnn consumes a Python list with one tensor per time step.
    step_inputs = tf.unstack(train_input, seq_max_len, 1)
    print(step_inputs)

    cell = core_rnn_cell.BasicLSTMCell(rnn_size)
    step_outputs, _final_state = rnn.static_rnn(
        cell, step_inputs, dtype=tf.float32, sequence_length=seqlen)

    # Stack the per-step outputs, then swap the first two axes.
    batch_major = tf.transpose(tf.stack(step_outputs), [1, 0, 2])

    # Hack to build the indexing and retrieve the right output: after
    # flattening, row (i * seq_max_len + seqlen[i] - 1) is the output of
    # sample i at its last valid step.
    num_samples = tf.shape(batch_major)[0]
    row_starts = tf.range(0, num_samples) * seq_max_len
    last_rows = row_starts + (seqlen - 1)
    flat_outputs = tf.reshape(batch_major, [-1, n_hidden])
    last_outputs = tf.gather(flat_outputs, last_rows)

    return tf.matmul(last_outputs, projection['weights']) + projection['biases']
def recurrent_neural_network(train_input):
    """Build a fixed-length LSTM classifier and return the final projection.

    Depends on module-level names defined outside this function:
    ``rnn_size``, ``n_classes``, ``chunk_size`` and ``n_chunks``.

    Args:
        train_input: 3-D input tensor; its first two axes are swapped, it is
            flattened to rows of width ``chunk_size`` and then split into
            ``n_chunks`` pieces along axis 0.

    Returns:
        ``last_step_output x weights + biases``.
    """
    projection = {
        'weights': tf.Variable(tf.random_normal([rnn_size, n_classes])),
        'biases': tf.Variable(tf.random_normal([n_classes])),
    }

    # Rearrange the batch into the list-of-steps layout static_rnn expects.
    swapped = tf.transpose(train_input, [1, 0, 2])
    flattened = tf.reshape(swapped, [-1, chunk_size])
    step_inputs = tf.split(flattened, n_chunks, 0)

    cell = core_rnn_cell.BasicLSTMCell(rnn_size, state_is_tuple=True)
    step_outputs, _final_state = rnn.static_rnn(
        cell, step_inputs, dtype=tf.float32)

    # Only the output of the last step feeds the linear layer.
    final_output = step_outputs[-1]
    return tf.matmul(final_output, projection['weights']) + projection['biases']
def seq_predict_model(X, w, b, time_step_size, vector_size, num_units=10):
    """Run an LSTM over a sequence batch and project its last output.

    Args:
        X: Input tensor. The transpose/reshape/split below treat it as
            [batch_size, time_step_size, vector_size].
        w: Weight matrix multiplied with the last LSTM output; its first
            dimension therefore has to equal ``num_units``.
        b: Bias added to the projected output.
        time_step_size: Number of pieces X is split into (one per time step).
        vector_size: Width of each row after X is flattened.
        num_units: Number of units in the LSTM cell. Defaults to 10, the
            value that used to be hard-coded here.

    Returns:
        A tuple ``(tf.matmul(outputs[-1], w) + b, cell.state_size)``.
    """
    # Swap the first two axes:
    # X becomes [time_step_size, batch_size, vector_size].
    X = tf.transpose(X, [1, 0, 2])
    # Flatten; -1 lets TensorFlow infer that dimension:
    # X becomes [time_step_size * batch_size, vector_size].
    X = tf.reshape(X, [-1, vector_size])
    # Split along axis 0 into time_step_size pieces,
    # each of shape [batch_size, vector_size].
    X = tf.split(X, time_step_size, 0)

    cell = core_rnn_cell.BasicLSTMCell(
        num_units=num_units, forget_bias=1.0, state_is_tuple=True)
    outputs, _states = core_rnn.static_rnn(cell, X, dtype=tf.float32)

    # Linear projection of the last time step's output.
    return tf.matmul(outputs[-1], w) + b, cell.state_size
def seq_predict_model(X, w, b, time_step_size, vector_size):
    """Feed a sequence batch through a 10-unit LSTM and project the last step.

    Args:
        X: Input tensor, treated as [batch_size, time_step_size, vector_size].
        w: Weight matrix applied to the last LSTM output.
        b: Bias added after the matrix product.
        time_step_size: Number of time steps X is split into.
        vector_size: Width of each row after flattening.

    Returns:
        A tuple of (linear projection of the last output, ``cell.state_size``).
    """
    # [batch, time, vec] -> [time, batch, vec] -> [time * batch, vec]
    time_major = tf.transpose(X, [1, 0, 2])
    flat_rows = tf.reshape(time_major, [-1, vector_size])
    # One [batch, vec] tensor per time step.
    step_inputs = tf.split(flat_rows, time_step_size, 0)

    # LSTM with 10 units.
    lstm = core_rnn_cell.BasicLSTMCell(
        num_units=10, forget_bias=1.0, state_is_tuple=True)
    step_outputs, _ = core_rnn.static_rnn(lstm, step_inputs, dtype=tf.float32)

    # Linear activation on the final step only.
    prediction = tf.matmul(step_outputs[-1], w) + b
    return prediction, lstm.state_size
def __init__(self, is_training, config, input_):
    """Build the multi-layer LSTM language-model graph.

    Args:
        is_training: When true, dropout is applied (if
            ``config.keep_prob < 1``) and the training ops are created.
        config: Object providing ``hidden_size``, ``vocab_size``,
            ``keep_prob``, ``num_layers`` and ``max_grad_norm``.
        input_: Object providing ``batch_size``, ``num_steps``,
            ``input_data`` and ``targets``.

    Sets ``_input``, ``_initial_state``, ``_cost`` and ``_final_state``;
    when ``is_training`` is true it also sets ``_lr``, ``_train_op``,
    ``_new_lr`` and ``_lr_update``.
    """
    self._input = input_

    batch_size = input_.batch_size
    num_steps = input_.num_steps
    size = config.hidden_size
    vocab_size = config.vocab_size

    def make_cell():
        """Return a fresh LSTM cell, dropout-wrapped when training."""
        # Slightly better results can be obtained with forget gate biases
        # initialized to 1 but the hyperparameters of the model would need
        # to be different than reported in the paper.
        layer_cell = core_rnn_cell.BasicLSTMCell(
            num_units=size, state_is_tuple=True)
        if is_training and config.keep_prob < 1:
            layer_cell = tf.contrib.rnn.DropoutWrapper(
                layer_cell, output_keep_prob=config.keep_prob)
        return layer_cell

    # Every layer needs its own cell object: `[cell] * n` puts the same
    # instance in every layer, which newer TensorFlow 1.x releases reject or
    # treat as weight sharing between layers.
    cell = core_rnn_cell.MultiRNNCell(
        [make_cell() for _ in range(config.num_layers)],
        state_is_tuple=True)

    self._initial_state = cell.zero_state(batch_size, data_type())

    with tf.device("/cpu:0"):
        embedding = tf.get_variable(
            "embedding", [vocab_size, size], dtype=data_type())
        inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

    if is_training and config.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # Unroll the network manually, reusing the variables after step 0.
    outputs = []
    state = self._initial_state
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            (cell_output, state) = cell(inputs[:, time_step, :], state)
            outputs.append(cell_output)

    # Concatenate the step outputs along axis 1 and flatten to rows of
    # width `size` for the softmax layer.
    output = tf.reshape(tf.concat(outputs, 1), [-1, size])
    softmax_w = tf.get_variable(
        "softmax_w", [size, vocab_size], dtype=data_type())
    softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type())
    logits = tf.matmul(output, softmax_w) + softmax_b
    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
        [logits],
        [tf.reshape(input_.targets, [-1])],
        [tf.ones([batch_size * num_steps], dtype=data_type())])
    # Cost is the summed loss divided by the batch size.
    self._cost = cost = tf.reduce_sum(loss) / batch_size
    self._final_state = state

    if not is_training:
        return

    # The learning rate is a non-trainable variable updated via _lr_update.
    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)
    optimizer = tf.train.GradientDescentOptimizer(self._lr)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.contrib.framework.get_or_create_global_step())

    self._new_lr = tf.placeholder(
        tf.float32, shape=[], name="new_learning_rate")
    self._lr_update = tf.assign(self._lr, self._new_lr)
def __init__(self, pre_trained_seq2seq, pre_trained_backward, vocab_size, buckets, layer_size, num_layers, max_gradient_norm, batch_size, learning_rate, learning_rate_decay_factor, use_lstm=False, num_samples=512, forward_only = False, dtype= tf.float32):
    """Create the mutual-information reinforcement-learning model.

    Similar to the seq2seq_model_rl.py code but it differs in the loss
    function.

    Args:
        pre_trained_seq2seq: Model object whose ``outputs`` (indexed by
            bucket id) are read inside ``mi_score``.
        pre_trained_backward: Model object whose ``outputs`` (indexed by
            bucket id) are read inside ``mi_score``.
        vocab_size: Size of the vocabulary.
        buckets: A list of pairs (I, O), where I specifies maximum input
            length that will be processed in that bucket, and O specifies
            maximum output length. Training instances that have inputs
            longer than I or outputs longer than O will be pushed to the
            next bucket and padded accordingly. We assume that the list is
            sorted. (We may not use bucketing for dialogue.)
        layer_size: The number of units in each layer.
        num_layers: The number of layers in the model.
        max_gradient_norm: Gradients are clipped by global norm to this
            value.
        batch_size: Stored on the instance and used as a static dimension
            in the reshapes below.
        learning_rate: Learning rate to start with.
        learning_rate_decay_factor: Factor the learning rate is multiplied
            by when ``learning_rate_decay_op`` runs.
        use_lstm: True -> LSTM cells, False -> GRU cells.
        num_samples: Not used in this constructor.
        forward_only: If set, the reward and update ops are not built.
        dtype: The data type to use to store internal variables.
    """
    self.vocab_size = vocab_size
    self.buckets = buckets
    # Backward buckets use the forward output length for both entries.
    self.buckets_back = [(x[1],x[1]) for x in buckets]
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype = dtype)
    self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate*learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)
    self.pre_trained_seq2seq = pre_trained_seq2seq
    self.pre_trained_backward = pre_trained_backward
    #self.bucket_id = tf.placeholder(tf.int32, shape=(2,), name="bucket_id") # [bucket_id, 0]
    # The bucket is fixed to the first one.
    self.bucket_id = 0

    # Variables
    w_t = tf.get_variable("proj_w",[self.vocab_size, layer_size], dtype = dtype)
    w = tf.transpose(w_t)
    b = tf.get_variable("proj_b", [self.vocab_size], dtype=dtype)
    output_projection = (w,b)

    if use_lstm:
        single_cell = core_rnn_cell.BasicLSTMCell(layer_size)
    else:
        single_cell = core_rnn_cell.GRUCell(layer_size)
    if num_layers > 1:
        # NOTE(review): the same cell object is repeated for every layer;
        # newer TF 1.x releases reject or weight-share this - confirm the
        # targeted TensorFlow version.
        cell = core_rnn_cell.MultiRNNCell([single_cell]*num_layers)
    else:
        cell = single_cell

    # The seq2seq function: embedding for the input, plus attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
            encoder_inputs, decoder_inputs, cell,
            num_encoder_symbols = vocab_size,
            num_decoder_symbols=vocab_size,
            embedding_size = layer_size,
            output_projection = output_projection,
            feed_previous = do_decode,
            dtype = dtype)

    # Feed placeholders, one per time step, sized by the last bucket.
    self.states, self.states_back, self.action_dums = [], [], [] # states_back : the 2nd half of the states (each)
    self.actions , self.actions_back = [], []
    self.weights, self.weights_back = [],[]
    for i in xrange(self.buckets[-1][0]):
        self.states.append(tf.placeholder(tf.int32, shape=[None], name ="state{0}".format(i)))
    for i in xrange(self.buckets[-1][1]):
        self.action_dums.append(tf.placeholder(tf.int32, shape=[None], name ="action_dum{0}".format(i)))
        self.actions.append(tf.placeholder(tf.int32, shape=[None], name ="action{0}".format(i)))
        self.weights.append(tf.placeholder(dtype, shape=[None], name ="weight_rl{0}".format(i)))
    for i in xrange(self.buckets_back[-1][0]):
        self.actions_back.append(tf.placeholder(tf.int32, shape=[None], name ="action_back{0}".format(i)))
    for i in xrange(self.buckets_back[-1][1]):
        self.states_back.append(tf.placeholder(tf.int32, shape=[None], name ="state_back{0}".format(i)))
    for i in xrange(self.buckets[-1][1]):
        self.weights_back.append(tf.placeholder(dtype, shape=[None], name="weight_rl_back{0}".format(i)))

    # 1. Get batch actions
    #>>self.actions, self.actions_back, self.weights, self.joint_logits = self.generate_batch_action(self.states, self.action_dums, self.bucket_id, lambda x,y:seq2seq_f(x,y,True), output_projection= output_projection)
    self.actions_sam, self.logprob = self.generate_batch_action(self.states, self.action_dums, self.bucket_id, lambda x,y:seq2seq_f(x,y,True), output_projection= output_projection)

    # 2. Get the loss
    def mi_score(states, actions, weights, states_back, actions_back, weights_back):
        """Return the sum of the weighted forward and backward scores.

        The forward term scores ``actions`` against the outputs of
        ``self.pre_trained_seq2seq``; the backward term scores
        ``states_back`` against the outputs of
        ``self.pre_trained_backward``. Each term is the weighted sum over
        time steps divided by the sum of the corresponding weights.

        Args:
            states, states_back, weights_back: placeholders (``states`` is
                not read in the active code).
            actions, actions_back, weights: per the original note these
                come from generate_batch_action (``actions_back`` is not
                read in the active code).
        """
        #self.feeding_data(self.pre_trained_seq2seq, self.buckets, states, actions, weights)
        #self.feeding_data(self.pre_trained_backward, self.buckets_back, actions_back, states_back, weights_back)
        #output_logits = tf.slice(tf.constant(output_logits, dtype=tf.float32), self.bucket_id, [1,-1])
        # if self.bucket_id < (len(self.buckets)-1):
        #     for i in xrange(self.buckets[-1][1]-self.buckets[self.bucket_id][1]):
        #         actions.append(tf.placeholder(tf.int32, shape=[None], name="action{0}".format(i+self.buckets[self.bucket_id][1])))
        #         weights.append(tf.placeholder(tf.int32, shape=[None], name="weight_rl{0}".format(i+self.buckets[self.bucket_id][1])))
        # with tf.variable_scope("forward", reuse=True) as scope:
        #     scope.reuse_variables()
        #     output_logits,_ = tf.contrib.legacy_seq2seq.model_with_buckets(states, actions, actions[0:],weights, self.buckets, lambda x,y: self.pre_trained_seq2seq.seq2seq_f(x,y,True),
        #                                                                    softmax_loss_function=self.pre_trained_seq2seq.softmax_loss_function)
        output_logits = self.pre_trained_seq2seq.outputs[self.bucket_id]
        #output_logprob = [-tf.log(tf.ones(shape = (self.batch_size, self.vocab_size), dtype=tf.float32) + tf.exp(-logit)) for logit in output_logits]
        log_prob = []
        # NOTE(review): dim=0 normalizes over the first axis of the outputs,
        # presumably the decoder-step axis; the loop below then
        # re-normalizes over axis 1 - confirm this is intended.
        logprob_s2s = tf.nn.log_softmax(output_logits,dim=0)
        for word_idx in xrange(self.buckets[self.bucket_id][1]):
            one_hot_mat = tf.one_hot(actions[word_idx],depth=self.vocab_size, on_value = 1.0, off_value=0.0, axis =1, dtype=tf.float32 )
            # Slice out step word_idx and drop the leading axis.
            tmp1 = tf.reshape(tf.slice(logprob_s2s, [word_idx,0,0],[1,-1,-1]), shape = (self.batch_size, self.vocab_size))
            # Value at the chosen word minus the log-sum-exp over axis 1.
            log_prob_word = tf.subtract(tf.reduce_sum(tf.multiply(tmp1 , one_hot_mat),1), tf.log(tf.reduce_sum(tf.exp(tmp1),1)))
            log_prob.append(tf.multiply(log_prob_word, weights[word_idx]))

        output_logits_back = self.pre_trained_backward.outputs[self.bucket_id]
        #output_logprob_back = [-tf.log(tf.ones(shape = (self.batch_size, self.vocab_size), dtype=tf.float32) + tf.exp(-logit)) for logit in output_logits_back]
        log_prob_back = []
        logprob_back = tf.nn.log_softmax(output_logits_back,dim=0)
        # Backward weights shifted right by one step, with an all-ones
        # vector in front.
        w_back_new = [np.ones(self.batch_size, dtype = np.float32)] + weights_back[:-1]
        for word_idx in xrange(self.buckets_back[self.bucket_id][1]):
            one_hot_mat = tf.one_hot(states_back[word_idx],depth=self.vocab_size, on_value = 1.0, off_value=0.0, axis =1, dtype=tf.float32 )
            tmp2 = tf.reshape(tf.slice(logprob_back, [word_idx,0,0],[1,-1,-1]), shape = (self.batch_size, self.vocab_size))
            log_prob_word = tf.subtract(tf.reduce_sum(tf.multiply(tmp2 , one_hot_mat),1), tf.log(tf.reduce_sum(tf.exp(tmp2),1)))
            log_prob_back.append(tf.multiply(log_prob_word, w_back_new[word_idx]))
        return tf.divide(tf.add_n(log_prob), tf.add_n(weights[:self.buckets[self.bucket_id][1]])) + tf.divide(tf.add_n(log_prob_back), tf.add_n(w_back_new[:self.buckets_back[self.bucket_id][1]])) #+ tf.constant(20.0, shape=(self.batch_size,), dtype = tf.float32)

    if not forward_only:
        self.neg_penalty = tf.placeholder(tf.float32, shape=[None], name="neg_penalty") #repeat_penalty(self.actions)
        # Reward = MI score plus 0.05 times the summed action weights.
        self.reward = mi_score(self.states, self.actions, self.weights, self.states_back, self.actions_back, self.weights_back) + tf.scalar_mul(tf.constant(0.05,shape=()), tf.add_n(self.weights[:self.buckets[self.bucket_id][1]]))
        joint_logprob = tf.reduce_sum(self.logprob,axis=0)

        # 3. Gradient Descent Optimization
        # Only variables with an "mi" component in their name are trained.
        params = [x for x in tf.trainable_variables() if "mi" in str(x.name).split("/")]
        cost = tf.scalar_mul(tf.constant(-1.0,shape=()), tf.add(self.neg_penalty, self.reward)) #tf.add(self.neg_penalty, self.reward)
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        # The differentiated objective is the inner product of cost and
        # joint_logprob, each reshaped to (batch_size, 1).
        gradients = tf.gradients(tf.matmul(tf.reshape(cost, shape=(self.batch_size,1)), tf.reshape(joint_logprob,shape=(self.batch_size,1)), transpose_a=True), params)
        clipped_gradients, global_norm = tf.clip_by_global_norm(gradients, max_gradient_norm) # Clips values of multiple tensors by the ratio of the sum of their norms.
        self.updates = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step) #An Operation that applies the specified gradients. If global_step was not None, that operation also increments global_step.

    # The saver only covers global variables with an "mi" name component,
    # keyed by variable name without the ":0" suffix.
    self.names = {str(x.name).split(":0")[0] : x for x in tf.global_variables() if 'mi' in str(x.name).split("/")}
    self.saver = tf.train.Saver(self.names)
def __init__(self, vocab_size, buckets, layer_size, num_layers, max_gradient_norm, batch_size, learning_rate, learning_rate_decay_factor, use_lstm=False, num_samples=512, MI_use=False, forward_only=False, dtype=tf.float32):
    """Create the bucketed attention seq2seq model.

    Similar to the seq2seq_model.py code in TensorFlow 0.12.1.

    Args:
        vocab_size: Size of the vocabulary (used for both encoder and
            decoder symbols).
        buckets: A list of pairs (I, O), where I specifies maximum input
            length that will be processed in that bucket, and O specifies
            maximum output length. Training instances that have inputs
            longer than I or outputs longer than O will be pushed to the
            next bucket and padded accordingly. We assume that the list is
            sorted. (We may not use bucketing for dialogue.)
        layer_size: The number of units in each layer; also used as the
            embedding size.
        num_layers: The number of layers in the model.
        max_gradient_norm: Gradients are clipped by global norm to this
            value.
        batch_size: The size of the batches used during training; the model
            construction is independent of batch_size, so it can be changed
            after initialization if this is convenient, e.g. for decoding.
        learning_rate: Learning rate to start with.
        learning_rate_decay_factor: Factor the learning rate is multiplied
            by when ``learning_rate_decay_op`` runs.
        use_lstm: True -> LSTM cells, False -> GRU cells.
        num_samples: Number of samples for sampled softmax; sampled softmax
            is used only when ``0 < num_samples < vocab_size``.
        MI_use: If set, the saver only covers global variables whose name
            has a "forward" path component.
        forward_only: If set, the backward pass is not constructed and the
            decoder feeds back its previous output.
        dtype: The data type to use to store internal variables.
    """
    self.vocab_size = vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(
        float(learning_rate), trainable=False, dtype=dtype)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if num_samples > 0 and num_samples < self.vocab_size:
        w_t = tf.get_variable(
            "proj_w", [self.vocab_size, layer_size], dtype=dtype)
        w = tf.transpose(w_t)
        b = tf.get_variable("proj_b", [self.vocab_size], dtype=dtype)
        output_projection = (w, b)

        def sampled_loss(labels, inputs):
            """Sampled-softmax loss; note the (labels, inputs) order.

            The argument order is the opposite of the 0.12.x convention.
            """
            # One label per row, as sampled_softmax_loss is fed here.
            labels = tf.reshape(labels, [-1, 1])
            # Compute the sampled softmax loss in 32-bit floats to avoid
            # numerical instabilities.
            local_w_t = tf.cast(w_t, tf.float32)
            local_b = tf.cast(b, tf.float32)
            local_inputs = tf.cast(inputs, tf.float32)
            return tf.cast(
                tf.nn.sampled_softmax_loss(
                    weights=local_w_t,
                    biases=local_b,
                    labels=labels,
                    inputs=local_inputs,
                    num_sampled=num_samples,
                    num_classes=self.vocab_size),
                dtype)

        softmax_loss_function = sampled_loss
    self.softmax_loss_function = softmax_loss_function

    def make_cell():
        """Return a fresh recurrent cell of the configured type."""
        if use_lstm:
            return core_rnn_cell.BasicLSTMCell(layer_size)
        return core_rnn_cell.GRUCell(layer_size)

    # Create the internal multi-layer cell for our RNN. Each layer gets its
    # own cell object: `[cell] * n` repeats one instance, which newer
    # TensorFlow 1.x releases reject or treat as weight sharing.
    if num_layers > 1:
        cell = core_rnn_cell.MultiRNNCell(
            [make_cell() for _ in range(num_layers)])
    else:
        cell = make_cell()

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            cell,
            num_encoder_symbols=vocab_size,
            num_decoder_symbols=vocab_size,
            embedding_size=layer_size,
            output_projection=output_projection,
            feed_previous=do_decode,
            dtype=dtype)

    self.seq2seq_f = seq2seq_f

    # Feeds for inputs, sized by the largest (last) bucket.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(buckets[-1][0]):
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="encoder{0}".format(i)))
    for i in xrange(buckets[-1][1] + 1):  # For EOS
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(dtype, shape=[None],
                           name="weight{0}".format(i)))

    # Targets are the decoder inputs shifted by one, because of the GO
    # symbol at the beginning.
    targets = [
        self.decoder_inputs[i + 1]
        for i in xrange(len(self.decoder_inputs) - 1)
    ]

    # Training outputs and losses (one entry per bucket).
    if forward_only:
        self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            buckets,
            lambda x, y: seq2seq_f(x, y, True),
            softmax_loss_function=softmax_loss_function)
        # With an output projection, the outputs still have to be projected.
        if output_projection is not None:
            # The loop variable is deliberately not called `b`: that name
            # is the projection bias captured by sampled_loss above.
            for bucket_idx in xrange(len(buckets)):
                self.outputs[bucket_idx] = [
                    tf.matmul(output, output_projection[0]) + output_projection[1]
                    for output in self.outputs[bucket_idx]
                ]
    else:
        self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            buckets,
            lambda x, y: seq2seq_f(x, y, False),
            softmax_loss_function=softmax_loss_function)

    # All variables created with trainable=True.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_idx in xrange(len(buckets)):
            gradients = tf.gradients(self.losses[bucket_idx], params)
            # Clip the gradients jointly by their global norm.
            clipped_gradients, global_norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(global_norm)
            # apply_gradients also increments global_step.
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params),
                                    global_step=self.global_step))

    if MI_use:
        # Save only the "forward" variables, keyed by name without ":0".
        self.names = {
            str(x.name).split(":0")[0]: x
            for x in tf.global_variables()
            if 'forward' in str(x.name).split("/")
        }
        self.saver = tf.train.Saver(self.names)
    else:
        self.saver = tf.train.Saver(tf.global_variables())
def __init__(self, sess, pre_trained_seq2seq, pre_trained_backward, vocab_size, buckets, layer_size, num_layers, max_gradient_norm, candidate_size, learning_rate, learning_rate_decay_factor, use_lstm=False, num_samples=512, forward_only = False, dtype= tf.float32):
    """Create the mutual-information model driven by externally fed losses.

    Similar to the seq2seq_model_rl.py code but it differs in the loss
    function: here the per-bucket losses are placeholders.

    Args:
        sess: Session object, stored as ``self.sess``.
        pre_trained_seq2seq: Pre-trained forward model, stored on the
            instance.
        pre_trained_backward: Pre-trained backward model, stored on the
            instance.
        vocab_size: Size of the vocabulary.
        buckets: A list of pairs (I, O), where I specifies maximum input
            length that will be processed in that bucket, and O specifies
            maximum output length. Training instances that have inputs
            longer than I or outputs longer than O will be pushed to the
            next bucket and padded accordingly. We assume that the list is
            sorted. (We may not use bucketing for dialogue.)
        layer_size: The number of units in each layer.
        num_layers: The number of layers in the model.
        max_gradient_norm: Gradients are clipped by global norm to this
            value.
        candidate_size: The number of candidates (actions); not used in
            this constructor.
        learning_rate: Learning rate to start with.
        learning_rate_decay_factor: Factor the learning rate is multiplied
            by when ``learning_rate_decay_op`` runs.
        use_lstm: True -> LSTM cells, False -> GRU cells.
        num_samples: The output projection variables are created only when
            ``0 < num_samples < vocab_size``.
        forward_only: If set, the gradient/update ops are not constructed.
        dtype: The data type to use to store internal variables.
    """
    self.sess = sess
    self.vocab_size = vocab_size
    self.buckets = buckets
    # Backward buckets use the forward output length for both entries.
    self.buckets_back = [(x[1], x[1]) for x in buckets]
    # This constructor takes no batch size. The attribute used to hold a
    # placeholder note string ("? necessary?"); None marks it as unset.
    self.batch_size = None
    self.learning_rate = tf.Variable(
        float(learning_rate), trainable=False, dtype=dtype)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)
    self.pre_trained_seq2seq = pre_trained_seq2seq
    self.pre_trained_backward = pre_trained_backward
    # Always work on the last (largest) bucket.
    self.bucket_id = len(buckets) - 1

    # Bound up front so seq2seq_f does not hit an unbound name when the
    # sampled-softmax condition below is false.
    output_projection = None
    if num_samples > 0 and num_samples < self.vocab_size:
        w_t = tf.get_variable(
            "proj_w_mi", [self.vocab_size, layer_size], dtype=dtype)
        w = tf.transpose(w_t)
        b = tf.get_variable("proj_b_mi", [self.vocab_size], dtype=dtype)
        output_projection = (w, b)

    # A disabled numpy/session-based `mi_score` prototype, previously kept
    # here inside a bare string literal, was removed as dead code.

    def make_cell():
        """Return a fresh recurrent cell of the configured type."""
        if use_lstm:
            return core_rnn_cell.BasicLSTMCell(layer_size)
        return core_rnn_cell.GRUCell(layer_size)

    # Each layer gets its own cell object: `[cell] * n` repeats one
    # instance, which newer TensorFlow 1.x releases reject or treat as
    # weight sharing.
    if num_layers > 1:
        cell = core_rnn_cell.MultiRNNCell(
            [make_cell() for _ in range(num_layers)])
    else:
        cell = make_cell()

    # The seq2seq function: embedding for the input, plus attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            cell,
            num_encoder_symbols=vocab_size,
            num_decoder_symbols=vocab_size,
            embedding_size=layer_size,
            output_projection=output_projection,
            feed_previous=do_decode,
            dtype=dtype)

    self.seq2seq_f = seq2seq_f

    # Feed placeholders, one per time step, sized by the last bucket.
    self.states, self.states_back = [], []
    self.actions, self.actions_back = [], []
    self.weights, self.weights_back = [], []
    for i in xrange(self.buckets[-1][0]):
        self.states.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="state{0}".format(i)))
    for i in xrange(self.buckets_back[-1][1]):
        self.states_back.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="state_back{0}".format(i)))
    for i in xrange(self.buckets[-1][1]):
        self.actions.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="action{0}".format(i)))
        self.actions_back.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="action_back{0}".format(i)))
        self.weights.append(
            tf.placeholder(dtype, shape=[None],
                           name="weight_rl{0}".format(i)))
        self.weights_back.append(
            tf.placeholder(dtype, shape=[None],
                           name="weight_rl_back{0}".format(i)))

    # The per-bucket losses are fed from outside rather than computed in
    # the graph.
    self.losses = []
    for i in xrange(len(buckets)):
        self.losses.append(
            tf.placeholder(tf.float32, shape=[None],
                           name="losses{0}".format(i)))

    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_idx in xrange(len(buckets)):
            # NOTE(review): the losses are plain placeholders, so these
            # gradients presumably have no path to `params` - confirm the
            # intended training signal.
            gradients = tf.gradients(self.losses[bucket_idx], params)
            # Clip the gradients jointly by their global norm.
            clipped_gradients, global_norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(global_norm)
            # apply_gradients also increments global_step.
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params),
                                    global_step=self.global_step))

    self.saver = tf.train.Saver(tf.global_variables())
def build_model(self):
    """Build the attention-based premise/hypothesis classifier graph.

    Reads ``self.batch_size``, ``self.XMAXLEN``, ``self.YMAXLEN``,
    ``self.vocab_size``, ``self.dim`` and ``self.h_dim``. Creates the input
    placeholders, two LSTM encoders (the second initialised with the final
    state of the first), an attention layer over the premise outputs, a
    3-way softmax prediction, and the accuracy, loss and Adam training ops,
    all stored as attributes on ``self``.
    """
    self.x = tf.placeholder(
        tf.int32, [self.batch_size, self.XMAXLEN], name="premise")
    # NOTE: the two length placeholders are created but not consumed
    # anywhere in this method.
    self.x_length = tf.placeholder(
        tf.int32, [self.batch_size], name="premise_len")
    self.y = tf.placeholder(
        tf.int32, [self.batch_size, self.YMAXLEN], name="hypothesis")
    self.y_length = tf.placeholder(
        tf.int32, [self.batch_size], name="hyp_len")
    # Change this to int32 and it breaks.
    self.target = tf.placeholder(
        tf.float32, [self.batch_size, 3], name="label")

    # DO NOT DO THIS
    # self.batch_size = tf.shape(self.x)[0]
    # self.x_length = tf.shape(self.x)[1]

    self.embed_matrix = tf.get_variable(
        "embeddings", [self.vocab_size, self.dim])
    self.x_emb = tf.nn.embedding_lookup(self.embed_matrix, self.x)
    self.y_emb = tf.nn.embedding_lookup(self.embed_matrix, self.y)
    print(self.x_emb, self.y_emb)

    with tf.variable_scope("encode_x"):
        self.fwd_lstm = core_rnn_cell.BasicLSTMCell(
            self.h_dim, state_is_tuple=True)
        self.x_output, self.x_state = tf.nn.dynamic_rnn(
            cell=self.fwd_lstm, inputs=self.x_emb, dtype=tf.float32)
        # self.x_output, self.x_state = tf.nn.bidirectional_dynamic_rnn(cell_fw=self.fwd_lstm,cell_bw=self.bwd_lstm,inputs=self.x_emb,dtype=tf.float32)
        print(self.x_output)

    with tf.variable_scope("encode_y"):
        # The hypothesis encoder starts from the premise's final state.
        self.fwd_lstm = core_rnn_cell.BasicLSTMCell(
            self.h_dim, state_is_tuple=True)
        self.y_output, self.y_state = tf.nn.dynamic_rnn(
            cell=self.fwd_lstm,
            inputs=self.y_emb,
            initial_state=self.x_state,
            dtype=tf.float32)

    self.Y = self.x_output  # its length must be x_length

    # self.h_n = self.last_relevant(self.y_output,self.x_length)  # TODO
    # Take the last time step of the hypothesis outputs: swap the first two
    # axes, then gather the final index of the new leading axis.
    tmp5 = tf.transpose(self.y_output, [1, 0, 2])
    self.h_n = tf.gather(tmp5, int(tmp5.get_shape()[0]) - 1)
    print(self.h_n)

    # self.h_n_repeat = self.repeat(self.h_n,self.x_length)  # TODO
    # Repeat h_n XMAXLEN times along a new axis 1.
    self.h_n_repeat = tf.expand_dims(self.h_n, 1)
    pattern = tf.stack([1, self.XMAXLEN, 1])
    self.h_n_repeat = tf.tile(self.h_n_repeat, pattern)

    self.W_Y = tf.get_variable("W_Y", shape=[self.h_dim, self.h_dim])
    self.W_h = tf.get_variable("W_h", shape=[self.h_dim, self.h_dim])

    # M = tanh(Y * W_Y + h_n_repeat * W_h); both products are done on the
    # flattened [batch_size * XMAXLEN, h_dim] view and reshaped back.
    tmp1 = tf.matmul(
        tf.reshape(self.Y,
                   shape=[self.batch_size * self.XMAXLEN, self.h_dim]),
        self.W_Y,
        name="Wy")
    self.Wy = tf.reshape(
        tmp1, shape=[self.batch_size, self.XMAXLEN, self.h_dim])
    tmp2 = tf.matmul(
        tf.reshape(self.h_n_repeat,
                   shape=[self.batch_size * self.XMAXLEN, self.h_dim]),
        self.W_h)
    self.Whn = tf.reshape(
        tmp2,
        shape=[self.batch_size, self.XMAXLEN, self.h_dim],
        name="Whn")
    self.M = tf.tanh(tf.add(self.Wy, self.Whn), name="M")

    # Attention weights.
    self.W_att = tf.get_variable("W_att", shape=[self.h_dim, 1])  # h x 1
    tmp3 = tf.matmul(
        tf.reshape(self.M,
                   shape=[self.batch_size * self.XMAXLEN, self.h_dim]),
        self.W_att)
    # The middle 1 is needed so att can be multiplied with Y below.
    self.att = tf.nn.softmax(
        tf.reshape(tmp3,
                   shape=[self.batch_size, 1, self.XMAXLEN],
                   name="att"))  # nb x 1 x Xmax

    # Attention-weighted sum of Y: (nb,1,L) x (nb,L,k) = (nb,1,k),
    # reshaped to (nb,k).
    self.r = tf.reshape(
        tf.matmul(self.att, self.Y, name="r"),
        shape=[self.batch_size, self.h_dim])
    # Alternative kept from the original: use the last step of Y as r.
    # tmp4 = tf.transpose(self.Y, [1, 0, 2])
    # self.r = tf.gather(tmp4, int(tmp4.get_shape()[0]) - 1)

    self.W_p, self.b_p = tf.get_variable(
        "W_p", shape=[self.h_dim, self.h_dim]), tf.get_variable(
            "b_p", shape=[self.h_dim],
            initializer=tf.constant_initializer())
    self.W_x, self.b_x = tf.get_variable(
        "W_x", shape=[self.h_dim, self.h_dim]), tf.get_variable(
            "b_x", shape=[self.h_dim],
            initializer=tf.constant_initializer())
    # NOTE: name="Wy" is already used above; it is left unchanged here so
    # the generated op names stay as they were.
    self.Wpr = tf.matmul(self.r, self.W_p, name="Wy") + self.b_p
    self.Wxhn = tf.matmul(self.h_n, self.W_x, name="Wxhn") + self.b_x
    self.hstar = tf.tanh(tf.add(self.Wpr, self.Wxhn), name="hstar")

    self.W_pred = tf.get_variable("W_pred", shape=[self.h_dim, 3])
    logits = tf.matmul(self.hstar, self.W_pred)
    self.pred = tf.nn.softmax(logits, name="pred_layer")

    correct = tf.equal(tf.argmax(self.pred, 1), tf.argmax(self.target, 1))
    self.acc = tf.reduce_mean(tf.cast(correct, "float"), name="accuracy")

    # Cross-entropy computed from the logits: log_softmax(logits) equals
    # log(softmax(logits)) mathematically but does not yield -inf/NaN when
    # a predicted probability underflows to 0.
    self.loss = -tf.reduce_sum(
        self.target * tf.nn.log_softmax(logits), name="loss")

    self.optimizer = tf.train.AdamOptimizer()
    self.optim = self.optimizer.minimize(
        self.loss, var_list=tf.trainable_variables())
    _ = tf.summary.scalar("loss", self.loss)