def __init__(self, is_training, config):
    """Build the language-model graph: embedding -> stacked LSTM -> softmax.

    Args:
        is_training: when true, dropout is enabled (if ``config.keep_prob``
            is below 1) and the gradient-descent training op is created.
        config: object exposing ``batch_size``, ``num_steps``,
            ``hidden_size``, ``vocab_size``, ``keep_prob``, ``num_layers``
            and ``max_grad_norm``.
    """
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    hidden_size = config.hidden_size
    vocab_size = config.vocab_size
    # Dropout is only ever applied while training.
    use_dropout = is_training and config.keep_prob < 1

    token_shape = [batch_size, num_steps]
    self._input_data = tf.placeholder(tf.int32, token_shape)
    self._targets = tf.placeholder(tf.int32, token_shape)

    # Slightly better results can be obtained with forget gate biases
    # initialized to 1 but the hyperparameters of the model would need to be
    # different than reported in the paper.
    layer_cell = rnn_cell.BasicLSTMCell(hidden_size, forget_bias=0.0)
    if use_dropout:
        layer_cell = rnn_cell.DropoutWrapper(
            layer_cell, output_keep_prob=config.keep_prob)
    cell = rnn_cell.MultiRNNCell([layer_cell] * config.num_layers)

    self._initial_state = cell.zero_state(batch_size, tf.float32)

    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, hidden_size])
        inputs = tf.nn.embedding_lookup(embedding, self._input_data)

    if use_dropout:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # Simplified version of tensorflow.models.rnn.rnn.py's rnn().
    # This builds an unrolled LSTM for tutorial purposes only.
    # In general, use the rnn() or state_saving_rnn() from rnn.py.
    #
    # The alternative version of the code below is:
    #
    # from tensorflow.models.rnn import rnn
    # inputs = [tf.squeeze(input_, [1])
    #           for input_ in tf.split(1, num_steps, inputs)]
    # outputs, state = rnn.rnn(cell, inputs, initial_state=self._initial_state)
    step_outputs = []
    state = self._initial_state
    with tf.variable_scope("RNN"):
        for step in range(num_steps):
            # The first step creates the cell variables; later steps share them.
            if step > 0:
                tf.get_variable_scope().reuse_variables()
            step_output, state = cell(inputs[:, step, :], state)
            step_outputs.append(step_output)

    stacked = tf.concat(1, step_outputs)
    output = tf.reshape(stacked, [-1, hidden_size])
    softmax_w = tf.get_variable("softmax_w", [hidden_size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    logits = tf.matmul(output, softmax_w) + softmax_b

    flat_targets = tf.reshape(self._targets, [-1])
    unit_weights = tf.ones([batch_size * num_steps])
    loss = seq2seq.sequence_loss_by_example(
        [logits], [flat_targets], [unit_weights])
    self._cost = cost = tf.reduce_sum(loss) / batch_size
    self._final_state = state

    if not is_training:
        return

    # Training-only part: clipped-gradient SGD with a feedable learning rate.
    self._lr = tf.Variable(0.0, trainable=False)
    trainable = tf.trainable_variables()
    raw_grads = tf.gradients(cost, trainable)
    grads, _ = tf.clip_by_global_norm(raw_grads, config.max_grad_norm)
    optimizer = tf.train.GradientDescentOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, trainable))
# Script fragment: rebuild the PTB model, restore its checkpoint, pull the
# embedding and softmax parameters out as arrays, then run one recurrent step
# on the embedding of a seed word and print the arg-max next word.
#
# NOTE(review): the original line breaks and indentation were lost, so the
# nesting below is a best guess -- confirm against the author's source.
# NOTE(review): `session`, `config`, `vocab_size`, `size`, `num_layers`, `voc`
# and `cov` are not defined in this fragment; `session` is used before the
# `with ... tf.Session() as session` statement at the bottom, which suggests
# two separate snippets were pasted together.
m = PTBModel(is_training=False, config=config)
saver = tf.train.Saver()
saver.restore(session, "/Users/marting/scratch/tensorflow/model.ckpt")
with tf.variable_scope("model", reuse=True):
    print("Model restored.")
    # Re-open the variables under the "model" scope and fetch their values.
    embedding = tf.get_variable("embedding", [vocab_size, size])
    softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    softmax_w = session.run(softmax_w)
    softmax_b = session.run(softmax_b)
    embedding = session.run(embedding)
    nextword = 'food'
    # presumably `voc` maps word -> vocabulary index; verify against caller
    wordvec = embedding[voc[nextword]]
    print(wordvec)
    lstm_cell = rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
    cell = rnn_cell.MultiRNNCell([lstm_cell] * num_layers)
    state = cell.zero_state(1, tf.float32)
    input = tf.convert_to_tensor(wordvec)
    with tf.variable_scope("RNN"):
        #state = tf.reshape(state, [1,800])
        # NOTE(review): 200 is hard-coded here; it looks like it should
        # equal `size` -- confirm.
        input = tf.reshape(input, [1, 200])
        (cell_output, state) = cell(input, state)
    cell_output = cell_output.eval()
    print(cell_output.shape)
    # presumably `cov` maps vocabulary index -> word; verify against caller
    nextword = cov[np.argmax((cell_output.dot(softmax_w) + softmax_b))]
    print(nextword)
with tf.Graph().as_default(), tf.Session() as session:
    with tf.variable_scope("model", reuse=True):
        m = PTBModel(is_training=False, config=config)
    saver = tf.train.Saver()
def __init__(self, is_training, length):
    """Build an unrolled recurrent regression graph with an MSE cost.

    The cell type, depth, sizes and dropout rate are read from the
    module-level ``FLAGS`` (``model``, ``layer``, ``hidden_dim``,
    ``input_dim``, ``output_dim``, ``keep_prob``, ``batch_size``,
    ``max_grad_norm``).

    Args:
        is_training: when true, dropout is enabled (if ``FLAGS.keep_prob``
            is below 1) and the gradient-descent training op is created.
        length: number of time steps to unroll.

    Raises:
        ValueError: if ``FLAGS.model`` is not "rnn", "lstm" or "gru", or if
            ``FLAGS.layer`` is below 1 for the "rnn"/"lstm" models.
    """
    self.batch_size = batch_size = FLAGS.batch_size
    self.num_steps = num_steps = length
    hidden_size = FLAGS.hidden_dim

    self._input_data = tf.placeholder(
        tf.float32, [batch_size, None, FLAGS.input_dim])
    self._targets = tf.placeholder(
        tf.float32, [batch_size, None, FLAGS.output_dim])

    if FLAGS.model == "rnn":
        base_cell = rnn_cell.BasicRNNCell(num_units=FLAGS.hidden_dim)
    elif FLAGS.model == "lstm":
        base_cell = rnn_cell.BasicLSTMCell(
            num_units=FLAGS.hidden_dim, forget_bias=1.0)
    elif FLAGS.model == "gru":
        base_cell = rnn_cell.GRUCell(num_units=FLAGS.hidden_dim)
    else:
        # The message is formatted explicitly; passing the value as a second
        # exception argument would leave "%s" unexpanded.
        raise ValueError("Invalid model: %s" % FLAGS.model)

    if is_training and FLAGS.keep_prob < 1:
        base_cell = rnn_cell.DropoutWrapper(
            base_cell, output_keep_prob=FLAGS.keep_prob)

    # The GRU model is always single-layer; FLAGS.layer is not consulted.
    if FLAGS.model == "gru" or FLAGS.layer == 1:
        cell = base_cell
    elif FLAGS.layer > 1:
        cell = rnn_cell.MultiRNNCell([base_cell] * FLAGS.layer)
    else:
        raise ValueError("Invalid number of layers: %s" % FLAGS.layer)

    self._initial_state = cell.zero_state(batch_size, tf.float32)

    outputs = []
    state = self._initial_state
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            # The first step creates the cell variables; later steps share them.
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            (cell_output, state) = cell(
                self._input_data[:, time_step, :], state)
            outputs.append(cell_output)
    self._final_state = state

    # Linear read-out applied to every time step of every sequence.
    hidden_output = tf.reshape(tf.concat(1, outputs), [-1, hidden_size])
    V_1 = tf.get_variable(
        "v_1",
        shape=[hidden_size, FLAGS.output_dim],
        initializer=tf.random_uniform_initializer(
            -tf.sqrt(1. / hidden_size), tf.sqrt(1. / hidden_size)))
    b_1 = tf.get_variable(
        "b_1",
        shape=[FLAGS.output_dim],
        initializer=tf.constant_initializer(0.1))
    logits = tf.add(tf.matmul(hidden_output, V_1), b_1)

    target = tf.reshape(self._targets, [-1, FLAGS.output_dim])
    # Gradients use the summed half squared error; the reported cost is the
    # mean squared error.
    training_loss = tf.reduce_sum(tf.pow(logits - target, 2)) / 2
    mse = tf.reduce_mean(tf.pow(logits - target, 2))
    self._cost = mse

    if not is_training:
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(training_loss, tvars), FLAGS.max_grad_norm)
    optimizer = tf.train.GradientDescentOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self):
    """Build the learned-optimizer graph and its training/comparison ops.

    A recurrent cell (type chosen by the module-level ``rnn_type``) is run
    for ``seq_length`` steps inside ``tf.while_loop``.  Each step feeds the
    current SNF gradients to the cell, projects the output to an update,
    adds it to the point and records the new SNF loss.  The training loss
    combines the overall change in SNF loss with an oscillation cost.  A
    second, single-step copy of the cell (sharing variables through scope
    "o1") is exposed for the comparison phase.

    Raises:
        ValueError: if ``rnn_type`` is not 'rnn', 'gru' or 'lstm'.
    """
    if rnn_type not in ('rnn', 'gru', 'lstm'):
        raise ValueError("Invalid rnn_type: %s" % rnn_type)

    # Input
    self.point = tf.placeholder(tf.float32, [m, 1], 'points')

    # Used in training only
    self.variances = tf.placeholder(tf.float32, [k, 1], 'variances')
    self.weights = tf.placeholder(tf.float32, [k, 1], 'weights')
    # Points which define the hyperplanes
    self.hyperplanes = tf.placeholder(tf.float32, [m, m, k], 'hyperplanes')

    # Width of the flattened recurrent state.  The LSTM case is twice as
    # wide, matching the factor the original placeholder shape used; the
    # same width is now also used for the loop's initial state below.
    state_width = (2 if rnn_type == 'lstm' else 1) * num_rnn_layers * rnn_size

    # initial_rnn_state is passed during evaluation but not during training
    # each dimension has an independent hidden state, required in order to
    # simulate Adam, RMSProp etc.
    self.initial_rnn_state = tf.placeholder_with_default(
        input=tf.zeros([m, state_width]), shape=[None, state_width])

    # The scope allows these variables to be excluded from being
    # reinitialized during the comparison phase
    with tf.variable_scope("optimizer"):
        if rnn_type == 'rnn':
            cell = rnn_cell.BasicRNNCell(rnn_size)
        elif rnn_type == 'gru':
            cell = rnn_cell.GRUCell(rnn_size)
        else:  # 'lstm'
            cell = rnn_cell.LSTMCell(rnn_size)
        self.cell = rnn_cell.MultiRNNCell([cell] * num_rnn_layers)

    # Arguments passed to the condition and body functions
    step = tf.constant(0)
    point = self.point

    snf_loss = snf.calc_snf_loss_tf(point, self.hyperplanes, self.variances,
                                    self.weights)
    snf_grads = snf.calc_grads_tf(snf_loss, point)
    snf_grads = tf.squeeze(snf_grads, [0])

    snf_loss_ta = tf.TensorArray(dtype=tf.float32, size=seq_length)
    update_ta = tf.TensorArray(dtype=tf.float32, size=seq_length)
    rnn_state = tf.zeros([m, state_width])

    loop_vars = [
        step, point, snf_grads, rnn_state, snf_loss_ta, update_ta,
        self.hyperplanes, self.variances, self.weights
    ]

    def condition(step, point, snf_grads, rnn_state, snf_loss_ta, update_ta,
                  hyperplanes, variances, weights):
        return tf.less(step, seq_length)

    def body(step, point, snf_grads, rnn_state, snf_loss_ta, update_ta,
             hyperplanes, variances, weights):
        h, rnn_state_out = self.cell(snf_grads, rnn_state)

        # Final layer of the optimizer
        # Cannot use fc_layer due to a 'must be from the same frame' error
        d = np.sqrt(1.0) / np.sqrt(
            rnn_size + 1)  ### should be sqrt(2, 3 or 6?)
        initializer = tf.random_uniform_initializer(-d, d)
        W = tf.get_variable("W", [rnn_size, 1], initializer=initializer)

        # No bias, linear activation function
        update = tf.matmul(h, W)
        update = tf.reshape(update, [m, 1])
        update = inv_scale_grads(update)

        new_point = point + update

        snf_loss = snf.calc_snf_loss_tf(new_point, hyperplanes, variances,
                                        weights)
        snf_loss_ta = snf_loss_ta.write(step, snf_loss)
        update_ta = update_ta.write(step, update)

        # NOTE(review): the gradient is taken w.r.t. the pre-update `point`
        # rather than `new_point` -- confirm this is intended.
        snf_grads_out = snf.calc_grads_tf(snf_loss, point)
        snf_grads_out = tf.reshape(snf_grads_out, [m, 1])

        step += 1
        return [
            step, new_point, snf_grads_out, rnn_state_out, snf_loss_ta,
            update_ta, hyperplanes, variances, weights
        ]

    # Do the computation
    with tf.variable_scope("o1"):
        res = tf.while_loop(condition, body, loop_vars)

    self.new_point = res[1]
    self.rnn_state_out = res[3]
    losses = res[4].pack()
    updates = res[5].pack()

    # Total change in the SNF loss
    # Improvement: 2 - 3 = -1 (small loss)
    snf_loss_change = losses[seq_length - 1] - losses[0]
    snf_loss_change = tf.maximum(
        snf_loss_change, loss_asymmetry * snf_loss_change)  # Asymmetric loss
    self.loss_change_sign = tf.sign(snf_loss_change)

    # Oscillation cost
    overall_update = tf.zeros([m, 1])
    norm_sum = 0.0
    for i in range(seq_length):
        overall_update += updates[i, :, :]
        norm_sum += tf_norm(updates[i, :, :])
    osc_cost = norm_sum / tf_norm(overall_update)  # > 1

    self.total_loss = snf_loss_change * tf.pow(
        osc_cost, tf.sign(snf_loss_change))

    #===# Model training #===#
    #opt = tf.train.RMSPropOptimizer(0.01,momentum=0.5)
    opt = tf.train.AdamOptimizer()
    train_vars = tf.trainable_variables()

    gvs = opt.compute_gradients(self.total_loss, train_vars)

    # Element-wise clipped gradients, exposed for fetching.
    self.gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var)
                for (grad, var) in gvs]

    # Gradients are applied from placeholders, one per variable.
    self.grads_input = [(tf.placeholder(tf.float32, shape=v.get_shape()), v)
                        for (_, v) in gvs]
    self.train_step = opt.apply_gradients(self.grads_input)

    #===# Comparison code #===#
    self.input_grads = tf.placeholder(
        tf.float32, [1, None, 1], 'input_grads')  ### Remove first dimension?
    input_grads = tf.squeeze(self.input_grads, [0])

    # Reuse the variables created by the while-loop body above.
    with tf.variable_scope("o1", reuse=True):
        h, self.rnn_state_out_compare = self.cell(
            input_grads, self.initial_rnn_state)

        W = tf.get_variable("W")
        update = tf.matmul(h, W)
        update = tf.reshape(update, [-1, 1])
        self.update = inv_scale_grads(update)
def __init__(self, source_vocab_size, target_vocab_size, buckets, hidden_edim,
             hidden_units, num_layers, keep_prob, max_gradient_norm,
             batch_size, learning_rate, learning_rate_decay_factor, beam_size,
             use_lstm=False, forward_only=False):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded
        accordingly. We assume that the list is sorted, e.g.,
        [(2, 4), (8, 16)].
      hidden_edim: number of dimensions for word embedding
      hidden_units: number of hidden units for each layer
      num_layers: number of layers in the model.
      keep_prob: input keep probability of the dropout wrapper that is put
        around the cell when the backward pass is built.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for
        decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when
        needed.
      beam_size: the beam size used in beam search.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      forward_only: if set, we do not construct the backward pass in the
        model.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    def loss_function(logit, target, output_projection):
        logit = math_ops.matmul(logit, output_projection, transpose_b=True)
        target = array_ops.reshape(target, [-1])
        crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
            logit, target)
        return crossent

    softmax_loss_function = loss_function

    # Create the internal multi-layer cell for our RNN.
    single_cell = rnn_cell.GRUCell(hidden_units)
    if use_lstm:
        single_cell = rnn_cell.BasicLSTMCell(hidden_units)  # added by yfeng
    cell = single_cell
    if num_layers > 1:
        cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
    if not forward_only:
        cell = rnn_cell.DropoutWrapper(cell,
                                       input_keep_prob=keep_prob,
                                       seed=SEED)

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, encoder_mask, encoder_probs, encoder_ids,
                  encoder_hs, mem_mask, decoder_inputs, do_decode):
        return seq2seq_fy.embedding_attention_seq2seq(
            encoder_inputs,
            encoder_mask,
            encoder_probs,
            encoder_ids,
            encoder_hs,
            mem_mask,
            decoder_inputs,
            cell,
            num_encoder_symbols=source_vocab_size,
            num_decoder_symbols=target_vocab_size,
            embedding_size=hidden_edim,
            beam_size=beam_size,
            num_layers=num_layers,
            feed_previous=do_decode)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    self.decoder_aligns = []
    self.decoder_align_weights = []
    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
        self.encoder_inputs.append(
            tf.placeholder(tf.int32,
                           shape=[None],
                           name="encoder{0}".format(i)))
    for i in xrange(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32,
                           shape=[None],
                           name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32,
                           shape=[None],
                           name="weight{0}".format(i)))
        self.decoder_aligns.append(
            tf.placeholder(tf.float32,
                           shape=[None, None],
                           name="align{0}".format(i)))
        self.decoder_align_weights.append(
            tf.placeholder(tf.float32,
                           shape=[None],
                           name="align_weight{0}".format(i)))
    self.encoder_mask = tf.placeholder(tf.int32,
                                       shape=[None, None],
                                       name="encoder_mask")
    self.encoder_probs = tf.placeholder(
        tf.float32,
        shape=[None, None, self.target_vocab_size],
        name="encoder_prob")
    self.encoder_ids = tf.placeholder(tf.int32,
                                      shape=[None, None],
                                      name="encoder_id")
    self.encoder_hs = tf.placeholder(tf.float32,
                                     shape=[None, None, None],
                                     name="encoder_h")
    self.mem_mask = tf.placeholder(tf.float32,
                                   shape=[None, None],
                                   name="mem_mask")

    # Our targets are decoder inputs shifted by one.
    targets = [
        self.decoder_inputs[i + 1]
        for i in xrange(len(self.decoder_inputs) - 1)
    ]

    # Training outputs and losses.  Decoding (forward_only) feeds the
    # previous output back in; training does not.
    do_decode = bool(forward_only)
    self.outputs, self.losses, self.symbols = seq2seq_fy.model_with_buckets(
        self.encoder_inputs,
        self.encoder_mask,
        self.encoder_probs,
        self.encoder_ids,
        self.encoder_hs,
        self.mem_mask,
        self.decoder_inputs,
        targets,
        self.target_weights,
        self.decoder_aligns,
        self.decoder_align_weights,
        buckets,
        lambda x, y, z, s, a, b, c: seq2seq_f(x, y, z, s, a, b, c, do_decode),
        softmax_loss_function=softmax_loss_function)

    # Names of the memory-attention variables, their Adam slot variables
    # and the Adam power accumulators.  Built once so the train / load /
    # save selections below cannot drift apart.
    attn_scope = (u'embedding_attention_seq2seq/embedding_attention_decoder/'
                  u'attention_decoder/attention/')
    attn_vars = (u'AttnVt_0', u'AttnWt_0', u'AttnU_0/Linear_mem/Matrix',
                 u'AttnU_0/Linear_mem/Bias')
    slot_suffixes = (u':0', u'/Adam:0', u'/Adam_1:0')
    memory_attention_names = frozenset(
        [u'beta1_power:0', u'beta2_power:0'] +
        [attn_scope + var + suffix
         for var in attn_vars for suffix in slot_suffixes])
    # presumably 'Variable:0' / 'Variable_1:0' are the unnamed learning-rate
    # and global-step variables created above -- confirm.
    saved_names = memory_attention_names | frozenset(
        [u'Variable:0', u'Variable_1:0'])

    # only update memory attention parameters
    params_to_update = [
        p for p in tf.trainable_variables()
        if p.name in memory_attention_names
    ]
    if not forward_only:
        self.gradient_norms = []
        self.gradient_norms_print = []
        self.updates = []
        opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        for b in xrange(len(buckets)):
            gradients = tf.gradients(
                self.losses[b],
                params_to_update,
                aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params_to_update),
                                    global_step=self.global_step))

    # load trained NMT parameters
    params_to_load = [
        p for p in tf.all_variables()
        if p.name not in memory_attention_names
    ]

    # only save memory attention parameters
    params_to_save = [
        p for p in tf.all_variables() if p.name in saved_names
    ]

    self.saver_old = tf.train.Saver(params_to_load,
                                    max_to_keep=1000,
                                    keep_checkpoint_every_n_hours=6)
    self.saver = tf.train.Saver(params_to_save,
                                max_to_keep=1000,
                                keep_checkpoint_every_n_hours=6)
def __init__(self, source_vocab_size, target_vocab_size, buckets, hidden_size,
             num_layers, max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, num_samples=-1, embedding_size=200,
             forward_only=False, beam_search=False, beam_size=10, category=6,
             use_emb=False, use_autoEM=False, use_imemory=False,
             use_ememory=False, emotion_size=200, imemory_size=256,
             dtype=tf.float32):
    """Create the emotion-aware attention seq2seq model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: list of (input_length, output_length) pairs; the last bucket
        determines how many input placeholders are created.
      hidden_size: number of units in each GRU layer.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients are clipped to this global norm.
      batch_size: stored on the instance as ``self.batch_size``.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: factor applied by
        ``learning_rate_decay_op``.
      num_samples: number of samples for sampled softmax; sampled softmax
        is only used when 0 < num_samples < target_vocab_size.
      embedding_size: forwarded to the seq2seq function and the classifier.
      forward_only: if set, the backward pass is not constructed.
      beam_search: with forward_only, build the beam-search decoder instead
        of the loss-producing model.
      beam_size: forwarded to the seq2seq function.
      category: forwarded as ``emotion_category`` and to the classifier.
      use_emb, use_imemory: if either is set, the decoder uses a
        ``MEMGRUCell`` first layer; both are forwarded to the seq2seq
        function.
      use_autoEM: if set, the emotion/grammar classifier is built.
      use_ememory, emotion_size, imemory_size: forwarded to the seq2seq
        function.
      dtype: dtype of the variables and of the target-weight placeholders.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(
        float(learning_rate), trainable=False, dtype=dtype)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # The output projection is always created; only the loss function
    # depends on whether sampled softmax is in use.
    w_t = tf.get_variable("proj_w", [self.target_vocab_size, hidden_size],
                          dtype=dtype)
    w = tf.transpose(w_t)
    b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=dtype)
    output_projection = (w, b)

    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary
    # size.
    if 0 < num_samples < self.target_vocab_size:

        def sampled_loss(inputs, labels):
            labels = tf.reshape(labels, [-1, 1])
            # We need to compute the sampled_softmax_loss using 32bit
            # floats to avoid numerical instabilities.
            local_w_t = tf.cast(w_t, tf.float32)
            local_b = tf.cast(b, tf.float32)
            local_inputs = tf.cast(inputs, tf.float32)
            return tf.cast(
                tf.nn.sampled_softmax_loss(local_w_t, local_b, local_inputs,
                                           labels, num_samples,
                                           self.target_vocab_size),
                dtype)

        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    gru = tf.nn.rnn_cell.GRUCell(hidden_size)
    encoder_cell = gru
    if num_layers > 1:
        encoder_cell = rnn_cell.MultiRNNCell([gru] * num_layers)

    # The decoder shares the encoder cell unless a memory-aware first
    # layer is requested.
    decoder_cell = encoder_cell
    if use_imemory or use_emb:
        decoder_cell = rnn_cell.MEMGRUCell(hidden_size)
        if num_layers > 1:
            decoder_cell = rnn_cell.MEMMultiRNNCell(
                [decoder_cell] + [gru] * (num_layers - 1))

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, encoder_emotions,
                  decoder_emotions, do_decode, autoEM_logit):
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            encoder_emotions,
            decoder_emotions,
            autoEM_logit,
            encoder_cell,
            decoder_cell,
            num_encoder_symbols=source_vocab_size,
            num_decoder_symbols=target_vocab_size,
            embedding_size=embedding_size,
            hidden_size=hidden_size,
            emotion_category=category,
            emotion_size=emotion_size,
            imemory_size=imemory_size,
            use_emb=use_emb,
            use_imemory=use_imemory,
            use_ememory=use_ememory,
            output_projection=output_projection,
            initial_state_attention=True,
            feed_previous=do_decode,
            dtype=dtype,
            beam_search=beam_search,
            beam_size=beam_size)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in range(buckets[-1][0]):  # Last bucket is the biggest one.
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="encoder{0}".format(i)))
    for i in range(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(dtype, shape=[None],
                           name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = [self.decoder_inputs[i + 1]
               for i in range(len(self.decoder_inputs) - 1)]

    self.decoder_emotions = tf.placeholder(tf.int32, shape=[None, 2],
                                           name="decoder_emotion")
    self.encoder_emotions = tf.placeholder(tf.int32, shape=[None, 2],
                                           name="encoder_emotion")

    if use_autoEM:
        senti_cell = tf.nn.rnn_cell.GRUCell(hidden_size)
        grammar_cell = tf.nn.rnn_cell.GRUCell(hidden_size)
        if num_layers > 1:
            senti_cell = rnn_cell.MultiRNNCell([senti_cell] * num_layers)
            grammar_cell = rnn_cell.MultiRNNCell([grammar_cell] * num_layers)
        (self.autoEM_losses, self.pos_logits, self.res_logits,
         self.res_losses, self.res_cross_entropy, self.rlabels,
         self.weight) = seq2seq.classify_model_with_buckets(
             senti_cell, grammar_cell, self.encoder_inputs,
             self.encoder_emotions, self.decoder_emotions, buckets,
             hidden_size, embedding_size, category, source_vocab_size)
        self.pos_predics = [tf.arg_max(each, 1) for each in self.pos_logits]
        self.res_predics = [tf.arg_max(each, 1) for each in self.res_logits]

    # Training outputs and losses.
    if forward_only and beam_search:
        (self.outputs, self.beam_results, self.beam_symbols,
         self.beam_parents) = seq2seq.decode_model_with_buckets(
             self.encoder_inputs, self.decoder_inputs, targets,
             self.target_weights, self.encoder_emotions,
             self.decoder_emotions, buckets,
             lambda w, x, y, z, m: seq2seq_f(w, x, y, z, True, m),
             softmax_loss_function=softmax_loss_function)
    else:
        # Decoding without beam search and training build the same model;
        # only feed_previous differs.
        # NOTE(review): self.res_logits only exists when use_autoEM is set,
        # so this path currently requires use_autoEM=True -- confirm what
        # should be passed otherwise.
        do_decode = bool(forward_only)
        self.outputs, self.losses, self.ppxes = seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, self.encoder_emotions,
            self.decoder_emotions, buckets,
            lambda w, x, y, z, m: seq2seq_f(w, x, y, z, do_decode, m),
            self.res_logits,
            softmax_loss_function=softmax_loss_function,
            use_imemory=use_imemory,
            use_ememory=use_ememory)

    # The combined loss needs both the classifier losses and the seq2seq
    # losses; neither exists in beam-search decoding or without use_autoEM,
    # so it is only built when both are available.
    if use_autoEM and not (forward_only and beam_search):
        self.total_losses = (1 * np.array(self.autoEM_losses) +
                             1 * np.array(self.losses)).tolist()
        # self.total_losses = self.autoEM_losses

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for b in range(len(buckets)):  # len(buckets) is 4 on this occasion
            gradients = tf.gradients(self.total_losses[b], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(opt.apply_gradients(
                zip(clipped_gradients, params),
                global_step=self.global_step))

    # Variables whose names contain none of these tags are saved and
    # restored by `pretrain_saver`; every other variable goes to
    # `initial_var`.
    excluded_tags = ('Emotion', 'emotion', 'memory', 'Memory', 'classify',
                     'Attention_0')
    self.pretrain_var = [
        var for var in tf.trainable_variables()
        if not any(tag in var.name for tag in excluded_tags)
    ]
    self.initial_var = [
        var for var in tf.all_variables() if var not in self.pretrain_var
    ]

    self.pretrain_saver = tf.train.Saver(
        self.pretrain_var, write_version=tf.train.SaverDef.V2)
    self.saver = tf.train.Saver(
        tf.all_variables(), write_version=tf.train.SaverDef.V2,
        max_to_keep=400)
def __init__(self, source_vocab_size, target_vocab_size, buckets, size,
             num_layers, max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, use_lstm=False, num_samples=512,
             forward_only=False):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded
        accordingly. We assume that the list is sorted, e.g.,
        [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for
        decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when
        needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the
        model.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    decayed_rate = self.learning_rate * learning_rate_decay_factor
    self.learning_rate_decay_op = self.learning_rate.assign(decayed_rate)
    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary
    # size.
    if 0 < num_samples < self.target_vocab_size:
        with tf.device("/cpu:0"):
            w = tf.get_variable("proj_w", [size, self.target_vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [self.target_vocab_size])
        output_projection = (w, b)

        def sampled_loss(inputs, labels):
            with tf.device("/cpu:0"):
                column_labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(
                    w_t, b, inputs, column_labels, num_samples,
                    self.target_vocab_size)

        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    if use_lstm:
        single_cell = rnn_cell.BasicLSTMCell(size)
    else:
        single_cell = rnn_cell.GRUCell(size)
    if num_layers > 1:
        cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
    else:
        cell = single_cell

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            cell,
            source_vocab_size,
            target_vocab_size,
            output_projection=output_projection,
            feed_previous=do_decode)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    max_encoder_len, max_decoder_len = buckets[-1][0], buckets[-1][1]
    for idx in xrange(max_encoder_len):  # Last bucket is the biggest one.
        encoder_feed = tf.placeholder(
            tf.int32, shape=[None], name="encoder{0}".format(idx))
        self.encoder_inputs.append(encoder_feed)
    for idx in xrange(max_decoder_len + 1):
        decoder_feed = tf.placeholder(
            tf.int32, shape=[None], name="decoder{0}".format(idx))
        self.decoder_inputs.append(decoder_feed)
        weight_feed = tf.placeholder(
            tf.float32, shape=[None], name="weight{0}".format(idx))
        self.target_weights.append(weight_feed)

    # Our targets are decoder inputs shifted by one.
    targets = [
        self.decoder_inputs[idx + 1]
        for idx in xrange(len(self.decoder_inputs) - 1)
    ]

    # Training outputs and losses.  Decoding feeds the previous output back
    # into the decoder; training does not.
    do_decode = bool(forward_only)
    self.outputs, self.losses = seq2seq.model_with_buckets(
        self.encoder_inputs,
        self.decoder_inputs,
        targets,
        self.target_weights,
        buckets,
        self.target_vocab_size,
        lambda x, y: seq2seq_f(x, y, do_decode),
        softmax_loss_function=softmax_loss_function)
    # If we use output projection, we need to project outputs for decoding.
    if forward_only and output_projection is not None:
        proj_w, proj_b = output_projection[0], output_projection[1]
        for bucket_id in xrange(len(buckets)):
            self.outputs[bucket_id] = [
                tf.nn.xw_plus_b(output, proj_w, proj_b)
                for output in self.outputs[bucket_id]
            ]

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_id in xrange(len(buckets)):
            gradients = tf.gradients(self.losses[bucket_id], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            update_op = opt.apply_gradients(
                zip(clipped_gradients, params), global_step=self.global_step)
            self.updates.append(update_op)

    self.saver = tf.train.Saver(tf.all_variables())
def __init__(
        self,
        source_vocab_size_1,
        source_vocab_size_2,
        target_vocab_size,
        buckets,
        hidden_edim,
        hidden_units,
        num_layers,
        max_gradient_norm,
        batch_size,
        learning_rate,
        learning_rate_decay_factor,
        beam_size,
        constant_emb_en,
        constant_emb_fr,
        use_lstm=False,
        num_samples=10240,
        forward_only=False):
    """Create the two-encoder attention seq2seq model.

    Builds, in order: the optional output projection and its loss, the
    recurrent cell, the input placeholders, one sub-model per bucket (via
    seq2seq_al.model_with_buckets) and, unless forward_only is set, one
    gradient-clipped Adam update op per bucket. A Saver over all variables
    is always created.

    Args:
      source_vocab_size_1: size of the first source vocabulary.
      source_vocab_size_2: size of the second source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of tuples (I1, I2, O). The last bucket is assumed to
        be the biggest one: its element 0 fixes how many placeholders are
        created for the first encoder, element 1 for the second encoder,
        and element 2 (plus one) for the decoder inputs and target weights.
      hidden_edim: number of dimensions for word embedding (passed on as
        embedding_size).
      hidden_units: number of hidden units for each layer.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training. It is only
        stored on the instance here; every placeholder is created with an
        unspecified batch dimension.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when
        learning_rate_decay_op is run.
      beam_size: forwarded to seq2seq_al.embedding_attention_seq2seq.
      constant_emb_en: forwarded unchanged to
        seq2seq_al.embedding_attention_seq2seq (presumably fixed embeddings
        for one language -- confirm against seq2seq_al).
      constant_emb_fr: same as constant_emb_en, for the other language.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: kept for interface compatibility. No sampling is done in
        this constructor: any value > 0 enables the output projection with
        a full softmax cross-entropy loss; otherwise no projection and no
        custom loss function are passed on.
      forward_only: if set, we do not construct the backward pass in the
        model, and the cell is built without dropout.
    """
    self.source_vocab_size_1 = source_vocab_size_1
    self.source_vocab_size_2 = source_vocab_size_2
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # Output projection and the loss that applies it.
    output_projection = None
    softmax_loss_function = None
    if num_samples > 0:
        # NOTE(review): the projection input is hidden_units // 2 wide,
        # presumably because the decoder in seq2seq_al halves the hidden
        # width before projecting -- confirm against seq2seq_al.
        proj_w = tf.get_variable(
            "proj_w", [hidden_units // 2, self.target_vocab_size],
            initializer=tf.random_normal_initializer(0, 0.01, seed=SEED))
        # The projection bias is a constant zero and is never trained.
        proj_b = tf.get_variable(
            "proj_b", [self.target_vocab_size],
            initializer=tf.constant_initializer(0.0), trainable=False)
        output_projection = (proj_w, proj_b)

        def sampled_loss(logit, target):
            # Despite the name this is a full softmax: project to the
            # vocabulary, then take sparse cross-entropy per example.
            logit = nn_ops.xw_plus_b(
                logit, output_projection[0], output_projection[1])
            target = array_ops.reshape(target, [-1])
            return nn_ops.sparse_softmax_cross_entropy_with_logits(
                logit, target)

        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    cell_class = rnn_cell.BasicLSTMCell if use_lstm else rnn_cell.GRUCell
    single_cell = cell_class(hidden_units)
    cell = single_cell
    if num_layers > 1:
        cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
    # Dropout is a training-time regulariser. It used to be applied
    # unconditionally, which also perturbed the inputs while decoding;
    # it is now skipped when only the forward pass is built.
    if not forward_only:
        cell = rnn_cell.DropoutWrapper(cell, input_keep_prob=0.8, seed=SEED)

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs_1, encoder_inputs_2, encoder_mask_1,
                  encoder_mask_2, decoder_inputs, do_decode):
        return seq2seq_al.embedding_attention_seq2seq(
            encoder_inputs_1,
            encoder_inputs_2,
            encoder_mask_1,
            encoder_mask_2,
            decoder_inputs,
            cell,
            num_encoder_symbols_1=source_vocab_size_1,
            num_encoder_symbols_2=source_vocab_size_2,
            num_decoder_symbols=target_vocab_size,
            embedding_size=hidden_edim,
            beam_size=beam_size,
            constant_emb_en=constant_emb_en,
            constant_emb_fr=constant_emb_fr,
            output_projection=output_projection,
            feed_previous=do_decode)

    # Feeds for inputs. The last bucket is the biggest one, so it decides
    # how many per-step placeholders are needed.
    self.encoder_inputs_1 = []
    self.encoder_inputs_2 = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(buckets[-1][0]):
        self.encoder_inputs_1.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="encoder{0}_1".format(i)))
    for i in xrange(buckets[-1][1]):
        self.encoder_inputs_2.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="encoder{0}_2".format(i)))
    # One extra decoder step so that the shifted targets below have
    # buckets[-1][2] entries.
    for i in xrange(buckets[-1][2] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name="weight{0}".format(i)))
    self.encoder_mask_1 = tf.placeholder(
        tf.int32, shape=[None, None], name="encoder_mask_1")
    self.encoder_mask_2 = tf.placeholder(
        tf.int32, shape=[None, None], name="encoder_mask_2")

    # Our targets are decoder inputs shifted by one.
    targets = [self.decoder_inputs[i + 1]
               for i in xrange(len(self.decoder_inputs) - 1)]

    # Outputs and losses. The forward-only and training graphs differ only
    # in whether the decoder feeds back its own previous output. With beam
    # search the outputs are not projected again here.
    do_decode = bool(forward_only)
    self.outputs, self.losses, self.symbols = seq2seq_al.model_with_buckets(
        self.encoder_inputs_1,
        self.encoder_inputs_2,
        self.encoder_mask_1,
        self.encoder_mask_2,
        self.decoder_inputs,
        targets,
        self.target_weights,
        buckets,
        lambda x1, x2, y1, y2, z: seq2seq_f(x1, x2, y1, y2, z, do_decode),
        softmax_loss_function=softmax_loss_function)

    # Gradients and update operation for training the model.
    if not forward_only:
        params_to_update = tf.trainable_variables()
        self.gradient_norms = []
        self.gradient_norms_print = []
        self.updates = []
        opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        for bucket_id in xrange(len(buckets)):
            gradients = tf.gradients(
                self.losses[bucket_id],
                params_to_update,
                aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(
                    zip(clipped_gradients, params_to_update),
                    global_step=self.global_step))

    self.saver = tf.train.Saver(
        tf.all_variables(),
        max_to_keep=1000,
        keep_checkpoint_every_n_hours=6)
def __init__(self, is_training=False, hidden_units=128, num_layers=1,
             input_sequence_len=20, output_sequence_len=10,
             num_input_symbols=20, num_output_symbols=20,
             weight_amplitude=0.08, batch_size=32, peep=False):
    """Build an LSTM encoder-decoder over per-step float inputs.

    Creates one float32 placeholder per encoder step and per decoder step
    (the decoder gets output_sequence_len + 1 of them), an LSTM cell or a
    stack of them, a softmax layer on top of the decoder outputs and,
    when is_training is set, a cost and an Adam training op.

    Args:
      is_training: if true, build targets, cost, learning rate and
        train_op; otherwise the decoder is run with feed_previous=True.
      hidden_units: number of units of every LSTM layer.
      num_layers: number of stacked LSTM layers.
      input_sequence_len: number of encoder placeholders.
      output_sequence_len: number of decoder steps (one more placeholder
        than this is created).
      num_input_symbols: width of each encoder placeholder.
      num_output_symbols: width of each decoder placeholder and of the
        softmax layer.
      weight_amplitude: weights are initialised uniformly in
        [-weight_amplitude, weight_amplitude].
      batch_size: divisor used to normalise the cost.
      peep: whether the LSTM cells use peephole connections.
    """
    self.encoder_inputs = [
        tf.placeholder(tf.float32, shape=(None, num_input_symbols),
                       name="encoder_{0}".format(step))
        for step in range(input_sequence_len)
    ]
    # One more decoder placeholder than output steps: the targets used in
    # training are the decoder inputs from position 1 onwards.
    self.decoder_inputs = [
        tf.placeholder(tf.float32, shape=(None, num_output_symbols),
                       name="decoder_{0}".format(step))
        for step in range(output_sequence_len + 1)
    ]

    def uniform_init():
        # A fresh initializer object for every variable/cell.
        return tf.random_uniform_initializer(-weight_amplitude,
                                             weight_amplitude)

    if num_layers <= 1:
        self.cell = rnn_cell.LSTMCell(hidden_units, use_peepholes=peep,
                                      initializer=uniform_init())
    else:
        # The first layer reads the input symbols, every further layer
        # reads the previous layer's hidden state.
        layer_input_sizes = ([num_input_symbols]
                             + [hidden_units] * (num_layers - 1))
        stacked = [
            rnn_cell.LSTMCell(hidden_units, use_peepholes=peep,
                              input_size=width, initializer=uniform_init())
            for width in layer_input_sizes
        ]
        self.cell = rnn_cell.MultiRNNCell(stacked)

    self.w_softmax = tf.get_variable(
        'w_softmax', shape=(hidden_units, num_output_symbols),
        initializer=uniform_init())
    self.b_softmax = tf.get_variable(
        'b_softmax', shape=(num_output_symbols,),
        initializer=uniform_init())

    # decoder_outputs is a list of tensors with output_sequence_len:
    # [(batch_size x hidden_units)]
    feed_previous = not is_training
    decoder_outputs, _ = self._init_seq2seq(
        self.encoder_inputs, self.decoder_inputs, self.cell,
        feed_previous=feed_previous)

    output_logits = []
    for decoder_output in decoder_outputs:
        output_logits.append(
            tf.matmul(decoder_output, self.w_softmax) + self.b_softmax)
    self.output_probs = [tf.nn.softmax(logit) for logit in output_logits]

    # Everything below is only needed by a training model.
    if not is_training:
        return

    self.targets = self.decoder_inputs[1:]
    step_losses = []
    for logit, target in zip(output_logits, self.targets):
        step_losses.append(
            tf.nn.softmax_cross_entropy_with_logits(logit, target))
    total_loss = tf.reduce_sum(tf.add_n(step_losses))
    self.cost = total_loss / output_sequence_len / batch_size

    self.learning_rate = tf.Variable(DEFAULT_LEARNING_RATE, trainable=False)
    train_vars = tf.trainable_variables()
    grads = tf.gradients(self.cost, train_vars)
    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    self.train_op = optimizer.apply_gradients(zip(grads, train_vars))
def __init__(self,
             source_vocab_size,
             target_vocab_size,
             buckets,
             hidden_edim,
             hidden_units,
             num_layers,
             keep_prob,
             max_gradient_norm,
             batch_size,
             learning_rate,
             learning_rate_decay_factor,
             beam_size,
             forward_only=False):
    """Create the attention seq2seq model with beam-search support.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input
        length that will be processed in that bucket, and O specifies
        maximum output length. The last bucket is treated as the biggest
        one and determines how many placeholders are created.
      hidden_edim: number of dimensions for word embedding.
      hidden_units: number of hidden units for each layer.
      num_layers: number of layers in the model.
      keep_prob: keep probability used for dropout on the cell outputs;
        only applied when forward_only is not set.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training; it is
        only stored on the instance here.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when
        learning_rate_decay_op is run.
      beam_size: the beam size for beam search decoding.
      forward_only: if set, we do not construct the backward pass in the
        model.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    decayed_rate = self.learning_rate * learning_rate_decay_factor
    self.learning_rate_decay_op = self.learning_rate.assign(decayed_rate)
    self.global_step = tf.Variable(0, trainable=False)

    # Output projection applied before the softmax. The bias is a constant
    # zero that is excluded from training.
    proj_weights = tf.get_variable(
        "proj_w",
        [hidden_units // 2, self.target_vocab_size],
        initializer=tf.random_normal_initializer(0, 0.01, seed=123))
    proj_bias = tf.get_variable(
        "proj_b",
        [self.target_vocab_size],
        initializer=tf.constant_initializer(0.0),
        trainable=False)
    output_projection = (proj_weights, proj_bias)

    def softmax_loss_function(logit, target):
        # Loss of the seq2seq model: project, then sparse cross-entropy.
        projected = nn_ops.xw_plus_b(
            logit, output_projection[0], output_projection[1])
        flat_target = array_ops.reshape(target, [-1])
        return nn_ops.sparse_softmax_cross_entropy_with_logits(
            projected, flat_target)

    # Recurrent cell: GRU, optionally stacked, with output dropout when
    # the backward pass is going to be built.
    single_cell = rnn_cell.GRUCell(hidden_units)
    cell = single_cell
    if num_layers > 1:
        cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
    if not forward_only:
        cell = rnn_cell.DropoutWrapper(
            cell, output_keep_prob=float(keep_prob), seed=123)

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, encoder_mask, decoder_inputs, do_decode):
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            encoder_mask,
            decoder_inputs,
            cell,
            num_encoder_symbols=source_vocab_size,
            num_decoder_symbols=target_vocab_size,
            embedding_size=hidden_edim,
            beam_size=beam_size,
            output_projection=output_projection,
            num_layers=num_layers,
            feed_previous=do_decode)

    # Feeds for inputs; the last bucket is the biggest one.
    max_encoder_len, max_decoder_len = buckets[-1][0], buckets[-1][1]
    self.encoder_inputs = [
        tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i))
        for i in xrange(max_encoder_len)
    ]
    self.decoder_inputs = []
    self.target_weights = []
    # Decoder inputs and weights are created pairwise, step by step, with
    # one extra step so the shifted targets cover the whole bucket.
    for step in xrange(max_decoder_len + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder{0}".format(step)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name="weight{0}".format(step)))
    self.encoder_mask = tf.placeholder(
        tf.int32, shape=[None, None], name="encoder_mask")

    # Our targets are decoder inputs shifted by one.
    targets = []
    for step in xrange(len(self.decoder_inputs) - 1):
        targets.append(self.decoder_inputs[step + 1])

    # Outputs and losses: decoding feeds previous outputs back in,
    # training does not; everything else is shared.
    decode_flag = bool(forward_only)

    def bucket_seq2seq(x, y, z):
        return seq2seq_f(x, y, z, decode_flag)

    self.outputs, self.losses, self.symbols = seq2seq.model_with_buckets(
        self.encoder_inputs,
        self.encoder_mask,
        self.decoder_inputs,
        targets,
        self.target_weights,
        buckets,
        bucket_seq2seq,
        softmax_loss_function=softmax_loss_function)

    # Gradients and update operation for training the model.
    params_to_update = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.gradient_norms_print = []
        self.updates = []
        opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        for bucket_id in xrange(len(buckets)):
            gradients = tf.gradients(
                self.losses[bucket_id],
                params_to_update,
                aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            update_op = opt.apply_gradients(
                zip(clipped_gradients, params_to_update),
                global_step=self.global_step)
            self.updates.append(update_op)

    # Keep (up to 1000) recent checkpoints, plus one every six hours.
    self.saver = tf.train.Saver(
        tf.all_variables(),
        max_to_keep=1000,
        keep_checkpoint_every_n_hours=6)