def add_optimizer(self, global_step):
    """Attach the clipped Adam update and the EMA maintenance op to the graph.

    Reads ``self.loss``, ``self.variables``, ``self.ema`` and
    ``self._hparams``, so those must be set before this is called.
    Stores the unclipped gradients on ``self.gradients`` and the op to run
    for one training step on ``self.optimize``.

    Args:
        global_step: step counter handed to ``apply_gradients``.
    """
    hparams = self._hparams

    with tf.variable_scope('optimizer'):
        # Adam with a constant learning rate.
        adam = tf.train.AdamOptimizer(
            learning_rate=hparams.wavenet_learning_rate,
            beta1=hparams.wavenet_adam_beta1,
            beta2=hparams.wavenet_adam_beta2,
            epsilon=hparams.wavenet_adam_epsilon)

        grads_and_vars = adam.compute_gradients(self.loss)
        raw_grads, train_vars = zip(*grads_and_vars)
        self.gradients = raw_grads

        # Clip all gradients jointly to a global norm of 1.
        clipped_grads, _ = tf.clip_by_global_norm(raw_grads, 1.)

        # Run any registered UPDATE_OPS before the Adam step.
        pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(pending_updates):
            adam_step = adam.apply_gradients(
                zip(clipped_grads, train_vars), global_step=global_step)

    # Exponential moving average of the weights, see
    # https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
    # The EMA op depends on the Adam step, so running ``self.optimize``
    # performs the Adam update and then refreshes the shadow variables.
    with tf.control_dependencies([adam_step]):
        # Every variable in self.variables must be among the averaged ones.
        assert tuple(self.variables) == train_vars
        self.optimize = self.ema.apply(train_vars)
def add_training_ops(self, learning_rate: float = 1e-3, learning_rate_decay_factor: float = 0,
                     max_gradient_norm: float = 5.0, momentum: float = 0.9):
    """
    Add the ops for training

    Creates ``self.learning_rate`` (non-trainable variable) and
    ``self.learning_rate_decay_op``. When ``self.labels`` is not None it
    also creates ``self.cost`` (CTC loss), ``self.avg_loss`` and
    ``self.update`` (clipped Adam step that increments ``self.global_step``).

    Args:
        learning_rate: the initial learning rate
        learning_rate_decay_factor: the factor to multiply the learning rate with when it
            should be decreased (with the default of 0 the decay op sets the rate to 0)
        max_gradient_norm: the maximum global gradient norm; larger gradients are clipped
        momentum: the momentum parameter (not read by this method)
    """
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype=tf.float32,
                                     name='learning_rate')
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)

    # Variable summaries
    tf.summary.scalar('learning_rate', self.learning_rate)

    # Define loss and optimizer; skipped when no labels were provided.
    if self.labels is not None:
        with tf.name_scope('training'):
            # The sequence lengths passed to the CTC loss are halved here.
            self.cost = tf.nn.ctc_loss(self.labels, self.logits, self.sequence_lengths // 2)
            self.avg_loss = tf.reduce_mean(self.cost, name='average_loss')
            tf.summary.scalar('loss', self.avg_loss)

            optimizer = tf.train.AdamOptimizer(self.learning_rate, epsilon=1e-3)
            gradients, trainables = zip(*optimizer.compute_gradients(self.avg_loss))
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm,
                                                          name='clip_gradients')
            self.update = optimizer.apply_gradients(zip(clipped_gradients, trainables),
                                                    global_step=self.global_step,
                                                    name='apply_gradients')
def __init__(self, config):
    """Build a bidirectional-LSTM binary classifier graph and its training op.

    Reads sent_len, batch_size, vocab_size, embed_size, num_layers,
    state_size, keep_prob and max_grad_norm from ``config``. Exposes the
    placeholders ``input_data``, ``lengths`` and ``targets`` plus
    ``probabilities``, ``cost``, ``lr``, ``grad_norm`` and ``train_op``.
    """
    sent_len = config.sent_len
    batch_size = config.batch_size
    vocab_size = config.vocab_size
    embed_size = config.embed_size
    num_layers = config.num_layers
    state_size = config.state_size
    keep_prob = config.keep_prob

    self.input_data = tf.placeholder(tf.int32, [batch_size, sent_len])
    self.lengths = tf.placeholder(tf.int64, [batch_size])
    self.targets = tf.placeholder(tf.float32, [batch_size, 1])

    # The embedding table and lookup are pinned to the CPU.
    with tf.device("/cpu:0"):
        embeding = tf.get_variable("embeding", [vocab_size, embed_size])
        inputs = tf.nn.embedding_lookup(embeding, self.input_data)

    # LSTM 1 -> encode the embedded token sequence into a fixed dense representation
    with tf.variable_scope("rnn1", reuse=None):
        cell = rnn_cell.LSTMCell(state_size, input_size=embed_size, initializer=tf.contrib.layers.xavier_initializer())
        back_cell = rnn_cell.LSTMCell(state_size, input_size=embed_size, initializer=tf.contrib.layers.xavier_initializer())
        # Dropout on both the inputs and outputs of each direction.
        cell = rnn_cell.DropoutWrapper(
            cell, input_keep_prob=keep_prob, output_keep_prob=keep_prob)
        back_cell = rnn_cell.DropoutWrapper(
            back_cell, input_keep_prob=keep_prob, output_keep_prob=keep_prob)
        # NOTE(review): [cell] * num_layers repeats the same cell object in every layer.
        cell = rnn_cell.MultiRNNCell([cell] * num_layers)
        backcell = rnn_cell.MultiRNNCell([back_cell] * num_layers)

        # Split along axis 1 into sent_len per-step tensors (legacy tf.split argument order).
        rnn_splits = [tf.squeeze(input_, [1]) for input_ in tf.split(1, sent_len, inputs)]

        # Run the bidirectional rnn
        outputs, last_fw_state, last_bw_state = rnn.bidirectional_rnn(
            cell, backcell, rnn_splits,
            sequence_length=self.lengths,
            dtype=tf.float32)

    # The classifier input is the concatenated final forward/backward states.
    sent_out = tf.concat(1, [last_fw_state, last_bw_state])
    # Alternatives that were tried:
    #sent_out = outputs[-1]
    #sent_out = tf.add_n(outputs)
    # presumably 2 directions x (c, h) state halves of width state_size - TODO confirm for num_layers > 1
    output_size = state_size*4

    with tf.variable_scope("linear", reuse=None):
        w = tf.get_variable("w", [output_size, 1])
        b = tf.get_variable("b", [1], initializer=tf.constant_initializer(0.0))
        raw_logits = tf.matmul(sent_out, w) + b
    self.probabilities = tf.sigmoid(raw_logits)
    self.cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(raw_logits, self.targets))

    # Calculate gradients and propagate
    # Aggregation method 2 is really important for rnn per the tensorflow issues list
    # (presumably tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N - TODO confirm)
    tvars = tf.trainable_variables()
    # NOTE(review): self.lr is meant to be overwritten via assign, but it is not passed
    # to the optimizer below, which is constructed with its default learning rate.
    self.lr = tf.Variable(0.0, trainable=False) #Assign to overwrite
    optimizer = tf.train.AdamOptimizer()
    grads, _vars = zip(*optimizer.compute_gradients(self.cost, tvars, aggregation_method=2))
    grads, self.grad_norm = tf.clip_by_global_norm(grads, config.max_grad_norm)
    self.train_op = optimizer.apply_gradients(zip(grads, _vars))
def _add_shared_train_op(self):
    """Create ``self._shared_train_op``, the Adagrad training step.

    The loss depends on the hyper-parameters: the reinforce losses when
    ``rl_training`` or ``ac_training`` is set, the pointer-generator
    losses otherwise; in either case the coverage variant replaces the
    base loss when ``coverage`` is set.
    """
    hps = self._hps
    use_rl = hps.rl_training or hps.ac_training

    # Pick the base loss first, then swap in the coverage variant if enabled.
    if use_rl:
        objective = self._reinforce_shared_loss
    else:
        objective = self._pgen_loss
    if hps.coverage:
        if use_rl:
            objective = self._reinforce_cov_total_loss
        else:
            objective = self._pointer_cov_total_loss

    train_vars = tf.trainable_variables()
    raw_grads = tf.gradients(
        objective, train_vars,
        aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)

    # Clipping and the parameter update are both placed on the configured GPU.
    with tf.device("/gpu:{}".format(hps.gpu_num)):
        clipped, grad_norm = tf.clip_by_global_norm(raw_grads, hps.max_grad_norm)

    # Track the pre-clipping global norm.
    tf.summary.scalar('global_norm', grad_norm)

    adagrad = tf.train.AdagradOptimizer(
        hps.lr, initial_accumulator_value=hps.adagrad_init_acc)
    with tf.device("/gpu:{}".format(hps.gpu_num)):
        self._shared_train_op = adagrad.apply_gradients(
            zip(clipped, train_vars),
            global_step=self.global_step,
            name='train_step')
def __init__(self, is_training, config):
    """Build a summed-embedding two-class classifier graph.

    Token embeddings are summed over axis 1 and fed to a 2-way softmax
    layer. When ``is_training`` is false, construction stops after the
    cost; otherwise a clipped SGD training op is added.

    Args:
        is_training: whether to add the learning-rate variable and train op.
        config: provides batch_size, hidden_size, max_len, vocab_size and
            max_grad_norm.
    """
    batch_size = config.batch_size
    self.batch_size = batch_size
    hidden = config.hidden_size
    self.max_len = config.max_len
    n_words = config.vocab_size

    self._input_data = tf.placeholder(tf.int32, [batch_size, config.max_len])
    self._targets = tf.placeholder(tf.int32, [batch_size])

    embedding = tf.get_variable("embedding", [n_words, hidden])
    embedded = tf.nn.embedding_lookup(embedding, self._input_data)
    # Sum the embeddings over axis 1.
    pooled = tf.reduce_sum(embedded, 1)

    softmax_w = tf.get_variable("softmax_w", [hidden, 2])
    softmax_b = tf.get_variable("softmax_b", [2])
    logits = tf.matmul(pooled, softmax_w) + softmax_b
    self._prediction = tf.nn.softmax(logits)

    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, self._targets)
    cost = tf.reduce_sum(per_example) / batch_size
    self._cost = cost

    if is_training:
        self._lr = tf.Variable(0.0, trainable=False)
        train_vars = tf.trainable_variables()
        clipped, _ = tf.clip_by_global_norm(
            tf.gradients(cost, train_vars), config.max_grad_norm)
        # self.lr is not defined in this block; presumably an accessor for self._lr.
        sgd = tf.train.GradientDescentOptimizer(self.lr)
        self._train_op = sgd.apply_gradients(zip(clipped, train_vars))
def __init__(self):
    """Set fixed hyper-parameters, build the decoder loss and train op, start a session.

    ``self.decoder()`` is expected to return the decoder activations;
    ``self.d_w2`` / ``self.d_b2`` are read afterwards (presumably created
    inside ``decoder()`` - verify there).
    """
    # Hyper-parameters.
    self.batchsize = 32
    self.vocabsize = (10 * 1000) + 2
    self.word_embed_size = 300
    self.sentence_length = 30
    self.dropout_prob = 10
    self.num_layers = 1
    self.decoder_hidden_size = 500
    self.max_gradient_norm = 5.0
    self.sentence_embed_size = 500

    token_shape = [self.batchsize, self.sentence_length]
    self.sentences_in = tf.placeholder(tf.int32, token_shape)
    self.sentences_in_decoded = tf.placeholder(tf.int32, token_shape)

    self.d = self.decoder()

    # Flatten batch and time so each position is one sampled-softmax example.
    n_positions = self.batchsize * self.sentence_length
    flat_labels = tf.reshape(self.sentences_in, [n_positions, 1])
    flat_hidden = tf.reshape(self.d, [n_positions, self.decoder_hidden_size])
    sampled_loss = tf.nn.sampled_softmax_loss(
        tf.transpose(self.d_w2), self.d_b2, flat_hidden, flat_labels,
        512, self.vocabsize)

    # Sum the loss over each sentence, then average over the batch.
    per_sentence = tf.reshape(sampled_loss, [self.batchsize, self.sentence_length])
    self.generation_loss = tf.reduce_sum(per_sentence, reduction_indices=1)
    self.cost = tf.reduce_mean(self.generation_loss)

    train_vars = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, train_vars), self.max_gradient_norm)
    self.optim = tf.train.AdamOptimizer(0.0001)
    self.update = self.optim.apply_gradients(zip(clipped, train_vars))

    self.sess = tf.Session()
    self.sess.run(tf.initialize_all_variables())
def optimizer(someloss):
    """Build an Adam training op with gradients clipped to a global norm of 1.25.

    Args:
        someloss: loss tensor to minimise.

    Returns:
        The op that applies the clipped gradients and increments an
        internally created step counter.
    """
    # The step counter must be non-trainable; otherwise it is added to the
    # trainable-variables collection that compute_gradients() reads.
    global_step = tf.Variable(0, trainable=False)
    adam = tf.train.AdamOptimizer(learning_rate=0.001)
    gradients, variables = zip(*adam.compute_gradients(someloss))
    clipped, _ = tf.clip_by_global_norm(gradients, 1.25)
    return adam.apply_gradients(zip(clipped, variables), global_step=global_step)
def create_critic_train_op(hparams, critic_loss, global_step):
    """Create Discriminator train op.

    Args:
        hparams: provides ``critic_learning_rate``.
        critic_loss: loss tensor to minimise.
        global_step: step counter incremented by the returned op.

    Returns:
        Tuple ``(critic_train_op, critic_grads_clipped, critic_vars)``.

    Raises:
        NotImplementedError: if ``FLAGS.critic_update_dis_vars`` is set and
            ``FLAGS.discriminator_model`` is not one of the supported models.
    """
    with tf.name_scope('train_critic'):
        critic_optimizer = tf.train.AdamOptimizer(hparams.critic_learning_rate)
        output_vars = [
            v for v in tf.trainable_variables() if v.op.name.startswith('critic')
        ]

        if FLAGS.critic_update_dis_vars:
            # Also train the discriminator RNN variables of the selected model.
            if FLAGS.discriminator_model == 'bidirectional_vd':
                dis_prefix = 'dis/rnn'
            elif FLAGS.discriminator_model == 'seq2seq_vd':
                dis_prefix = 'dis/decoder/rnn/multi_rnn_cell'
            else:
                # Previously this fell through and failed with an unbound
                # local variable; fail with an explicit message instead.
                raise NotImplementedError(
                    'critic_update_dis_vars is not supported for '
                    'discriminator_model=%r' % FLAGS.discriminator_model)
            critic_vars = [
                v for v in tf.trainable_variables()
                if v.op.name.startswith(dis_prefix)
            ]
            critic_vars.extend(output_vars)
        else:
            critic_vars = output_vars

        print('\nOptimizing Critic vars:')
        for v in critic_vars:
            print(v)

        critic_grads = tf.gradients(critic_loss, critic_vars)
        critic_grads_clipped, _ = tf.clip_by_global_norm(critic_grads,
                                                         FLAGS.grad_clipping)
        critic_train_op = critic_optimizer.apply_gradients(
            zip(critic_grads_clipped, critic_vars), global_step=global_step)
        return critic_train_op, critic_grads_clipped, critic_vars
def create_gen_train_op(hparams, learning_rate, gen_loss, global_step, mode):
    """Build the generator's training op.

    Args:
        hparams: unused (deleted immediately).
        learning_rate: learning rate handed to the optimizer.
        gen_loss: generator objective tensor.
        global_step: step counter incremented by the returned op.
        mode: 'MINIMIZE' to descend on ``gen_loss``, 'MAXIMIZE' to descend
            on its negation.

    Returns:
        Tuple ``(gen_train_op, gen_grads_clipped, gen_vars)``.

    Raises:
        NotImplementedError: if ``FLAGS.generator_optimizer`` is neither
            'sgd' nor 'adam'.
        ValueError: if ``mode`` is not one of the two supported values.
    """
    del hparams
    with tf.name_scope('train_generator'):
        optimizer_name = FLAGS.generator_optimizer
        if optimizer_name == 'sgd':
            gen_optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        elif optimizer_name == 'adam':
            gen_optimizer = tf.train.AdamOptimizer(learning_rate)
        else:
            raise NotImplementedError

        # Only variables whose op name starts with 'gen' are trained.
        gen_vars = []
        for candidate in tf.trainable_variables():
            if candidate.op.name.startswith('gen'):
                gen_vars.append(candidate)

        print('Optimizing Generator vars.')
        for var in gen_vars:
            print(var)

        if mode not in ('MINIMIZE', 'MAXIMIZE'):
            raise ValueError("Must be one of 'MINIMIZE' or 'MAXIMIZE'")
        # Maximising is implemented as minimising the negated loss.
        objective = gen_loss if mode == 'MINIMIZE' else -gen_loss
        gen_grads = tf.gradients(objective, gen_vars)

        gen_grads_clipped, _ = tf.clip_by_global_norm(gen_grads,
                                                      FLAGS.grad_clipping)
        gen_train_op = gen_optimizer.apply_gradients(
            zip(gen_grads_clipped, gen_vars), global_step=global_step)
        return gen_train_op, gen_grads_clipped, gen_vars
def create_optimizer(cost,learning_rate):
    """Return an Adam step over all trainable variables, clipped to global norm 5.

    Args:
        cost: loss tensor to minimise.
        learning_rate: learning rate passed to Adam.
    """
    adam = tf.train.AdamOptimizer(learning_rate)
    max_norm = 5.
    train_vars = tf.trainable_variables()
    raw_grads = tf.gradients(cost, train_vars)
    clipped, _ = tf.clip_by_global_norm(raw_grads, max_norm)
    return adam.apply_gradients(zip(clipped, train_vars))
def __init__(self, loss, global_step, optimizer, learning_rate, clip_gradients=5.0):
    """Build a trainer part of graph.

    Args:
        loss: Tensor that evaluates to model's loss.
        global_step: Tensor with global step of the model.
        optimizer: Name of the optimizer class (SGD, Adam, Adagrad) or class.
        learning_rate: Initial value of the "learning_rate" variable.
        clip_gradients: Maximum global gradient norm; values <= 0 disable
            clipping (``self.gradients_norm`` is then not set).
    """
    self.loss = loss
    self.global_step = global_step
    self._learning_rate = tf.get_variable(
        "learning_rate",
        [],
        initializer=tf.constant_initializer(learning_rate))
    params = tf.trainable_variables()
    self.gradients = tf.gradients(loss, params)
    if clip_gradients > 0.0:
        self.gradients, self.gradients_norm = tf.clip_by_global_norm(
            self.gradients, clip_gradients)
    grads_and_vars = zip(self.gradients, params)
    # Both branches use the variable created above; the callable branch
    # previously read self.learning_rate, which this method never sets.
    if isinstance(optimizer, str):
        self._optimizer = OPTIMIZER_CLS_NAMES[optimizer](self._learning_rate)
    else:
        self._optimizer = optimizer(self._learning_rate)
    self.trainer = self._optimizer.apply_gradients(grads_and_vars,
                                                   global_step=global_step,
                                                   name="train")
    # Initializer op for all variables created so far.
    self._initializers = tf.initialize_all_variables()
def build_training(self):
    """Create ``self.global_step`` and ``self.train_op`` (Adam with global-norm clipping).

    Reads ``self.learning_rate``, ``self.loss`` and ``self.clip_norm``;
    also registers a scalar summary of the global gradient norm.
    """
    print(' Building training')

    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    adam = tf.train.AdamOptimizer(self.learning_rate)

    # Clip by global norm: the correct but slower variant. The faster
    # tf.clip_by_norm() may be worth trying (see the tf.clip_by_global_norm()
    # documentation for the trade-off).
    raw_grads, train_vars = zip(*adam.compute_gradients(self.loss))
    clipped, grad_norm = tf.clip_by_global_norm(raw_grads, self.clip_norm)

    # TensorBoard scalar for the global gradient norm.
    tf.scalar_summary('train/global gradient norm', grad_norm)

    # Training op that applies the clipped gradients.
    self.train_op = adam.apply_gradients(
        zip(clipped, train_vars), global_step=self.global_step)
def build_rmsprop_optimizer(self, learning_rate, rmsprop_decay, rmsprop_constant, gradient_clip, version):
    """Build the training op for ``self.loss``.

    Args:
        learning_rate: step size of the underlying gradient-descent optimizer.
        rmsprop_decay: decay of the running gradient averages
            ('graves_rmsprop' only).
        rmsprop_constant: constant added under the square root
            ('graves_rmsprop' only).
        gradient_clip: maximum global gradient norm; values <= 0 disable clipping.
        version: 'rmsprop' applies the (clipped) gradients directly with
            gradient descent; 'graves_rmsprop' rescales each gradient by
            sqrt(avg(g^2) - avg(g)^2 + rmsprop_constant) first.

    Returns:
        The training op, or None when ``version`` is neither of the above.
    """
    with tf.name_scope('rmsprop'):
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)

        grads_and_vars = optimizer.compute_gradients(self.loss)
        grads = [grad for grad, _ in grads_and_vars]
        params = [var for _, var in grads_and_vars]

        if gradient_clip > 0:
            # clip_by_global_norm returns (clipped_list, global_norm); only
            # the list may be zipped with params below.
            grads, _ = tf.clip_by_global_norm(grads, gradient_clip)

        if version == 'rmsprop':
            return optimizer.apply_gradients(zip(grads, params))
        elif version == 'graves_rmsprop':
            # Running averages of g and g^2, both initialised to ones.
            avg_grads = [tf.Variable(tf.ones(var.get_shape())) for var in params]
            avg_square_grads = [tf.Variable(tf.ones(var.get_shape())) for var in params]

            update_avg_grads = [
                avg.assign((rmsprop_decay * avg) + ((1 - rmsprop_decay) * grad))
                for avg, grad in zip(avg_grads, grads)]
            update_avg_square_grads = [
                avg_sq.assign((rmsprop_decay * avg_sq) + ((1 - rmsprop_decay) * tf.square(grad)))
                for avg_sq, grad in zip(avg_square_grads, grads)]
            avg_grad_updates = update_avg_grads + update_avg_square_grads

            rms = [tf.sqrt(avg_sq - tf.square(avg) + rmsprop_constant)
                   for avg, avg_sq in zip(avg_grads, avg_square_grads)]
            rms_updates = [grad / scale for grad, scale in zip(grads, rms)]

            train = optimizer.apply_gradients(zip(rms_updates, params))
            # One op that applies the step and refreshes both running averages.
            return tf.group(train, tf.group(*avg_grad_updates))
def train_neural_network():
    """Train the network for 50 epochs, checkpointing every 7th epoch.

    Relies on module-level names (``neural_network``, ``input_data``,
    ``output_targets``, ``words``, ``x_batches``, ``y_batches``,
    ``n_chunk``). Prints ``epoch, batch index, loss`` after every batch.
    """
    logits, last_state, _, _, _ = neural_network()
    flat_targets = tf.reshape(output_targets, [-1])
    weights = tf.ones_like(flat_targets, dtype=tf.float32)
    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
        [logits], [flat_targets], [weights], len(words))
    cost = tf.reduce_mean(loss)  # average value

    learning_rate = tf.Variable(0.0, trainable=False)
    train_vars = tf.trainable_variables()
    # If the weights change too violently in a single iteration the loss
    # easily diverges; gradient clipping keeps the updates in a sane range.
    clipped, _ = tf.clip_by_global_norm(tf.gradients(cost, train_vars), 5)
    adam = tf.train.AdamOptimizer(learning_rate)
    train_op = adam.apply_gradients(zip(clipped, train_vars))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())

        for epoch in range(50):
            # Exponentially decayed learning rate, reassigned every epoch.
            # NOTE(review): tf.assign is called inside the loop, so it is
            # invoked once per epoch.
            sess.run(tf.assign(learning_rate, 0.002 * (0.97 ** epoch)))
            for batche in range(n_chunk):
                feed = {input_data: x_batches[batche],
                        output_targets: y_batches[batche]}
                train_loss, _, _ = sess.run([cost, last_state, train_op],
                                            feed_dict=feed)
                print(epoch, batche, train_loss)
            if epoch % 7 == 0:
                saver.save(sess, 'poetry.module', global_step=epoch)
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Apply gradients with momentum and run the YellowFin tuning ops.

    Args:
        grads_and_vars: (gradient, variable) pairs as returned by
            compute_gradients(); pairs whose gradient is None are dropped.
        global_step: optional variable incremented once the variables have
            been updated.
        name: optional name for the momentum update op.

    Returns:
        A grouped op containing the momentum update, the YellowFin
        variable preparation, the YellowFin tuning op and the step
        increment.
    """
    usable_pairs = [(grad, var) for grad, var in grads_and_vars
                    if grad is not None]
    self._grad, self._vars = zip(*usable_pairs)

    # Variable update with momentum, optionally preceded by clipping.
    with tf.variable_scope("apply_updates"):
        if self._clip_thresh_var is not None:
            self._grad, _ = tf.clip_by_global_norm(
                self._grad, self._clip_thresh_var)
        apply_grad_op = self._momentum_optimizer.apply_gradients(
            zip(self._grad, self._vars),
            global_step=global_step,
            name=name)

    # Learning-rate / momentum tuning starts here.
    with tf.variable_scope("prepare_yellowFin_variables"):
        # Ideally this would only wait for clipping to finish (i.e. depend
        # on the gradients), but control_dependencies does not support the
        # IndexedSlices of sparse gradients. Depending on the whole update
        # op may be slightly slower because less can run in parallel.
        with tf.control_dependencies([apply_grad_op]):
            prepare_variables_op = self._prepare_variables()

    with tf.variable_scope("yellowfin"):
        with tf.control_dependencies([prepare_variables_op]):
            yellowfin_op = self._yellowfin()

    # Advance the YellowFin step counter after the tuning op.
    with tf.control_dependencies([yellowfin_op]):
        self._increment_step_op = tf.assign_add(self._step, 1).op

    all_ops = (apply_grad_op, prepare_variables_op, yellowfin_op,
               self._increment_step_op)
    return tf.group(*all_ops)
def clip_by_global_norm_summary(t_list, clip_norm, norm_name, variables):
    """Clip ``t_list`` by global norm and emit scalar summaries of the norms.

    One summary per variable is created before clipping and one after,
    plus a summary named ``norm_name`` for the global norm returned by
    the clipping op.

    Returns:
        Tuple ``(clipped_t_list, tf_norm, summary_ops)``.
    """
    def _individual_norms(tensors):
        # global_norm over a one-element list handles both dense tensors
        # and IndexedSlices.
        return [tf.global_norm([tensor]) for tensor in tensors]

    def _summaries(prefix, norms):
        # ':' in variable names is replaced by '_' in the summary tag.
        return [tf.summary.scalar(prefix + var.name.replace(":", "_"), norm)
                for norm, var in zip(norms, variables)]

    summary_ops = _summaries('norm_pre_clip/', _individual_norms(t_list))

    clipped_t_list, tf_norm = tf.clip_by_global_norm(t_list, clip_norm)

    summary_ops.extend(
        _summaries('norm_post_clip/', _individual_norms(clipped_t_list)))
    summary_ops.append(tf.summary.scalar(norm_name, tf_norm))

    return clipped_t_list, tf_norm, summary_ops
def training_ops(self, loss):
    """Return the op that applies gradients of ``loss`` clipped to global norm 5.0.

    The optimizer comes from ``self.get_optimizer()``; the returned op
    also increments ``self.global_step``.
    """
    optimizer = self.get_optimizer()
    train_vars = tf.trainable_variables()
    raw_grads = tf.gradients(loss, train_vars)
    clipped, _ = tf.clip_by_global_norm(raw_grads, 5.0)
    grads_and_vars = zip(clipped, train_vars)
    return optimizer.apply_gradients(grads_and_vars, global_step=self.global_step)
def train_op(loss, global_step, hparams):
    """Build an Adagrad training op with global-norm gradient clipping.

    Args:
        loss: loss tensor to minimize.
        global_step: step variable, incremented by the training op and used
            by the decay schedule.
        hparams: provides exponentially_decay_learning_rate,
            initial_learning_rate, decay_steps, decay_rate and clip_norm.

    Returns:
        training_op: op that performs one weight update.
        learning_rate: the staircase exponential-decay tensor when
            ``hparams.exponentially_decay_learning_rate`` is set, otherwise
            a non-trainable variable holding the initial learning rate.
    """
    if not hparams.exponentially_decay_learning_rate:
        learning_rate = tf.Variable(hparams.initial_learning_rate,
                                    trainable=False)
    else:
        learning_rate = tf.train.exponential_decay(
            hparams.initial_learning_rate,
            global_step,
            hparams.decay_steps,
            hparams.decay_rate,
            staircase=True,
            name='learning_rate')

    adagrad = tf.train.AdagradOptimizer(learning_rate)
    train_vars = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(
        tf.gradients(loss, train_vars), hparams.clip_norm)
    training_op = adagrad.apply_gradients(
        zip(clipped, train_vars), global_step=global_step)

    return training_op, learning_rate
def make_train_op(loss, ema_decay=None, prefix=None):
    """Build the COCOB training op, optionally followed by an EMA update.

    Args:
        loss: scalar loss tensor; collected regularization losses are added.
        ema_decay: if truthy, decay for an exponential moving average that
            is updated after every optimizer step.
        prefix: if truthy, only variables whose name starts with this
            prefix are averaged.

    Returns:
        Tuple ``(training_op, glob_norm, ema)``; ``ema`` is None when
        averaging is disabled.
    """
    optimizer = COCOB()
    glob_step = tf.train.get_global_step()

    # Add regularization losses. The collection is a Python list, so it must
    # be summed to a scalar first: adding the list directly would broadcast
    # `loss` into a vector with one entry per regularizer.
    reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    if reg_losses:
        total_loss = loss + tf.add_n(reg_losses)
    else:
        total_loss = loss

    # Clip gradients
    grads_and_vars = optimizer.compute_gradients(total_loss)
    gradients, variables = zip(*grads_and_vars)
    clipped_gradients, glob_norm = tf.clip_by_global_norm(gradients, GRAD_CLIP_THRESHOLD)
    sgd_op = optimizer.apply_gradients(zip(clipped_gradients, variables))

    # Apply SGD averaging
    if ema_decay:
        ema = tf.train.ExponentialMovingAverage(decay=ema_decay, num_updates=glob_step)
        if prefix:
            # Some magic to handle multiple models trained in single graph
            ema_vars = [var for var in variables if var.name.startswith(prefix)]
        else:
            ema_vars = variables
        # Create the averaging op under the dependency so that it is ordered
        # after the optimizer step and sees the updated weights.
        with tf.control_dependencies([sgd_op]):
            training_op = ema.apply(ema_vars)
    else:
        training_op = sgd_op
        ema = None
    return training_op, glob_norm, ema
def _update_network(self, trainer):
    """Define the actor-critic loss and the op pushing local gradients to the global net.

    Creates the ``actions``, ``target_v`` and ``advantages`` placeholders,
    the loss terms, and ``self.apply_grads``, which applies gradients of
    this scope's variables (clipped to global norm 40) to the variables
    collected under the 'global' scope via ``trainer``.
    """
    self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
    self.actions_onehot = tf.one_hot(
        self.actions, self.a_dim, dtype=tf.float32)
    self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
    self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)

    # Policy output selected by the one-hot action mask.
    chosen = self.policy * self.actions_onehot
    self.outputs = tf.reduce_sum(chosen, [1])

    # Loss terms.
    value_error = self.target_v - tf.reshape(self.value, [-1])
    self.value_loss = 0.5 * tf.reduce_sum(tf.square(value_error))
    # Higher entropy -> lower loss -> encourages exploration.
    self.entropy = -tf.reduce_sum(self.policy * tf.log(self.policy))
    self.policy_loss = -tf.reduce_sum(
        tf.log(self.outputs) * self.advantages)
    self.loss = (0.5 * self.value_loss
                 + self.policy_loss
                 - 0.01 * self.entropy)

    # Gradients with respect to this scope's own trainable variables.
    own_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
    self.gradients = tf.gradients(self.loss, own_vars)
    self.var_norms = tf.global_norm(own_vars)
    # Clipping: grads[i] * clip_norm / max(global_norm, clip_norm)
    clipped, self.grad_norms = tf.clip_by_global_norm(self.gradients, 40.0)

    # Apply the local gradients to the variables of the 'global' scope.
    shared_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
    self.apply_grads = trainer.apply_gradients(zip(clipped, shared_vars))
def add_train_op(self, lr_method, lr, loss, clip=-1):
    """Create ``self.train_op``, which performs one update on a batch.

    Args:
        lr_method: (string) optimizer name, case-insensitive: "adam",
            "adagrad", "sgd" or "rmsprop"
        lr: learning rate passed to the optimizer
        loss: loss tensor to minimize
        clip: (python float) global-norm gradient clipping threshold;
            no clipping when <= 0

    Raises:
        NotImplementedError: for an unknown ``lr_method``.
    """
    method = lr_method.lower()  # normalise the case

    with tf.variable_scope("train_step"):
        optimizer_classes = {
            'adam': tf.train.AdamOptimizer,
            'adagrad': tf.train.AdagradOptimizer,
            'sgd': tf.train.GradientDescentOptimizer,
            'rmsprop': tf.train.RMSPropOptimizer,
        }
        if method not in optimizer_classes:
            raise NotImplementedError("Unknown method {}".format(method))
        optimizer = optimizer_classes[method](lr)

        if clip <= 0:
            self.train_op = optimizer.minimize(loss)
        else:
            # Clip by global norm only when a positive threshold is given.
            raw_grads, train_vars = zip(*optimizer.compute_gradients(loss))
            clipped, _ = tf.clip_by_global_norm(raw_grads, clip)
            self.train_op = optimizer.apply_gradients(zip(clipped, train_vars))
def __init__(self, length_batch, features_batch, labels_batch):
    """Build an LSTM classifier over 26 classes with a masked loss and accuracy.

    Positions where ``sign(label)`` is 0 contribute to neither the mean
    loss nor the accuracy.

    Args:
        length_batch: sequence lengths handed to ``dynamic_rnn``.
        features_batch: batch-major input features.
        labels_batch: integer labels per position.
    """
    n_classes = 26
    n_hidden = 128

    self.labels_flat = tf.reshape(labels_batch, [-1])
    self.labels_one_hot = tf.one_hot(labels_batch, n_classes)
    self.labels_one_hot_flat = tf.reshape(self.labels_one_hot, [-1, n_classes])

    self.lstm = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)
    self.lstm_outputs, _ = tf.nn.dynamic_rnn(
        self.lstm,
        features_batch,
        sequence_length=length_batch,
        time_major=False,
        dtype=tf.float32)
    self.flat_lstm_outputs = tf.reshape(self.lstm_outputs, [-1, n_hidden])
    self.outputs = tflearn.fully_connected(self.flat_lstm_outputs, n_classes)

    # Mask out padding.
    self.losses = tf.nn.softmax_cross_entropy_with_logits(
        self.outputs, self.labels_one_hot_flat)
    self.mask = tf.to_float(tf.sign(self.labels_flat))
    self.masked_losses = self.mask * self.losses
    self.mean_loss = tf.reduce_sum(
        self.masked_losses / tf.reduce_sum(self.mask))

    self.predictions = tf.argmax(self.outputs, 1)
    self.accurate = tf.equal(self.predictions, self.labels_flat)
    correct = tf.to_float(self.accurate) * self.mask
    self.accuracy = tf.reduce_sum(correct) / tf.reduce_sum(self.mask)

    # Plain SGD (learning rate 0.1) on gradients clipped to global norm 5.
    train_vars = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(
        tf.gradients(self.mean_loss, train_vars), 5.0)
    sgd = tf.train.GradientDescentOptimizer(0.1)
    self.train = sgd.apply_gradients(zip(clipped, train_vars))
def make_train_op(local_net, global_net):
    """Return an op that updates the global network with the local network's gradients.

    The optimizer expects (gradient, variable) pairs. The gradients are
    taken from ``local_net.grads_and_vars`` and the variables from
    ``global_net.grads_and_vars``, paired by position, so the result is
    [(local_g1, global_v1), (local_g2, global_v2), ...].
    """
    # Keep only the gradient half of the local pairs.
    worker_grads, _ = zip(*local_net.grads_and_vars)
    # Clip to a global norm of 5 to avoid very large updates.
    worker_grads, _ = tf.clip_by_global_norm(worker_grads, 5.0)

    # Keep only the variable half of the global pairs.
    _, shared_vars = zip(*global_net.grads_and_vars)

    pairs = list(zip(worker_grads, shared_vars))
    # One optimizer step on the global variables.
    step = tf.train.get_global_step()
    return global_net.optimizer.apply_gradients(pairs, global_step=step)
def defineTensorGradientDescent(self):
    """Create the learning-rate variable and the clipped Adam training op.

    Gradients of ``self.cost`` over all trainable variables are clipped
    to ``self.config.max_grad_norm`` before being applied.
    """
    self._learningRate = tf.Variable(0.0, trainable=False)

    variables = tf.trainable_variables()
    rawGradients = tf.gradients(self.cost, variables)
    clippedGradients, _ = tf.clip_by_global_norm(rawGradients, self.config.max_grad_norm)

    # self.learningRate is not defined in this block; presumably an
    # accessor for self._learningRate.
    adam = tf.train.AdamOptimizer(self.learningRate)
    self._tensorGradientDescentTrainingOperation = adam.apply_gradients(
        zip(clippedGradients, variables))
def __init__(self, vocab_size, size, num_layers, max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, dropout, forward_only=False):
    """Build the encoder-decoder graph and, unless ``forward_only``, its training op.

    Args:
        vocab_size, size, num_layers, batch_size: stored on the instance.
        max_gradient_norm: global-norm clipping threshold.
        learning_rate: initial value of the learning-rate variable.
        learning_rate_decay_factor: multiplier applied by
            ``self.learning_rate_decay_op``.
        dropout: drop probability; ``self.keep_prob`` is ``1 - dropout``.
        forward_only: when true, no gradient or update ops are created.
    """
    self.size = size
    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.num_layers = num_layers
    self.keep_prob = 1.0 - dropout

    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    decayed_rate = self.learning_rate * learning_rate_decay_factor
    self.learning_rate_decay_op = self.learning_rate.assign(decayed_rate)
    self.global_step = tf.Variable(0, trainable=False)

    # Token and mask placeholders, all of shape [None, batch_size].
    feed_shape = [None, self.batch_size]
    self.source_tokens = tf.placeholder(tf.int32, shape=feed_shape, name="source_tokens")
    self.target_tokens = tf.placeholder(tf.int32, shape=feed_shape, name="target_tokens")
    self.source_mask = tf.placeholder(tf.int32, shape=feed_shape, name="source_mask")
    self.target_mask = tf.placeholder(tf.int32, shape=feed_shape, name="target_mask")

    # Lengths are the mask sums over axis 0.
    self.source_length = tf.reduce_sum(self.source_mask, reduction_indices=0)
    self.target_length = tf.reduce_sum(self.target_mask, reduction_indices=0)

    for build_stage in (self.setup_embeddings, self.setup_encoder,
                        self.setup_decoder, self.setup_loss):
        build_stage()

    train_vars = tf.trainable_variables()
    if not forward_only:
        adam = tf.train.AdamOptimizer(self.learning_rate)
        raw_grads = tf.gradients(self.losses, train_vars)
        clipped, grad_norm = tf.clip_by_global_norm(raw_grads, max_gradient_norm)
        self.gradient_norms = grad_norm
        self.updates = adam.apply_gradients(
            zip(clipped, train_vars), global_step=self.global_step)

    self.saver = tf.train.Saver(tf.all_variables())
def fit(self, data_function):
    """Build a stacked-LSTM model in a fresh graph and train it for ``self.n_step`` steps.

    Args:
        data_function: object whose ``train`` attribute provides ``X``
            (a 3-D array), ``next_batch(batch_size)`` and
            ``epochs_completed``.
    """
    with tf.Graph().as_default(), tf.Session() as sess:
        n, s, p = data_function.train.X.shape

        X_pl = tf.placeholder(tf.float32, [self.batch_size, s, p])
        Y_pl = tf.placeholder(tf.float32, [self.batch_size, p])

        lstm_cell = rnn_cell.BasicLSTMCell(self.hidden_size)
        cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * self.num_layers)
        # One input tensor per index along axis 1 of the placeholder.
        step_inputs = [X_pl[:,i,:] for i in xrange(s)]
        outputs, _ = rnn.rnn(cell, step_inputs, dtype = tf.float32)

        softmax_w = tf.get_variable("softmax_w", [self.hidden_size, p])
        softmax_b = tf.get_variable("softmax_b", [p])
        # Only the last step's output feeds the output layer.
        logits = tf.matmul(outputs[-1], softmax_w) + softmax_b
        loss = loss_dict['ce'](logits, Y_pl)

        tvars = tf.trainable_variables()
        print([i.get_shape() for i in tvars])
        raw_grads = tf.gradients(loss, tvars)
        grads, _ = tf.clip_by_global_norm(raw_grads, self.max_grad_norm)
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.apply_gradients(zip(grads, tvars))

        # NOTE(review): this initializer is created but never attached to
        # any variable or scope, so init_scale has no effect here.
        initializer = tf.random_uniform_initializer(-self.init_scale,
                                                    self.init_scale)
        tf.initialize_all_variables().run()

        for i in xrange(self.n_step):
            batch_xs, batch_ys = data_function.train.next_batch(
                self.batch_size)
            _, loss_value = sess.run(
                [train_op, loss],
                feed_dict = {X_pl: batch_xs, Y_pl: batch_ys})
            # Report progress every 100 steps.
            if i % 100 == 0:
                PrintMessage(data_function.train.epochs_completed,
                             loss_value , 0, 0)
def training(hypes, loss, global_step, learning_rate, opt=None):
    """Sets up the training Ops.

    Creates an optimizer (unless one is passed in), optionally clips the
    gradients by global norm and applies them. The Op returned by this
    function is what must be passed to the `sess.run()` call to cause the
    model to train. Also stores `global_step` under hypes['tensors'] and
    the optimizer under hypes['opt'].

    Args:
      hypes: Hyper-parameter dict; reads hypes['solver'] ('opt' plus
        'epsilon' / 'adam_eps') and hypes['clip_norm'] (<= 0 disables
        clipping).
      loss: Dict of loss tensors; loss['total_loss'] is minimised.
      global_step: Integer Variable counting the number of training steps
        processed.
      learning_rate: The learning rate to use for gradient descent.
      opt: Optional pre-built optimizer; when None one is created from
        hypes['solver']['opt'] ('RMS', 'Adam' or 'SGD').

    Returns:
      train_op: The Op for training.

    Raises:
      ValueError: if hypes['solver']['opt'] is not a recognised name.
    """
    sol = hypes["solver"]
    hypes['tensors'] = {}
    hypes['tensors']['global_step'] = global_step
    total_loss = loss['total_loss']
    with tf.name_scope('training'):

        if opt is None:
            if sol['opt'] == 'RMS':
                opt = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                                decay=0.9,
                                                epsilon=sol['epsilon'])
            elif sol['opt'] == 'Adam':
                opt = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                             epsilon=sol['adam_eps'])
            elif sol['opt'] == 'SGD':
                opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
            else:
                raise ValueError('Unrecognized opt type')

        hypes['opt'] = opt

        grads_and_vars = opt.compute_gradients(total_loss)

        if hypes['clip_norm'] > 0:
            grads, tvars = zip(*grads_and_vars)
            clipped_grads, _ = tf.clip_by_global_norm(grads, hypes["clip_norm"])
            # Materialise the pairs: a bare zip() is a one-shot iterator on
            # Python 3.
            grads_and_vars = list(zip(clipped_grads, tvars))

        # Apply the gradients exactly once, after any pending UPDATE_OPS.
        # An earlier, discarded apply_gradients() call has been removed: it
        # added dead ops and consumed the zip iterator before this call.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = opt.apply_gradients(grads_and_vars,
                                           global_step=global_step)

    return train_op
def __init__(self, model, optimizer, learning_rate, clip_gradients=5.0):
    """Build a trainer part of graph.

    Args:
        model: Model object, that has loss and global_step attributes.
        optimizer: Name of the optimizer class (SGD, Adam, Adagrad) or class.
        learning_rate: Initial value of the "learning_rate" variable.
        clip_gradients: Maximum global gradient norm; values <= 0 disable
            clipping (``self.gradients_norm`` is then not set).
    """
    self.model = model
    self._learning_rate = tf.get_variable(
        "learning_rate",
        [],
        initializer=tf.constant_initializer(learning_rate))
    params = tf.trainable_variables()
    self.gradients = tf.gradients(model.loss, params)
    if clip_gradients > 0.0:
        self.gradients, self.gradients_norm = tf.clip_by_global_norm(
            self.gradients, clip_gradients)
    grads_and_vars = zip(self.gradients, params)
    # Both branches use the variable created above; the callable branch
    # previously read self.learning_rate, which this method never sets.
    if isinstance(optimizer, str):
        self._optimizer = OPTIMIZER_CLS_NAMES[optimizer](self._learning_rate)
    else:
        self._optimizer = optimizer(self._learning_rate)
    self.trainer = self._optimizer.apply_gradients(grads_and_vars,
                                                   global_step=model.global_step,
                                                   name="train")
    # Initializer op for all variables created so far.
    self._initializers = tf.initialize_all_variables()
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
    """Build the training op: linear-decay LR, optional warmup, clipped AdamW.

    Args:
      loss: Scalar tensor to minimise.
      init_lr: Peak learning rate (Python float).
      num_train_steps: Step count over which the rate decays linearly to 0.
      num_warmup_steps: Number of linear warmup steps; falsy disables warmup.
      use_tpu: When true, wrap the optimizer in a CrossShardOptimizer.

    Returns:
      An op that applies one optimisation step and increments the global step.
    """
    global_step = tf.train.get_or_create_global_step()

    # Linear decay from init_lr down to zero over num_train_steps.
    base_lr = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
    learning_rate = tf.train.polynomial_decay(
        base_lr,
        global_step,
        num_train_steps,
        end_learning_rate=0.0,
        power=1.0,
        cycle=False)

    # Linear warmup: while global_step < num_warmup_steps the rate is
    # `global_step / num_warmup_steps * init_lr`.
    if num_warmup_steps:
        step_int = tf.cast(global_step, tf.int32)
        warmup_int = tf.constant(num_warmup_steps, dtype=tf.int32)

        step_float = tf.cast(step_int, tf.float32)
        warmup_float = tf.cast(warmup_int, tf.float32)

        fraction_done = step_float / warmup_float
        warmup_lr = init_lr * fraction_done

        # 1.0 during warmup, 0.0 afterwards; blends the two schedules.
        in_warmup = tf.cast(step_int < warmup_int, tf.float32)
        learning_rate = (
            (1.0 - in_warmup) * learning_rate + in_warmup * warmup_lr)

    # This optimizer is recommended for fine tuning because it is how the
    # model was trained (the Adam m/v variables are NOT loaded from
    # init_checkpoint).
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    if use_tpu:
        optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    trainable = tf.trainable_variables()
    raw_grads = tf.gradients(loss, trainable)

    # Global-norm clipping at 1.0, matching pre-training.
    clipped_grads, _ = tf.clip_by_global_norm(raw_grads, clip_norm=1.0)

    apply_op = optimizer.apply_gradients(
        zip(clipped_grads, trainable), global_step=global_step)

    # `AdamWeightDecayOptimizer` does not bump the global step inside
    # `apply_gradients`, so do it here. Remove this if a different
    # optimizer is substituted.
    next_step = global_step + 1
    return tf.group(apply_op, [global_step.assign(next_step)])
def get_graph(self, tensor_input=0):
    """Build the LSTM classifier graph and its training op.

    Creates the cell and classifier variables, runs `self.lstm_cell` once on
    `tensor_input`, and attaches a softmax cross-entropy loss, the softmax
    predictions and a clipped gradient-descent update.

    Sets `self.wCells`, `self.bCells`, `self.train_labels`, `self.logits`,
    `self.loss`, `self.train_prediction`, `self.learning_rate` and
    `self.optimizer` (the op that applies one training step).

    Args:
      tensor_input: Input passed straight to `self.lstm_cell`.
    """
    #
    # Define variables
    #  - weights & bias on cells
    #  - memory of previous cell's values
    #  - output classifier
    #
    self.wCells = tf.Variable(
        tf.truncated_normal(
            [self.nbInputs + self.nbCells, self.nbCells * 4], -0.1, 0.1))
    self.bCells = tf.Variable(tf.zeros([1, self.nbCells * 4]))
    saved_output = tf.Variable(
        tf.truncated_normal([self.batchSize, self.nbCells], -0.1, 0.1),
        trainable=False)
    saved_state = tf.Variable(
        tf.truncated_normal([self.batchSize, self.nbCells], -0.1, 0.1),
        trainable=False)
    wClassif = tf.Variable(
        tf.truncated_normal([self.nbCells, self.nbOutputs], -0.1, 0.1))
    bClassif = tf.Variable(tf.zeros([self.nbOutputs]))
    self.train_labels = tf.placeholder(tf.float32,
                                       shape=[1, self.nbOutputs])

    # A single LSTM step on the provided input. Earlier revisions unrolled
    # over `self.nbFrames` placeholders and wrote the new output/state back
    # into the saved variables; both are currently disabled.
    saved_output, saved_state = self.lstm_cell(tensor_input, saved_output,
                                               saved_state)

    # Classifier.
    self.logits = tf.nn.xw_plus_b(saved_output, wClassif, bClassif)
    # Named arguments are required: TF >= 1.0 rejects positional use, and
    # the keyword names are also accepted by earlier releases.
    self.loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits, labels=self.train_labels))

    # Predictions.
    self.train_prediction = tf.nn.softmax(self.logits)

    # Optimizer. The step counter is bookkeeping, not a model parameter.
    global_step = tf.Variable(0, trainable=False)
    self.learning_rate = tf.train.exponential_decay(
        50.0, global_step, 5000, 0.8, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
    gradients, variables = zip(*optimizer.compute_gradients(self.loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    # `self.optimizer` ends up holding the training op, as before.
    self.optimizer = optimizer.apply_gradients(
        zip(gradients, variables), global_step=global_step)
def __init__(self, num_emb, batch_size, emb_dim, hidden_dim, sequence_length, start_token, good_id, pos, learning_rate=0.01, reward_gamma=0.95):
    """Build the generator graph: generation loops, losses and update ops.

    Four `tf.while_loop` passes share the same recurrent and output units:
      * a greedy pass seeded with the first token of `self.x`
        (`self.gen_o`, `self.gen_x`),
      * a greedy pass seeded with the start token
        (`self.gen_o_argmax`, `self.gen_x_argmax`),
      * a free-running pass that records the softmax at every step
        (`self.g_predictions_wgan`),
      * a pass fed with the embedded `self.x` tokens
        (`self.g_predictions`), used by both losses.

    Args:
      num_emb: First dimension of the embedding matrix and depth of the
        one-hot encodings.
      batch_size: Batch size baked into the placeholders and constants.
      emb_dim: Second dimension of the embedding matrix.
      hidden_dim: Size of each of the two stacked zero tensors in `self.h0`.
      sequence_length: Number of steps in every loop.
      start_token: Value repeated `batch_size` times to form
        `self.start_token`.
      good_id: Multiplied by `batch_size` and stored as an int32 constant
        (presumably a list that gets repeated; confirm with the caller).
      pos: Forwarded to `self.init_matrix_embedding`.
      learning_rate: Initial value of the non-trainable learning-rate
        variable.
      reward_gamma: Stored as `self.reward_gamma`; not used in this method.
    """
    self.num_emb = num_emb
    self.batch_size = batch_size
    self.emb_dim = emb_dim
    self.hidden_dim = hidden_dim
    self.sequence_length = sequence_length
    self.start_token = tf.constant([start_token] * self.batch_size, dtype=tf.int32)
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.reward_gamma = reward_gamma
    self.g_params = []
    self.d_params = []
    self.good_id = tf.constant(good_id * self.batch_size, dtype=tf.int32)
    self.temperature = 2
    self.grad_clip = 5.0
    self.pos = pos

    self.expected_reward = tf.Variable(tf.zeros([self.sequence_length]))

    # Generator parameters: the embedding plus whatever the two factory
    # methods append to self.g_params.
    with tf.variable_scope('generator'):
        self.g_embeddings = tf.Variable(
            self.init_matrix_embedding([self.num_emb, self.emb_dim], self.pos))
        self.g_params.append(self.g_embeddings)
        self.g_recurrent_unit = self.create_recurrent_unit(
            self.g_params)  # maps h_tm1 to h_t for generator
        self.g_output_unit = self.create_output_unit(
            self.g_params)  # maps h_t to o_t (output token logits)

    # placeholder definition
    self.x = tf.placeholder(tf.int32, shape=[self.batch_size, self.sequence_length])  # sequence of indices of true data, not including start token
    self.rewards = tf.placeholder(
        tf.float32, shape=[self.batch_size, self.sequence_length])  # get from rollout policy and discriminator

    # processed for batch
    # NOTE(review): tf.split(dim, num, value), tf.pack and tf.mul are the
    # pre-1.0 TensorFlow spellings.
    with tf.device("/cpu:0"):
        inputs = tf.split(
            1, self.sequence_length, tf.nn.embedding_lookup(self.g_embeddings, self.x))
        self.processed_x = tf.pack([
            tf.squeeze(input_, [1]) for input_ in inputs
        ])  # seq_length x batch_size x emb_dim

    # Same time-major layout, but for the raw token ids.
    with tf.device("/cpu:0"):
        inputs = tf.split(1, self.sequence_length, self.x)
        self.processed_token_x = tf.pack(
            [tf.squeeze(input_, [1]) for input_ in inputs])

    # Initial state: two stacked zero tensors.
    self.h0 = tf.zeros([self.batch_size, self.hidden_dim])
    self.h0 = tf.pack([self.h0, self.h0])

    gen_o = tensor_array_ops.TensorArray(dtype=tf.float32, size=self.sequence_length, dynamic_size=False, infer_shape=True)
    gen_x = tensor_array_ops.TensorArray(dtype=tf.int32, size=self.sequence_length, dynamic_size=False, infer_shape=True)

    ta_emb_x2 = tensor_array_ops.TensorArray(dtype=tf.int32, size=self.sequence_length)
    ta_emb_x2 = ta_emb_x2.unpack(self.processed_token_x)

    # Step 0 is handled outside the loop: the token written at position 0
    # is read from the input sequence rather than produced by the model.
    x_t = tf.nn.embedding_lookup(self.g_embeddings, self.start_token)
    h_t1 = self.g_recurrent_unit(x_t, self.h0)  # hidden_memory_tuple
    o_t = self.g_output_unit(h_t1)  # batch x vocab , logits not prob
    # log_prob is only referenced by the commented-out sampling line below.
    log_prob = tf.divide(tf.log(tf.nn.softmax(o_t)), self.temperature)
    next_token = tf.cast(tf.reshape(ta_emb_x2.read(0), [self.batch_size]), tf.int32)
    #next_token = tf.cast(tf.reshape(tf.multinomial(log_prob, 1), [self.batch_size]), tf.int32)
    x_tp1 = tf.nn.embedding_lookup(self.g_embeddings, next_token)  # batch x emb_dim
    gen_o = gen_o.write(
        tf.constant(0, dtype=tf.int32), tf.reduce_sum(
            tf.mul(tf.one_hot(next_token, self.num_emb, 1.0, 0.0), tf.nn.softmax(o_t)), 1))  # [batch_size] , prob
    gen_x = gen_x.write(tf.constant(0, dtype=tf.int32), next_token)  # indices, batch_size

    #random sampling:
    ''' x_t = tf.nn.embedding_lookup(self.g_embeddings,self.start_token) h_t1 = self.g_recurrent_unit(x_t, self.h0) # hidden_memory_tuple o_t = self.g_output_unit(h_t1) # batch x vocab , logits not prob next_token = tf.multinomial(tf.nn.softmax(o_t),1) next_token = tf.cast(tf.reshape(tf.multinomial(tf.nn.softmax(o_t),1),[self.batch_size]),tf.int32) x_tp1 = tf.nn.embedding_lookup(self.g_embeddings, next_token) # batch x emb_dim gen_o = gen_o.write(tf.constant(0,dtype=tf.int32), tf.reduce_sum(tf.mul(tf.one_hot(next_token, self.num_emb, 1.0, 0.0),tf.nn.softmax(o_t)), 1)) # [batch_size] , prob gen_x = gen_x.write(tf.constant(0,dtype=tf.int32), next_token) # indices, batch_size '''

    # Greedy continuation: steps 1..sequence_length-1 pick the argmax token.
    def _g_recurrence(i, x_t, h_tm1, gen_o, gen_x):
        h_t = self.g_recurrent_unit(x_t, h_tm1)  # hidden_memory_tuple
        o_t = self.g_output_unit(h_t)  # batch x vocab , logits not prob
        # NOTE(review): debug print; runs when the loop body is traced.
        print('now here')
        # NOTE(review): the first assignment is immediately overwritten.
        next_token = tf.argmax(tf.nn.softmax(o_t), 1)
        next_token = tf.cast(
            tf.reshape(tf.argmax(tf.nn.softmax(o_t), 1), [self.batch_size]), tf.int32)
        x_tp1 = tf.nn.embedding_lookup(self.g_embeddings, next_token)
        gen_o = gen_o.write(
            i, tf.reduce_sum(
                tf.mul(tf.one_hot(next_token, self.num_emb, 1.0, 0.0), tf.nn.softmax(o_t)), 1))  # [batch_size] , prob
        gen_x = gen_x.write(i, next_token)  # indices, batch_size
        return i + 1, x_tp1, h_t, gen_o, gen_x

    # The loop counter starts at 1 because position 0 was written above.
    _, _, _, self.gen_o, self.gen_x = tf.while_loop(
        cond=lambda i, _1, _2, _3, _4: i < self.sequence_length,
        body=_g_recurrence,
        loop_vars=(tf.constant(1, dtype=tf.int32), x_tp1, h_t1, gen_o, gen_x))

    self.gen_x = self.gen_x.pack()  # seq_length x batch_size
    self.gen_x = tf.transpose(self.gen_x, perm=[1, 0])  # batch_size x seq_length

    # Fresh zero state for the fully greedy pass.
    self.h0 = tf.zeros([self.batch_size, self.hidden_dim])
    self.h0 = tf.pack([self.h0, self.h0])

    gen_o_argmax = tensor_array_ops.TensorArray(dtype=tf.float32, size=self.sequence_length, dynamic_size=False, infer_shape=True)
    gen_x_argmax = tensor_array_ops.TensorArray(dtype=tf.int32, size=self.sequence_length, dynamic_size=False, infer_shape=True)

    # Same body as _g_recurrence, but the loop below starts at step 0 from
    # the start token.
    def _g_recurrence_argmax(i, x_t, h_tm1, gen_o, gen_x):
        h_t = self.g_recurrent_unit(x_t, h_tm1)  # hidden_memory_tuple
        o_t = self.g_output_unit(h_t)  # batch x vocab , logits not prob
        # NOTE(review): the first assignment is immediately overwritten.
        next_token = tf.argmax(tf.nn.softmax(o_t), 1)
        next_token = tf.cast(
            tf.reshape(tf.argmax(tf.nn.softmax(o_t), 1), [self.batch_size]), tf.int32)
        x_tp1 = tf.nn.embedding_lookup(self.g_embeddings, next_token)  # batch x emb_dim
        # NOTE(review): debug print of the symbolic tensor.
        print(x_tp1)
        gen_o = gen_o.write(
            i, tf.reduce_sum(
                tf.mul(tf.one_hot(next_token, self.num_emb, 1.0, 0.0), tf.nn.softmax(o_t)), 1))  # [batch_size] , prob
        gen_x = gen_x.write(i, next_token)  # indices, batch_size
        return i + 1, x_tp1, h_t, gen_o, gen_x

    _, _, _, self.gen_o_argmax, self.gen_x_argmax = tf.while_loop(
        cond=lambda i, _1, _2, _3, _4: i < self.sequence_length,
        body=_g_recurrence_argmax,
        loop_vars=(tf.constant(0, dtype=tf.int32),
                   tf.nn.embedding_lookup(self.g_embeddings, self.start_token), self.h0,
                   gen_o_argmax, gen_x_argmax))

    self.gen_x_argmax = self.gen_x_argmax.pack()  # seq_length x batch_size
    self.gen_x_argmax = tf.transpose(self.gen_x_argmax, perm=[1, 0])  # batch_size x seq_length

    ###################################################################
    # wgan pretraining for generator
    g_predictions_wgan = tensor_array_ops.TensorArray(
        dtype=tf.float32, size=self.sequence_length,
        dynamic_size=False, infer_shape=True)

    # Free-running pass: feeds back its own argmax and stores the full
    # softmax distribution at every step.
    def _wgantrain_recurrence(i, x_t, h_tm1, g_predictions_wgan):
        h_t = self.g_recurrent_unit(x_t, h_tm1)
        o_t = self.g_output_unit(h_t)
        g_predictions_wgan = g_predictions_wgan.write(
            i, tf.nn.softmax(o_t))  # batch x vocab_size
        next_token = tf.argmax(tf.nn.softmax(o_t), 1)
        x_tp1 = tf.nn.embedding_lookup(self.g_embeddings, next_token)
        #x_tp1 = ta_emb_x.read(i)
        return i + 1, x_tp1, h_t, g_predictions_wgan

    _, _, _, self.g_predictions_wgan = tf.while_loop(
        cond=lambda i, _1, _2, _3: i < self.sequence_length,
        body=_wgantrain_recurrence,
        loop_vars=(tf.constant(0, dtype=tf.int32),
                   tf.nn.embedding_lookup(self.g_embeddings, self.start_token),
                   self.h0, g_predictions_wgan))

    self.g_predictions_wgan = tf.transpose(
        self.g_predictions_wgan.pack(), perm=[1, 0, 2])  # batch_size x seq_length x vocab_size

    ###################################################################
    # supervised pretraining for generator
    g_predictions = tensor_array_ops.TensorArray(dtype=tf.float32, size=self.sequence_length, dynamic_size=False, infer_shape=True)

    ta_emb_x = tensor_array_ops.TensorArray(dtype=tf.float32, size=self.sequence_length)
    ta_emb_x = ta_emb_x.unpack(self.processed_x)

    # Input-fed pass: the next input is the embedded token from self.x.
    def _pretrain_recurrence(i, x_t, h_tm1, g_predictions):
        h_t = self.g_recurrent_unit(x_t, h_tm1)
        o_t = self.g_output_unit(h_t)
        g_predictions = g_predictions.write(
            i, tf.nn.softmax(o_t))  # batch x vocab_size
        x_tp1 = ta_emb_x.read(i)
        return i + 1, x_tp1, h_t, g_predictions

    _, _, _, self.g_predictions = tf.while_loop(
        cond=lambda i, _1, _2, _3: i < self.sequence_length,
        body=_pretrain_recurrence,
        loop_vars=(tf.constant(0, dtype=tf.int32),
                   tf.nn.embedding_lookup(self.g_embeddings, self.start_token),
                   self.h0, g_predictions))

    self.g_predictions = tf.transpose(
        self.g_predictions.pack(), perm=[1, 0, 2])  # batch_size x seq_length x vocab_size

    # pretraining loss
    # Cross-entropy of the one-hot targets against the clipped predictions,
    # averaged over sequence_length * batch_size.
    self.pretrain_loss = -tf.reduce_sum(
        tf.one_hot(tf.to_int32(tf.reshape(
            self.x, [-1])), self.num_emb, 1.0, 0.0) * tf.log(
                tf.clip_by_value(
                    tf.reshape(self.g_predictions, [-1, self.num_emb]), 1e-20, 1.0))) / (self.sequence_length * self.batch_size)

    # training updates
    pretrain_opt = self.g_optimizer(self.learning_rate)

    self.pretrain_grad, _ = tf.clip_by_global_norm(
        tf.gradients(self.pretrain_loss, self.g_params), self.grad_clip)
    self.pretrain_updates = pretrain_opt.apply_gradients(
        zip(self.pretrain_grad, self.g_params))

    #######################################################################################################
    #  Unsupervised Training
    #######################################################################################################
    # Per-position log-likelihood of self.x weighted by self.rewards and
    # summed (not averaged).
    self.g_loss = -tf.reduce_sum(
        tf.reduce_sum(
            tf.one_hot(tf.to_int32(tf.reshape(
                self.x, [-1])), self.num_emb, 1.0, 0.0) * tf.log(
                    tf.clip_by_value(
                        tf.reshape(self.g_predictions, [-1, self.num_emb]), 1e-20, 1.0)), 1) * tf.reshape(self.rewards, [-1]))

    g_opt = self.g_optimizer(self.learning_rate)

    self.g_grad, _ = tf.clip_by_global_norm(
        tf.gradients(self.g_loss, self.g_params), self.grad_clip)
    self.g_updates = g_opt.apply_gradients(zip(self.g_grad, self.g_params))
def __init__(self, batchloader, is_training=True, without_label=False, ru=False):
    """Build the encoder / discriminator / sampler / decoder graph.

    Also defines the losses, the TensorBoard summaries and, in training
    mode, the optimisation op.

    Args:
      batchloader: Forwarded to the decoder.
      is_training: When true, latent variables come from the sampler and an
        optimizer is built; otherwise latent variables are a placeholder.
      without_label: When true, labels are taken from the discriminator's
        sampled one-hot output and the loss uses `self.kld2` instead of the
        supervised discriminator loss.
      ru: Forwarded to the encoder and decoder.
    """
    self.batchloader = batchloader
    self.ru = ru
    self.is_training = is_training
    self.without_label = without_label

    self.lr = tf.placeholder(tf.float32, shape=(), name="learning_rate")
    self.gumbel_temperature = tf.placeholder(tf.float32, shape=(),
                                             name="gumbel_temperature")

    with tf.name_scope("Placeholders"):
        self.encoder_input = tf.placeholder(
            tf.int64, shape=(FLAGS.BATCH_SIZE, FLAGS.SEQ_LEN),
            name="encoder_input")
        self.decoder_input = tf.placeholder(
            tf.int64, shape=(FLAGS.BATCH_SIZE, FLAGS.SEQ_LEN),
            name="decoder_input")
        self.target = tf.placeholder(
            tf.int64, shape=(FLAGS.BATCH_SIZE, FLAGS.SEQ_LEN),
            name="target")

        # Time-major views, sliced below into one tensor per step.
        encoder_input_t = tf.transpose(self.encoder_input, perm=[1, 0])
        self.encoder_input_list = []
        decoder_input_t = tf.transpose(self.decoder_input, perm=[1, 0])
        self.decoder_input_list = []
        target_t = tf.transpose(self.target, perm=[1, 0])
        self.target_list = []

        self.step = tf.placeholder(tf.float32, shape=(), name="step")

        # `(N,)` is the one-element shape; a bare `(N)` is just the int N.
        batch_shape = (FLAGS.BATCH_SIZE,)
        for i in range(FLAGS.SEQ_LEN):
            self.encoder_input_list.append(encoder_input_t[i])
            assert self.encoder_input_list[i].shape == batch_shape

            self.decoder_input_list.append(decoder_input_t[i])
            assert self.decoder_input_list[i].shape == batch_shape

            self.target_list.append(target_t[i])
            assert self.target_list[i].shape == batch_shape

        if not without_label:
            self.label = tf.placeholder(tf.int64, shape=batch_shape,
                                        name="label")
            self.label_onehot = tf.one_hot(self.label, FLAGS.LABEL_CLASS,
                                           name="label_onehot")
            assert self.label_onehot.shape == (FLAGS.BATCH_SIZE,
                                               FLAGS.LABEL_CLASS)

    with tf.variable_scope("Embedding"):
        self.embedding = tf.get_variable(
            name="embedding",
            shape=[FLAGS.VOCAB_SIZE, FLAGS.EMBED_SIZE],
            dtype=tf.float32,
            initializer=tf.random_normal_initializer(stddev=0.1))

    with tf.variable_scope("Encoder"):
        self.encoder = Encoder[FLAGS.ENCODER_NAME](
            self.embedding, self.encoder_input_list,
            is_training=self.is_training, ru=self.ru)

    with tf.variable_scope("Discriminator"):
        self.discriminator = Discriminator(self.encoder.encoder_rnn_output,
                                           self.gumbel_temperature)
        if self.without_label:
            # No ground truth: use the discriminator's sampled label.
            self.label_onehot = \
                self.discriminator.discriminator_sampling_onehot
            assert self.label_onehot.shape == (FLAGS.BATCH_SIZE,
                                               FLAGS.LABEL_CLASS)

    with tf.name_scope("Latent_variables"):
        self.sampler = Sampler(self.encoder.encoder_rnn_output,
                               self.label_onehot,
                               is_training=self.is_training)
        if self.is_training:
            self.latent_variables = self.sampler.latent_variables
        else:
            self.latent_variables = tf.placeholder(
                tf.float32,
                shape=(FLAGS.BATCH_SIZE, FLAGS.LATENT_VARIABLE_SIZE),
                name="latent_variables_input")

    with tf.variable_scope("Decoder"):
        self.decoder = Decoder[FLAGS.DECODER_NAME](
            self.decoder_input, self.latent_variables, self.label_onehot,
            self.embedding, self.batchloader,
            is_training=self.is_training, ru=self.ru)

    with tf.name_scope("Loss"):
        # Float literal keeps the uniform prior at 1/K even where `/`
        # performs integer division.
        uniform_prob = 1.0 / FLAGS.LABEL_CLASS

        if not self.without_label:
            discriminator_correct = tf.equal(
                self.discriminator.discriminator_predict, self.label)
            self.discriminator_accuracy = tf.reduce_mean(
                tf.cast(discriminator_correct, tf.float32))
            self.discriminator_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.discriminator.discriminator_logits,
                    labels=self.label,
                    name="labeled_discriminator_cross_entropy")
            ) * FLAGS.SEQ_LEN
        else:
            # Divergence between the discriminator output and a uniform
            # label distribution.
            true_y = tf.fill([FLAGS.BATCH_SIZE, FLAGS.LABEL_CLASS],
                             uniform_prob, name="true_y_distribution")
            self.kld2 = tf.reduce_mean(
                tf.reduce_sum(
                    self.discriminator.discriminator_prob *
                    (tf.log(self.discriminator.discriminator_prob + 1e-6) -
                     tf.log(true_y)),
                    axis=1))

        self.logits = self.decoder.logits

        self.kld = tf.reduce_mean(
            -0.5 * tf.reduce_sum(
                self.sampler.logvar - tf.square(self.sampler.mu) -
                tf.exp(self.sampler.logvar) + 1,
                axis=1))

        # Linear annealing of the KLD weight, clipped to [0, 1].
        self.kld_weight = tf.clip_by_value(
            FLAGS.INIT_KLD_WEIGHT + (1 - FLAGS.INIT_KLD_WEIGHT) *
            (self.step - FLAGS.KLD_ANNEAL_START) /
            (FLAGS.KLD_ANNEAL_END - FLAGS.KLD_ANNEAL_START),
            0, 1)

        reconst_losses = [
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=targets)
            for logits, targets in zip(self.logits, self.target_list)
        ]
        self.reconst_loss = tf.reduce_mean(reconst_losses) * FLAGS.SEQ_LEN

        if not self.without_label:
            self.loss = self.reconst_loss + self.kld_weight * self.kld \
                + tf.log(uniform_prob) + self.discriminator_loss
        else:
            self.loss = self.reconst_loss + self.kld_weight * self.kld \
                + self.kld2

    with tf.name_scope("Summary"):
        if self.is_training and not self.without_label:
            reconst_loss_summary = tf.summary.scalar(
                "labeled_reconst_loss", self.reconst_loss,
                family="train_loss")
            kld_summary = tf.summary.scalar("labeled_kld", self.kld,
                                            family="kld")
            disc_loss_summary = tf.summary.scalar(
                "labeled_disc_train_loss", self.discriminator_loss,
                family="disc_loss")
            disc_acc_summary = tf.summary.scalar(
                "labeled_disc_train_acc", self.discriminator_accuracy,
                family="disc_acc")
            kld_weight_summary = tf.summary.scalar(
                "kld_weight", self.kld_weight, family="parameters")
            mu_summary = tf.summary.histogram(
                "labeled_mu", tf.reduce_mean(self.sampler.mu, 0))
            var_summary = tf.summary.histogram(
                "labeled_var",
                tf.reduce_mean(tf.exp(self.sampler.logvar), 0))
            lr_summary = tf.summary.scalar("lr", self.lr,
                                           family="parameters")

            self.merged_summary = tf.summary.merge([
                reconst_loss_summary, kld_summary, disc_loss_summary,
                disc_acc_summary, kld_weight_summary, mu_summary,
                var_summary, lr_summary
            ])
        elif self.is_training and self.without_label:
            reconst_loss_summary = tf.summary.scalar(
                "unlabeled_reconst_loss", self.reconst_loss,
                family="train_loss")
            kld_summary = tf.summary.scalar("unlabeled_kld", self.kld,
                                            family="kld")
            gumbel_summary = tf.summary.scalar(
                "gumbel_temperature", self.gumbel_temperature,
                family="parameters")
            kld2_summary = tf.summary.scalar("unlabeled_kld2", self.kld2,
                                             family="kld")
            mu_summary = tf.summary.histogram(
                "unlabeled_mu", tf.reduce_mean(self.sampler.mu, 0))
            var_summary = tf.summary.histogram(
                "unlabeled_var",
                tf.reduce_mean(tf.exp(self.sampler.logvar), 0))

            self.merged_summary = tf.summary.merge([
                reconst_loss_summary, kld_summary, gumbel_summary,
                kld2_summary, mu_summary, var_summary
            ])
        else:
            valid_summaries = [
                tf.summary.scalar("valid_reconst_loss", self.reconst_loss,
                                  family="valid_loss")
            ]
            # The discriminator loss/accuracy only exist when labels are
            # fed; without this guard an unlabeled validation model fails
            # on the missing attributes.
            if not self.without_label:
                valid_summaries.append(
                    tf.summary.scalar("disc_valid_loss",
                                      self.discriminator_loss,
                                      family="disc_loss"))
                valid_summaries.append(
                    tf.summary.scalar("disc_valid_acc",
                                      self.discriminator_accuracy,
                                      family="disc_acc"))

            self.merged_summary = tf.summary.merge(valid_summaries)

    if self.is_training:
        with tf.name_scope("Optimizer"):
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(
                tf.gradients(self.loss, tvars), FLAGS.MAX_GRAD)
            optimizer = tf.train.AdamOptimizer(self.lr, beta1=0.5)
            self.train_op = optimizer.apply_gradients(zip(grads, tvars))
# Third single-unit prediction head; the three heads are concatenated
# along axis 1 into one logits tensor.
y_logits2 = dense(1, name='predictions2')(x_de2)
y_logits = tf.concat([y_logits0, y_logits1, y_logits2], 1)

# Total number of non-zero entries in the mask `m`; used to normalise both
# the loss and the accuracy.
# NOTE(review): an all-zero mask makes this 0 and the divisions below
# undefined; confirm callers never feed one.
n_samples = tf.reduce_sum(tf.count_nonzero(m, axis=1, dtype=tf.float32))

with tf.variable_scope('loss'):
    masked_cross_entropy = masked_sigmoid_cross_entropy_with_logits(
        logits=y_logits, labels=y, masks=m)
    loss = tf.reduce_sum(masked_cross_entropy) / n_samples

with tf.variable_scope('train'):
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    gradients, variables = zip(*optimizer.compute_gradients(loss))
    # Clip by global norm (threshold 5.0) before applying.
    gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
    optimize = optimizer.apply_gradients(zip(gradients, variables))

with tf.variable_scope('performance'):
    # Sigmoid outputs are multiplied by the mask and rounded to 0/1.
    probabilities = tf.multiply(tf.sigmoid(y_logits), m)
    prediction = tf.round(probabilities)
    # Matches are multiplied by the mask again before being summed.
    correct_prediction = tf.reduce_sum(
        tf.multiply(tf.cast(tf.equal(prediction, y), dtype=tf.float32), m))
    accuracy = correct_prediction / n_samples

with tf.variable_scope('tasks'):
    # Three-element float constants, one per task named in the comments.
    t0 = tf.constant([1, 0, 0], dtype=tf.float32)  # psoriasis
    t1 = tf.constant([0, 1, 0], dtype=tf.float32)  # acne or rosacea
    t2 = tf.constant([0, 0, 1], dtype=tf.float32)  # mycosis
    t02 = tf.constant([1, 0, 1], dtype=tf.float32)  # eczema
def setup_gradients(self):
    """Create the clipped-gradient Adam update for `self.loss`.

    Stores the clipped gradients in `self.grads` and the update op in
    `self._apply_gradients`.
    """
    raw_gradients = tf.gradients(self.loss, self.var_list)
    # Global-norm clipping at 40.0; the norm itself is not needed.
    clipped_gradients, _unused_norm = tf.clip_by_global_norm(
        raw_gradients, 40.0)
    self.grads = clipped_gradients

    gradient_variable_pairs = list(zip(self.grads, self.var_list))
    adam = tf.train.AdamOptimizer(1e-4)
    self._apply_gradients = adam.apply_gradients(gradient_variable_pairs)
def __init__(self, source_vocab_size, target_vocab_size, buckets, size, num_layers, max_gradient_norm, batch_size, learning_rate, learning_rate_decay_factor, use_lstm=False, num_samples=512, forward_only=False):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input
        length that will be processed in that bucket, and O specifies
        maximum output length. Training instances that have inputs longer
        than I or outputs longer than O will be pushed to the next bucket
        and padded accordingly. We assume that the list is sorted, e.g.,
        [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training; the model
        construction is independent of batch_size, so it can be changed
        after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when
        needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the
        model.

    Internal variables are stored as tf.float32.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    # The second positional argument of tf.Variable is `trainable`, so the
    # dtype must be passed by keyword; the learning rate is only ever
    # changed through the decay op and must not be trainable.
    self.learning_rate = tf.Variable(
        float(learning_rate), trainable=False, dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary
    # size.
    if num_samples > 0 and num_samples < self.target_vocab_size:
        w_t = tf.get_variable("proj_w", [self.target_vocab_size, size],
                              tf.float32)
        w = tf.transpose(w_t)
        proj_b = tf.get_variable("proj_b", [self.target_vocab_size],
                                 tf.float32)
        output_projection = (w, proj_b)

        def sampled_loss(labels, inputs):
            # Candidate-sampling (sampled softmax) loss.
            labels = tf.reshape(labels, [-1, 1])
            # sampled_softmax_loss is computed in 32-bit floats to avoid
            # numerical instabilities.
            local_w_t = tf.cast(w_t, tf.float32)
            local_b = tf.cast(proj_b, tf.float32)
            local_inputs = tf.cast(inputs, tf.float32)
            return tf.cast(
                tf.nn.sampled_softmax_loss(
                    weights=local_w_t,
                    biases=local_b,
                    labels=labels,
                    inputs=local_inputs,
                    num_sampled=num_samples,
                    num_classes=self.target_vocab_size),
                tf.float32)

        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN. Every layer gets
    # its own cell object: sharing one instance across layers is rejected
    # by newer TensorFlow 1.x releases.
    def single_cell():
        if use_lstm:
            return tf.contrib.rnn.BasicLSTMCell(size, state_is_tuple=True)
        return tf.contrib.rnn.GRUCell(size)

    if num_layers > 1:
        cell = tf.contrib.rnn.MultiRNNCell(
            [single_cell() for _ in range(num_layers)],
            state_is_tuple=True)
    else:
        cell = single_cell()

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
            encoder_inputs,  # input sentence
            decoder_inputs,  # output sentence
            cell,  # the LSTM or GRU cell built above
            num_encoder_symbols=source_vocab_size,  # source vocabulary size
            num_decoder_symbols=target_vocab_size,  # target vocabulary size
            embedding_size=size,  # embedding dimension
            output_projection=output_projection,  # set for sampled softmax
            feed_previous=do_decode,  # True when decoding, False in training
            dtype=tf.float32)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    # Last bucket is the biggest one. Each list element is a placeholder
    # named encoder0, encoder1, encoder2, ...
    for i in range(buckets[-1][0]):
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="encoder{0}".format(i)))
    for i in range(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder{0}".format(i)))
        # target_weights mirrors the decoder outputs as a 0-1 mask:
        # positions beyond the target sequence length are fed as 0.
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None],
                           name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = [
        self.decoder_inputs[i + 1]
        for i in range(len(self.decoder_inputs) - 1)
    ]

    # Training outputs and losses.
    if forward_only:
        self.outputs, self.losses = \
            tf.contrib.legacy_seq2seq.model_with_buckets(
                self.encoder_inputs, self.decoder_inputs, targets,
                self.target_weights, buckets,
                lambda x, y: seq2seq_f(x, y, True),
                softmax_loss_function=softmax_loss_function)
        # If we use output projection, we need to project outputs for
        # decoding.
        if output_projection is not None:
            for bucket_id in range(len(buckets)):
                self.outputs[bucket_id] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1]
                    for output in self.outputs[bucket_id]
                ]
    else:
        self.outputs, self.losses = \
            tf.contrib.legacy_seq2seq.model_with_buckets(
                self.encoder_inputs, self.decoder_inputs, targets,
                self.target_weights, buckets,
                lambda x, y: seq2seq_f(x, y, False),
                softmax_loss_function=softmax_loss_function)

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_id in range(len(buckets)):
            # Gradients of this bucket's loss w.r.t. the parameters.
            gradients = tf.gradients(self.losses[bucket_id], params)
            # Clip to guard against exploding gradients.
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            # Parameter update op for this bucket.
            self.updates.append(
                opt.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step))

    # tf.all_variables() is deprecated in favour of tf.global_variables().
    self.saver = tf.train.Saver(tf.global_variables())
def __init__(self, flags, vocab_size, is_training=True):
    """Build the LSTM language-model graph.

    Reads `batch_size`, `unroll`, `hidden_dim`, `layers`, `drop_prob`,
    `clip_norm` and `optimizer` from `flags`. When `is_training` is false
    the constructor returns after the loss and final state are defined;
    otherwise it also builds the train op and the scalar summaries.
    """
    batch_size, unroll = flags.batch_size, flags.unroll
    hidden_dim = flags.hidden_dim

    # Feeds: token ids, next-token targets and per-example lengths.
    self._x = tf.placeholder(tf.int32, [batch_size, unroll])
    self._y = tf.placeholder(tf.int32, [batch_size, unroll])
    self._len = tf.placeholder(tf.int32, [None, ])

    # Dropout is applied only while training with a positive drop_prob.
    apply_dropout = is_training and flags.drop_prob > 0

    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(
        hidden_dim, forget_bias=1.0, state_is_tuple=True)
    if apply_dropout:
        lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
            lstm_cell, output_keep_prob=1.0 - flags.drop_prob)
    cell = tf.nn.rnn_cell.MultiRNNCell(
        [lstm_cell] * flags.layers, state_is_tuple=True)

    self._initial_state = cell.zero_state(batch_size, tf.float32)

    with tf.device("/cpu:0"):
        self.embeddings = tf.get_variable("embeddings",
                                          [vocab_size, hidden_dim])
        inputs = tf.nn.embedding_lookup(self.embeddings, self._x)

    if apply_dropout:
        inputs = tf.nn.dropout(inputs, 1.0 - flags.drop_prob)

    # Fixed unroll and dynamic_rnn should give the same results; the fixed
    # unroll is hard-wired on because it is faster.
    use_fixed_unroll = True
    if use_fixed_unroll:
        step_outputs = []
        state = self._initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(unroll):
                # Variables are created on step 0 and reused afterwards.
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                step_output, state = cell(inputs[:, time_step, :], state)
                step_outputs.append(step_output)
        outputs = tf.reshape(tf.concat(1, step_outputs), [-1, hidden_dim])
    else:
        with tf.variable_scope("RNN"):
            outputs, state = tf.nn.dynamic_rnn(
                cell,
                inputs,
                sequence_length=self._len,
                initial_state=self._initial_state,
                dtype=tf.float32,
                time_major=False)
        outputs = tf.reshape(outputs, [-1, hidden_dim])

    # Output projection onto the vocabulary.
    softmax_w = tf.get_variable("softmax_w", [hidden_dim, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    logits = tf.matmul(outputs, softmax_w) + softmax_b

    flat_logits = tf.reshape(logits, [-1, vocab_size])
    flat_targets = tf.reshape(self._y, [-1])
    unit_weights = tf.ones([batch_size * unroll])
    seq_loss = tf.nn.seq2seq.sequence_loss_by_example(
        [flat_logits], [flat_targets], [unit_weights])

    self.loss = tf.reduce_sum(seq_loss) / batch_size
    self._final_state = state

    # Evaluation graphs stop here; everything below is training-only.
    if not is_training:
        return

    self.lr = tf.Variable(0.0, trainable=False)
    trainable = tf.trainable_variables()
    shapes = [variable.get_shape() for variable in trainable]
    log_info("# params: %d" % np.sum([np.prod(shape) for shape in shapes]))

    grads = tf.gradients(self.loss, trainable)
    if flags.clip_norm is None:
        # No clipping requested: still expose the norm for the summary.
        grads_norm = tf.global_norm(grads)
    else:
        grads, grads_norm = tf.clip_by_global_norm(grads, flags.clip_norm)

    optimizer_cls = get_optimizer(flags.optimizer)
    optimizer = optimizer_cls(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, trainable))

    # Summaries for TensorBoard; only defined for the training graph.
    with tf.name_scope("summaries"):
        tf.scalar_summary("loss", self.loss / unroll)
        tf.scalar_summary("learning_rate", self.lr)
        tf.scalar_summary("grads_norm", grads_norm)
def __init__(self, user_count, item_count, cate_count, cate_list,
             predict_batch_size, predict_ads_num):
    """Build the graph of an attention-based pairwise item scorer.

    Two candidate items (``i`` and ``j``) are scored against the same
    behaviour history through shared layers; a third "sub" branch scores the
    first ``predict_ads_num`` items for every example. The training loss is
    sigmoid cross-entropy on the ``i`` branch only.

    Args:
        user_count: number of rows of the user embedding table.
        item_count: number of rows of the item embedding / bias tables.
        cate_count: number of rows of the category embedding table.
        cate_list: category id per item; converted to an int64 tensor and
            indexed by item id.
        predict_batch_size: tile count used for the "sub" branch.
        predict_ads_num: number of leading items scored by the "sub" branch.

    The helpers ``attention``, ``attention_multi_items`` and ``dice`` are
    defined elsewhere.
    """
    self.u = tf.placeholder(tf.int32, [
        None,
    ])  # [B]
    self.i = tf.placeholder(tf.int32, [
        None,
    ])  # [B]
    self.j = tf.placeholder(tf.int32, [
        None,
    ])  # [B]
    self.y = tf.placeholder(tf.float32, [
        None,
    ])  # [B]
    self.hist_i = tf.placeholder(tf.int32, [None, None])  # [B, T]
    self.sl = tf.placeholder(tf.int32, [
        None,
    ])  # [B]
    self.lr = tf.placeholder(tf.float64, [])

    hidden_units = 128

    # NOTE(review): user_emb_w is created but not read anywhere in this
    # method.
    user_emb_w = tf.get_variable("user_emb_w", [user_count, hidden_units])
    # Item and category embeddings are each half-width; concatenated they
    # give a hidden_units-wide item representation.
    item_emb_w = tf.get_variable("item_emb_w",
                                 [item_count, hidden_units // 2])
    item_b = tf.get_variable("item_b", [item_count],
                             initializer=tf.constant_initializer(0.0))
    cate_emb_w = tf.get_variable("cate_emb_w",
                                 [cate_count, hidden_units // 2])
    cate_list = tf.convert_to_tensor(cate_list, dtype=tf.int64)

    # Candidate item i: [item embedding | category embedding] plus bias.
    ic = tf.gather(cate_list, self.i)
    i_emb = tf.concat(values=[
        tf.nn.embedding_lookup(item_emb_w, self.i),
        tf.nn.embedding_lookup(cate_emb_w, ic),
    ],
                      axis=1)
    i_b = tf.gather(item_b, self.i)

    # Candidate item j, built the same way.
    jc = tf.gather(cate_list, self.j)
    j_emb = tf.concat([
        tf.nn.embedding_lookup(item_emb_w, self.j),
        tf.nn.embedding_lookup(cate_emb_w, jc),
    ],
                      axis=1)
    j_b = tf.gather(item_b, self.j)

    # History items, concatenated on the last axis.
    hc = tf.gather(cate_list, self.hist_i)
    h_emb = tf.concat([
        tf.nn.embedding_lookup(item_emb_w, self.hist_i),
        tf.nn.embedding_lookup(cate_emb_w, hc),
    ],
                      axis=2)

    hist_i = attention(i_emb, h_emb, self.sl)
    #-- attention end ---

    hist_i = tf.layers.batch_normalization(inputs=hist_i)
    # name='hist_bn' here names the reshape op, not the batch-norm layer.
    hist_i = tf.reshape(hist_i, [-1, hidden_units], name='hist_bn')
    hist_i = tf.layers.dense(hist_i, hidden_units, name='hist_fcn')

    u_emb_i = hist_i

    hist_j = attention(j_emb, h_emb, self.sl)
    #-- attention end ---

    # reuse=True: the j branch shares the layers created for the i branch.
    hist_j = tf.layers.batch_normalization(inputs=hist_j, reuse=True)
    hist_j = tf.reshape(hist_j, [-1, hidden_units], name='hist_bn')
    hist_j = tf.layers.dense(hist_j,
                             hidden_units,
                             name='hist_fcn',
                             reuse=True)

    u_emb_j = hist_j
    # Debug output of the static shapes.
    print(u_emb_i.get_shape().as_list())
    print(u_emb_j.get_shape().as_list())
    print(i_emb.get_shape().as_list())
    print(j_emb.get_shape().as_list())
    #-- fcn begin -------
    din_i = tf.concat([u_emb_i, i_emb, u_emb_i * i_emb], axis=-1)
    din_i = tf.layers.batch_normalization(inputs=din_i, name='b1')
    #d_layer_1_i = tf.layers.dense(din_i, 80, activation=tf.nn.sigmoid, name='f1')
    # To try dice instead of sigmoid, set the activation to None and add a
    # dice layer as in the following two lines. See also model_dice.py in
    # this folder.
    d_layer_1_i = tf.layers.dense(din_i, 80, activation=None, name='f1')
    d_layer_1_i = dice(d_layer_1_i, name='dice_1')
    d_layer_2_i = tf.layers.dense(d_layer_1_i,
                                  40,
                                  activation=None,
                                  name='f2')
    d_layer_2_i = dice(d_layer_2_i, name='dice_2')
    d_layer_3_i = tf.layers.dense(d_layer_2_i,
                                  1,
                                  activation=None,
                                  name='f3')
    # Same tower for item j, reusing b1/f1/f2/f3.
    din_j = tf.concat([u_emb_j, j_emb, u_emb_j * j_emb], axis=-1)
    din_j = tf.layers.batch_normalization(inputs=din_j,
                                          name='b1',
                                          reuse=True)
    d_layer_1_j = tf.layers.dense(din_j,
                                  80,
                                  activation=None,
                                  name='f1',
                                  reuse=True)
    d_layer_1_j = dice(d_layer_1_j, name='dice_1')
    d_layer_2_j = tf.layers.dense(d_layer_1_j,
                                  40,
                                  activation=None,
                                  name='f2',
                                  reuse=True)
    d_layer_2_j = dice(d_layer_2_j, name='dice_2')
    d_layer_3_j = tf.layers.dense(d_layer_2_j,
                                  1,
                                  activation=None,
                                  name='f3',
                                  reuse=True)
    d_layer_3_i = tf.reshape(d_layer_3_i, [-1])
    d_layer_3_j = tf.reshape(d_layer_3_j, [-1])
    # Score difference between i and j; only used for mf_auc below.
    x = i_b - j_b + d_layer_3_i - d_layer_3_j  # [B]
    self.logits = i_b + d_layer_3_i

    # prediction for selected items
    # logits for selected item:
    item_emb_all = tf.concat(
        [item_emb_w,
         tf.nn.embedding_lookup(cate_emb_w, cate_list)], axis=1)
    item_emb_sub = item_emb_all[:predict_ads_num, :]
    item_emb_sub = tf.expand_dims(item_emb_sub, 0)
    # Repeat the same candidate block predict_batch_size times.
    item_emb_sub = tf.tile(item_emb_sub, [predict_batch_size, 1, 1])
    hist_sub = attention_multi_items(item_emb_sub, h_emb, self.sl)
    #-- attention end ---

    hist_sub = tf.layers.batch_normalization(inputs=hist_sub,
                                             name='hist_bn',
                                             reuse=tf.AUTO_REUSE)
    # print hist_sub.get_shape().as_list()
    hist_sub = tf.reshape(hist_sub, [-1, hidden_units])
    hist_sub = tf.layers.dense(hist_sub,
                               hidden_units,
                               name='hist_fcn',
                               reuse=tf.AUTO_REUSE)

    u_emb_sub = hist_sub
    item_emb_sub = tf.reshape(item_emb_sub, [-1, hidden_units])
    din_sub = tf.concat(
        [u_emb_sub, item_emb_sub, u_emb_sub * item_emb_sub], axis=-1)
    din_sub = tf.layers.batch_normalization(inputs=din_sub,
                                            name='b1',
                                            reuse=True)
    # NOTE(review): this branch applies sigmoid where the i/j towers apply
    # dice to the same reused f1/f2 layers - confirm this is intended.
    d_layer_1_sub = tf.layers.dense(din_sub,
                                    80,
                                    activation=tf.nn.sigmoid,
                                    name='f1',
                                    reuse=True)
    #d_layer_1_sub = dice(d_layer_1_sub, name='dice_1_sub')
    d_layer_2_sub = tf.layers.dense(d_layer_1_sub,
                                    40,
                                    activation=tf.nn.sigmoid,
                                    name='f2',
                                    reuse=True)
    #d_layer_2_sub = dice(d_layer_2_sub, name='dice_2_sub')
    d_layer_3_sub = tf.layers.dense(d_layer_2_sub,
                                    1,
                                    activation=None,
                                    name='f3',
                                    reuse=True)
    d_layer_3_sub = tf.reshape(d_layer_3_sub, [-1, predict_ads_num])
    # Despite the name, logits_sub holds sigmoid outputs.
    self.logits_sub = tf.sigmoid(item_b[:predict_ads_num] + d_layer_3_sub)
    self.logits_sub = tf.reshape(self.logits_sub, [-1, predict_ads_num, 1])
    #-- fcn end -------

    # Fraction of examples where item i scores above item j.
    self.mf_auc = tf.reduce_mean(tf.to_float(x > 0))
    self.score_i = tf.sigmoid(i_b + d_layer_3_i)
    self.score_j = tf.sigmoid(j_b + d_layer_3_j)
    self.score_i = tf.reshape(self.score_i, [-1, 1])
    self.score_j = tf.reshape(self.score_j, [-1, 1])
    self.p_and_n = tf.concat([self.score_i, self.score_j], axis=-1)
    print(self.p_and_n.get_shape().as_list())

    # Step variable
    self.global_step = tf.Variable(0, trainable=False, name='global_step')
    self.global_epoch_step = \
        tf.Variable(0, trainable=False, name='global_epoch_step')
    self.global_epoch_step_op = \
        tf.assign(self.global_epoch_step, self.global_epoch_step+1)

    self.loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits,
                                                labels=self.y))

    trainable_params = tf.trainable_variables()
    self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.lr)
    gradients = tf.gradients(self.loss, trainable_params)
    # Clip to a global norm of 5 before applying.
    clip_gradients, _ = tf.clip_by_global_norm(gradients, 5)
    self.train_op = self.opt.apply_gradients(zip(clip_gradients,
                                                 trainable_params),
                                             global_step=self.global_step)
def build_train_graph(self,
                      inputs,
                      min_depth,
                      max_depth,
                      cube_res,
                      theta_res,
                      phi_res,
                      r_res,
                      scale_factors,
                      num_mpi_planes,
                      learning_rate=0.0001,
                      vgg_model_weights=None,
                      global_step=0,
                      depth_clip=20.0):
    """Construct the training computation graph.

    Args:
      inputs: dictionary of tensors (see 'input_data' below) needed for
        training
      min_depth: minimum depth for the PSV and MPI planes
      max_depth: maximum depth for the PSV and MPI planes
      cube_res: per-side cube resolution
      theta_res: environment map width
      phi_res: environment map height
      r_res: number of radii to use when sampling spheres for rendering
      scale_factors: downsampling factors of cubes relative to the coarsest
      num_mpi_planes: number of MPI planes to infer
      learning_rate: learning rate
      vgg_model_weights: vgg weights (needed when vgg loss is used)
      global_step: training iteration; compared against fixed thresholds
        (240000 and 690000) to switch the envmap and adversarial losses on
      depth_clip: maximum depth for coarsest resampled volumes

    Returns:
      A list of two training ops: the update for all non-discriminator
      variables followed by the discriminator update.
    """
    with tf.name_scope('setup'):
        # Both plane sets are built from the same arguments.
        psv_planes = pj.inv_depths(min_depth, max_depth, num_mpi_planes)
        mpi_planes = pj.inv_depths(min_depth, max_depth, num_mpi_planes)

    with tf.name_scope('input_data'):
        tgt_image = inputs['tgt_image']
        ref_image = inputs['ref_image']
        src_images = inputs['src_images']
        env_image = inputs['env_image']
        ref_depth = inputs['ref_depth']
        tgt_pose = inputs['tgt_pose']
        ref_pose = inputs['ref_pose']
        src_poses = inputs['src_poses']
        env_pose = inputs['env_pose']
        intrinsics = inputs['intrinsics']
        # The last static dimension of src_poses is the source-view count.
        _, _, _, num_source = src_poses.get_shape().as_list()

    with tf.name_scope('inference'):
        # Rebinds the num_mpi_planes argument to a tensor from here on.
        num_mpi_planes = tf.shape(mpi_planes)[0]
        pred = self.infer_mpi(src_images, ref_image, ref_pose, src_poses,
                              intrinsics, psv_planes)
        rgba_layers = pred['rgba_layers']
        psv = pred['psv']

    with tf.name_scope('synthesis'):
        output_image, output_alpha_acc, _ = self.mpi_render_view(
            rgba_layers, ref_pose, tgt_pose, mpi_planes, intrinsics)

    with tf.name_scope('environment_rendering'):
        # Ground-truth MPI from the reference image and depth; used only for
        # the summaries of the ground-truth render and envmap.
        mpi_gt = self.img2mpi(ref_image, ref_depth, mpi_planes)
        output_image_gt, _, _ = self.mpi_render_view(
            mpi_gt, ref_pose, tgt_pose, mpi_planes, intrinsics)

        lightvols_gt, _, _, _, _ = self.predict_lighting_vol(
            mpi_gt,
            mpi_planes,
            intrinsics,
            cube_res,
            scale_factors,
            depth_clip=depth_clip)

        lightvols, lightvol_centers, \
        lightvol_side_lengths, \
        cube_rel_shapes, \
        cube_nest_inds = self.predict_lighting_vol(rgba_layers, mpi_planes,
                                                   intrinsics, cube_res,
                                                   scale_factors,
                                                   depth_clip=depth_clip)

        lightvols_out = nets.cube_net_multires(lightvols, cube_rel_shapes,
                                               cube_nest_inds)

        # Three envmaps share the cube geometry of the predicted volumes:
        # ground truth, pre-network, and network output.
        gt_envmap, gt_shells = self.render_envmap(
            lightvols_gt, lightvol_centers, lightvol_side_lengths,
            cube_rel_shapes, cube_nest_inds, ref_pose, env_pose, theta_res,
            phi_res, r_res)

        prenet_envmap, prenet_shells = self.render_envmap(
            lightvols, lightvol_centers, lightvol_side_lengths,
            cube_rel_shapes, cube_nest_inds, ref_pose, env_pose, theta_res,
            phi_res, r_res)

        output_envmap, output_shells = self.render_envmap(
            lightvols_out, lightvol_centers, lightvol_side_lengths,
            cube_rel_shapes, cube_nest_inds, ref_pose, env_pose, theta_res,
            phi_res, r_res)

    with tf.name_scope('loss'):
        # mask loss for pixels outside reference frustum
        loss_mask = tf.where(
            tf.equal(output_alpha_acc[Ellipsis, tf.newaxis], 0.0),
            tf.zeros_like(output_image[:, :, :, 0:1]),
            tf.ones_like(output_image[:, :, :, 0:1]))
        loss_mask = tf.stop_gradient(loss_mask)
        tf.summary.image('loss_mask', loss_mask)

        # helper functions for loss
        def compute_error(real, fake, mask):
            # Masked mean absolute error; the epsilon guards an all-zero mask.
            mask = tf.ones_like(real) * mask
            return tf.reduce_sum(mask * tf.abs(fake - real)) / (
                tf.reduce_sum(mask) + 1.0e-8)

        # Normalized VGG loss
        def downsample(tensor, ds):
            # Average-pool with window and stride ds.
            return tf.nn.avg_pool(tensor, [1, ds, ds, 1], [1, ds, ds, 1],
                                  'SAME')

        def vgg_loss(tgt_image, output_image, loss_mask, vgg_weights):
            """VGG activation loss definition."""

            vgg_real = nets.build_vgg19(tgt_image * 255.0, vgg_weights)
            rescaled_output_image = output_image * 255.0
            vgg_fake = nets.build_vgg19(rescaled_output_image, vgg_weights)
            # One masked L1 term per layer, each with a fixed scale; the
            # mask is downsampled by 2/4/8/16 for conv2_2..conv5_2.
            p0 = compute_error(vgg_real['input'], vgg_fake['input'],
                               loss_mask)
            p1 = compute_error(vgg_real['conv1_2'], vgg_fake['conv1_2'],
                               loss_mask) / 2.6
            p2 = compute_error(vgg_real['conv2_2'], vgg_fake['conv2_2'],
                               downsample(loss_mask, 2)) / 4.8
            p3 = compute_error(vgg_real['conv3_2'], vgg_fake['conv3_2'],
                               downsample(loss_mask, 4)) / 3.7
            p4 = compute_error(vgg_real['conv4_2'], vgg_fake['conv4_2'],
                               downsample(loss_mask, 8)) / 5.6
            p5 = compute_error(vgg_real['conv5_2'], vgg_fake['conv5_2'],
                               downsample(loss_mask, 16)) * 10 / 1.5
            total_loss = p0 + p1 + p2 + p3 + p4 + p5
            return total_loss

        # rendered image loss
        render_loss = vgg_loss(tgt_image, output_image, loss_mask,
                               vgg_model_weights) / 100.0
        total_loss = render_loss

        # rendered envmap loss
        envmap_loss = vgg_loss(env_image, output_envmap[Ellipsis, :3],
                               tf.ones_like(env_image[Ellipsis, 0:1]),
                               vgg_model_weights) / 100.0

        # set envmap loss to 0 when only training mpi network (see paper)
        envmap_loss = tf.where(tf.greater(global_step, 240000), envmap_loss,
                               0.0)

        total_loss += envmap_loss

        # adversarial loss for envmap
        real_logit = nets.discriminator(env_image, scope='discriminator')
        fake_logit = nets.discriminator(output_envmap[Ellipsis, :3],
                                        scope='discriminator')
        # The discriminator result is indexed as [i][-1]; the last entry of
        # each element is used as that element's logit.
        adv_loss_list = []
        for i in range(len(fake_logit)):
            adv_loss_list.append(0.1 * -1.0 *
                                 tf.reduce_mean(fake_logit[i][-1]))
        adv_loss = tf.reduce_mean(adv_loss_list)
        real_loss_list = []
        fake_loss_list = []
        for i in range(len(fake_logit)):
            # -mean(min(logit - 1, 0)) for real, -mean(min(-logit - 1, 0))
            # for fake.
            real_loss_list.append(
                -1.0 * tf.reduce_mean(tf.minimum(real_logit[i][-1] - 1, 0.0)))
            fake_loss_list.append(-1.0 * tf.reduce_mean(
                tf.minimum(-1.0 * fake_logit[i][-1] - 1, 0.0)))
        real_loss = tf.reduce_mean(real_loss_list)
        fake_loss = tf.reduce_mean(fake_loss_list)
        disc_loss = real_loss + fake_loss

        # set adv/disc losses to 0 until end of training
        adv_loss = tf.where(tf.greater(global_step, 690000), adv_loss, 0.0)
        disc_loss = tf.where(tf.greater(global_step, 690000), disc_loss, 0.0)

        tf.summary.scalar('loss_disc', disc_loss)
        tf.summary.scalar('loss_disc_real', real_loss)
        tf.summary.scalar('loss_disc_fake', fake_loss)
        tf.summary.scalar('loss_adv', adv_loss)

        total_loss += adv_loss

    with tf.name_scope('train_op'):
        # Variables are split by whether 'discriminator' is in their name.
        train_variables = [
            var for var in tf.trainable_variables()
            if 'discriminator' not in var.name
        ]
        optim = tf.train.AdamOptimizer(learning_rate, epsilon=1e-4)
        grads_and_variables = optim.compute_gradients(
            total_loss, var_list=train_variables)
        grads = [gv[0] for gv in grads_and_variables]
        variables = [gv[1] for gv in grads_and_variables]

        def denan(x):
            # Replace NaN entries with zeros.
            return tf.where(tf.is_nan(x), tf.zeros_like(x), x)

        # NaNs are zeroed first, then the result is clipped to norm 100.
        grads_clipped = [denan(g) for g in grads]
        grads_clipped, _ = tf.clip_by_global_norm(grads_clipped, 100.0)
        train_op = [optim.apply_gradients(zip(grads_clipped, variables))]
        # The first summary uses the raw gradients (before NaN removal).
        tf.summary.scalar('gradient global norm',
                          tf.linalg.global_norm(grads))
        tf.summary.scalar('clipped gradient global norm',
                          tf.linalg.global_norm(grads_clipped))

        d_variables = [
            var for var in tf.trainable_variables()
            if 'discriminator' in var.name
        ]
        # The discriminator gets its own optimizer and no clipping.
        optim_d = tf.train.AdamOptimizer(learning_rate, beta1=0.0)
        train_op.append(optim_d.minimize(disc_loss, var_list=d_variables))

    with tf.name_scope('envmap_gt'):
        tf.summary.image('envmap', gt_envmap)
        tf.summary.image('envmap_alpha', gt_envmap[Ellipsis, -1:])
        for i in range(len(gt_shells)):
            i_envmap = pj.over_composite(gt_shells[i])
            tf.summary.image('envmap_level_' + str(i), i_envmap)
    with tf.name_scope('envmap_prenet'):
        tf.summary.image('envmap', prenet_envmap)
        tf.summary.image('envmap_alpha', prenet_envmap[Ellipsis, -1:])
        for i in range(len(prenet_shells)):
            i_envmap = pj.over_composite(prenet_shells[i])
            tf.summary.image('envmap_level_' + str(i), i_envmap)
    with tf.name_scope('envmap_output'):
        tf.summary.image('envmap', output_envmap)
        tf.summary.image('envmap_alpha', output_envmap[Ellipsis, -1:])
        for i in range(len(output_shells)):
            i_envmap = pj.over_composite(output_shells[i])
            tf.summary.image('envmap_level_' + str(i), i_envmap)

    tf.summary.scalar('loss_total', total_loss)
    tf.summary.scalar('loss_render', render_loss)
    tf.summary.scalar('loss_envmap', envmap_loss)
    tf.summary.scalar('min_depth', min_depth)
    tf.summary.scalar('max_depth', max_depth)

    with tf.name_scope('level_stats'):
        for i in range(len(lightvols)):
            tf.summary.scalar('cube_side_length_' + str(i),
                              lightvol_side_lengths[i])
            tf.summary.scalar('cube_center_' + str(i),
                              lightvol_centers[i][0, -1])

    # Source images
    for i in range(num_source):
        # Each source view occupies three consecutive channels.
        src_image = src_images[:, :, :, i * 3:(i + 1) * 3]
        tf.summary.image('image_src_%d' % i, src_image)
    # Output image
    tf.summary.image('image_output', output_image)
    tf.summary.image('image_output_Gt', output_image_gt)
    # Target image
    tf.summary.image('image_tgt', tgt_image)
    tf.summary.image('envmap_tgt', env_image)
    # Ref image
    tf.summary.image('image_ref', ref_image)
    # Predicted color and alpha layers, and PSV
    num_summ = 8  # number of plane summaries to show in tensorboard
    for i in range(num_summ):
        # Evenly spaced plane indices across the (dynamic) plane count.
        ind = tf.to_int32(i * num_mpi_planes / num_summ)
        rgb = rgba_layers[:, :, :, ind, :3]
        alpha = rgba_layers[:, :, :, ind, -1:]
        ref_plane = psv[:, :, :, ind, :3]
        source_plane = psv[:, :, :, ind, 3:6]
        tf.summary.image('layer_rgb_%d' % i, rgb)
        tf.summary.image('layer_alpha_%d' % i, alpha)
        tf.summary.image('layer_rgba_%d' % i, rgba_layers[:, :, :, ind, :])
        tf.summary.image('psv_avg_%d' % i,
                         0.5 * ref_plane + 0.5 * source_plane)
        tf.summary.image('psv_ref_%d' % i, ref_plane)
        tf.summary.image('psv_source_%d' % i, source_plane)

    return train_op
def __init__(self, num_gpus=1, res_block_nums=7):
    """Build the multi-tower training graph and open a session for it.

    The input batch is split evenly across ``num_gpus`` towers. Each tower
    computes its loss through ``self.tower_loss``, the per-tower gradients
    are merged by ``self.average_gradients``, clipped by global norm,
    checked for NaN/Inf and applied by a Nesterov momentum optimizer.

    Args:
        num_gpus: number of towers; the fed batch size must be divisible by
            it because the placeholders are split with ``tf.split``.
        res_block_nums: stored on the instance; presumably the number of
            residual blocks used by ``tower_loss`` - TODO confirm.

    Side effects: resets the default graph, creates a ``tf.Session``, two
    summary writers under ``./cchesslogs``, initializes all variables and
    calls ``self.train_restore()``.
    """
    # self.ckpt = os.path.join(os.getcwd(), 'models/best_model.ckpt-13999')  # TODO
    self.num_gpus = num_gpus
    self.save_dir = "./gpu_models"
    self.is_logging = True
    self.res_block_nums = res_block_nums

    # Start from an empty default graph.
    tf.reset_default_graph()

    # Session configuration: bounded thread pools, grow GPU memory on
    # demand, and fall back to another device when an op has no kernel on
    # the requested one.
    config = tf.ConfigProto(inter_op_parallelism_threads=4,
                            intra_op_parallelism_threads=4)
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    self.sess = tf.Session(config=config)

    with tf.device('/cpu:0'):
        # Variables
        self.filters_size = 128  # or 256
        self.prob_size = 2086
        self.digest = None

        self.training = tf.placeholder(tf.bool, name='training')
        self.inputs_ = tf.placeholder(tf.float32, [None, 9, 10, 14],
                                      name='inputs')
        self.c_l2 = 0.0001
        self.momentum = 0.9
        self.global_norm = 100
        # The learning rate is fed at run time (e.g. 0.001, 5e-3, 0.05).
        self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')
        # Step counter incremented by apply_gradients below. It must exist
        # before the train op is built; previously the assignment was
        # swallowed by the trailing comment on the line above, leaving
        # self.global_step undefined.
        self.global_step = tf.Variable(0,
                                       name="global_step",
                                       trainable=False)
        tf.summary.scalar('learning_rate', self.learning_rate)

        # Optimizer for the loss.
        optimizer = tf.train.MomentumOptimizer(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
            use_nesterov=True)

        # Training targets: policy distribution and scalar value.
        self.pi_ = tf.placeholder(tf.float32, [None, self.prob_size],
                                  name='pi')
        self.z_ = tf.placeholder(tf.float32, [None, 1], name='z')

        # One equal slice of every placeholder per tower.
        inputs_batches = tf.split(self.inputs_, self.num_gpus, axis=0)
        pi_batches = tf.split(self.pi_, self.num_gpus, axis=0)
        z_batches = tf.split(self.z_, self.num_gpus, axis=0)

        tower_grads = [None] * self.num_gpus

        # Initialised before the towers are built and divided by the tower
        # count afterwards; presumably tower_loss accumulates into them -
        # TODO confirm.
        self.loss = 0
        self.accuracy = 0
        self.policy_head = []
        self.value_head = []

        with tf.variable_scope(tf.get_variable_scope()):
            # Build the core model within the graph, one tower per GPU.
            for i in range(self.num_gpus):
                with tf.device(
                        self.assign_to_device('/gpu:{}'.format(i),
                                              ps_device='/cpu:0')):
                    with tf.name_scope('TOWER_{}'.format(i)) as scope:
                        inputs_batch = inputs_batches[i]
                        pi_batch = pi_batches[i]
                        z_batch = z_batches[i]
                        # Input layout is [batch, 9, 10, 14].
                        loss = self.tower_loss(inputs_batch, pi_batch,
                                               z_batch, i)
                        # Later towers reuse the variables created by the
                        # first one.
                        tf.get_variable_scope().reuse_variables()
                        grad = optimizer.compute_gradients(loss)
                        tower_grads[i] = grad

        self.loss /= self.num_gpus
        self.accuracy /= self.num_gpus
        grads = self.average_gradients(tower_grads)

        # Defensive step: clip the averaged gradients by global norm.
        clipped_grads, self.norm = tf.clip_by_global_norm(
            [g for g, _ in grads], self.global_norm)

        # Defensive step: fail the train step if any gradient is NaN/Inf.
        # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
        grad_check = [
            tf.check_numerics(g, message='NaN Found!')
            for g in clipped_grads
        ]
        with tf.control_dependencies(grad_check):
            self.train_op = optimizer.apply_gradients(
                zip(clipped_grads, [v for _, v in grads]),
                global_step=self.global_step,
                name='train_step')

        if self.is_logging:
            # Histograms use the unclipped averaged gradients.
            for grad, var in grads:
                if grad is not None:
                    tf.summary.histogram(var.op.name + '/gradients', grad)
            for var in tf.trainable_variables():
                tf.summary.histogram(var.op.name, var)

        self.summaries_op = tf.summary.merge_all()
        # Train summaries
        self.train_writer = tf.summary.FileWriter(
            os.path.join(os.getcwd(), "cchesslogs/train"), self.sess.graph)
        # Test summaries
        self.test_writer = tf.summary.FileWriter(
            os.path.join(os.getcwd(), "cchesslogs/test"), self.sess.graph)

        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        self.train_restore()
def __init__(self,
             source_vocab_size,
             target_vocab_size,
             buckets,
             hidden_size,
             embedding_size,
             num_layers,
             max_gradient_norm,
             batch_size,
             learning_rate,
             learning_rate_decay_factor,
             use_lstm=False,
             forward_only=False):
    """Create a bucketed embedding-attention sequence-to-sequence model.

    Args:
        source_vocab_size: number of encoder symbols.
        target_vocab_size: number of decoder symbols.
        buckets: list of (encoder_length, decoder_length) pairs; the last
            one is taken as the largest.
        hidden_size: number of units of every recurrent cell.
        embedding_size: width of the symbol embeddings.
        num_layers: number of stacked cells.
        max_gradient_norm: global norm the gradients are clipped to.
        batch_size: stored on the instance.
        learning_rate: initial value of the learning-rate variable.
        learning_rate_decay_factor: multiplier applied by
            ``learning_rate_decay_op``.
        use_lstm: use BasicLSTMCell instead of GRUCell.
        forward_only: when true the decoder feeds back its own output and
            no gradient or update ops are built.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size

    # Non-trainable learning rate plus an op that decays it in place.
    rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate = rate
    self.learning_rate_decay_op = rate.assign(
        rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # Recurrent cell: GRU unless an LSTM was requested, stacked when more
    # than one layer is asked for.
    base_cell = tf.nn.rnn_cell.GRUCell(hidden_size)
    if use_lstm:
        base_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
    if num_layers > 1:
        stacked_cell = tf.nn.rnn_cell.MultiRNNCell([base_cell] * num_layers)
    else:
        stacked_cell = base_cell

    def bucket_seq2seq(enc_inputs, dec_inputs):
        # Embedding + attention seq2seq; decoding mode follows forward_only.
        return tf.nn.seq2seq.embedding_attention_seq2seq(
            enc_inputs,
            dec_inputs,
            stacked_cell,
            num_encoder_symbols=source_vocab_size,
            num_decoder_symbols=target_vocab_size,
            embedding_size=embedding_size,
            feed_previous=forward_only)

    # Placeholders sized for the last (largest) bucket.
    largest_bucket = buckets[-1]
    self.encoder_inputs = [
        tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(pos))
        for pos in xrange(largest_bucket[0])
    ]
    self.decoder_inputs = []
    self.target_weights = []
    # One extra decoder step so the shifted targets below have full length.
    for pos in xrange(largest_bucket[1] + 1):
        decoder_feed = tf.placeholder(tf.int32,
                                      shape=[None],
                                      name="decoder{0}".format(pos))
        self.decoder_inputs.append(decoder_feed)
        weight_feed = tf.placeholder(tf.float32,
                                     shape=[None],
                                     name="weight{0}".format(pos))
        self.target_weights.append(weight_feed)

    # Targets are the decoder inputs shifted left by one step.
    shifted_targets = self.decoder_inputs[1:]

    # Per-bucket outputs and losses.
    self.outputs, self.losses = tf.nn.seq2seq.model_with_buckets(
        self.encoder_inputs, self.decoder_inputs, shifted_targets,
        self.target_weights, buckets, bucket_seq2seq)

    # Clipped-gradient SGD update, one per bucket (training only).
    trainables = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        sgd = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_id in xrange(len(buckets)):
            raw_grads = tf.gradients(self.losses[bucket_id], trainables)
            clipped, global_norm = tf.clip_by_global_norm(
                raw_grads, max_gradient_norm)
            self.gradient_norms.append(global_norm)
            update_op = sgd.apply_gradients(zip(clipped, trainables),
                                            global_step=self.global_step)
            self.updates.append(update_op)

    self.saver = tf.train.Saver(tf.all_variables(), max_to_keep=1000)
def __init__(self, s_size, a_size, scope, trainer):
    """Build a conv + LSTM actor-critic network inside ``scope``.

    Args:
        s_size: width of the flat observation placeholder.
        a_size: number of actions (width of the policy output).
        scope: variable scope name; when it is not ``'global'`` the loss,
            gradient and update ops are created as well.
        trainer: optimizer whose ``apply_gradients`` pushes the gradients of
            this network onto the variables of the ``'global'`` scope.

    ``s_shape`` is read from outside this method to reshape the flat
    observation into an image.
    """
    lstm_units = 256

    def elu_conv(layer_in, channels):
        # 2x2, stride-1, SAME-padded convolution with ELU activation.
        return slim.conv2d(activation_fn=tf.nn.elu,
                           inputs=layer_in,
                           num_outputs=channels,
                           kernel_size=[2, 2],
                           stride=[1, 1],
                           padding='SAME')

    with tf.variable_scope(scope):
        # Observation input and convolutional encoder.
        self.inputs = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
        self.imageIn = tf.reshape(self.inputs,
                                  shape=[-1, s_shape[0], s_shape[1], 1])
        self.conv1 = elu_conv(self.imageIn, 16)
        self.conv2 = elu_conv(self.conv1, 32)
        flat_features = slim.flatten(self.conv2)
        hidden = slim.fully_connected(flat_features,
                                      lstm_units,
                                      activation_fn=tf.nn.elu)

        # Recurrent layer: the rows of the fed batch are treated as the
        # time steps of a single sequence.
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_units,
                                                 state_is_tuple=True)
        cell_width = lstm_cell.state_size.c
        hidden_width = lstm_cell.state_size.h
        # Zero state to feed at the start of a rollout.
        self.state_init = [
            np.zeros((1, cell_width), np.float32),
            np.zeros((1, hidden_width), np.float32),
        ]
        c_in = tf.placeholder(tf.float32, [1, cell_width])
        h_in = tf.placeholder(tf.float32, [1, hidden_width])
        self.state_in = (c_in, h_in)
        rnn_in = tf.expand_dims(hidden, [0])
        step_size = tf.shape(self.imageIn)[:1]
        initial_tuple = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
        lstm_outputs, final_state = tf.nn.dynamic_rnn(
            lstm_cell,
            rnn_in,
            initial_state=initial_tuple,
            sequence_length=step_size,
            time_major=False)
        final_c, final_h = final_state
        self.state_out = (final_c[:1, :], final_h[:1, :])
        rnn_out = tf.reshape(lstm_outputs, [-1, lstm_units])

        # Policy (softmax over actions) and value heads, both bias-free.
        self.policy = slim.fully_connected(
            rnn_out,
            a_size,
            activation_fn=tf.nn.softmax,
            weights_initializer=normalized_columns_initializer(0.01),
            biases_initializer=None)
        self.value = slim.fully_connected(
            rnn_out,
            1,
            activation_fn=None,
            weights_initializer=normalized_columns_initializer(1.0),
            biases_initializer=None)

        # The global network only holds parameters; losses and updates are
        # needed by worker networks only.
        if scope == 'global':
            return

        self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions,
                                         a_size,
                                         dtype=tf.float32)
        self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
        self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)

        # Probability the policy assigned to each taken action.
        chosen_probs = self.policy * self.actions_onehot
        self.responsible_outputs = tf.reduce_sum(chosen_probs, [1])

        # Value loss: half the summed squared error against the targets.
        value_error = self.target_v - tf.reshape(self.value, [-1])
        self.value_loss = 0.5 * tf.reduce_sum(tf.square(value_error))
        # Entropy of the policy distribution.
        weighted_log_policy = self.policy * tf.log(self.policy)
        self.entropy = -tf.reduce_sum(weighted_log_policy)
        # Policy-gradient loss weighted by the advantages.
        weighted_log_prob = tf.log(self.responsible_outputs) * self.advantages
        self.policy_loss = -tf.reduce_sum(weighted_log_prob)
        self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

        # Gradients of the local loss w.r.t. the local variables, clipped
        # to a global norm of 40.
        worker_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                        scope)
        self.gradients = tf.gradients(self.loss, worker_vars)
        self.var_norms = tf.global_norm(worker_vars)
        clipped_grads, self.grad_norms = tf.clip_by_global_norm(
            self.gradients, 40.0)

        # Apply the local gradients to the variables of the global network.
        shared_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                        'global')
        self.apply_grads = trainer.apply_gradients(
            zip(clipped_grads, shared_vars))
def __init__(self, is_training, config):
    """Build an unrolled multi-layer LSTM model and, optionally, its train op.

    Args:
        is_training: when False the method returns after the cost and final
            state are built; no optimizer ops are created.
        config: configuration object. This method reads ``batch_size``,
            ``num_steps``, ``hidden_size``, ``vocab_size``, ``keep_prob``,
            ``num_layers`` and (training only) ``max_grad_norm``.
    """
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    size = config.hidden_size
    vocab_size = config.vocab_size

    self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    # NOTE(review): `o` is not defined in this method; presumably a
    # module-level options object - confirm. The targets are later flattened
    # and paired with batch_size * num_steps weights, which presumably
    # requires o.numClass == num_steps - verify.
    self._targets = tf.placeholder(tf.int32, [batch_size, o.numClass])

    # Slightly better results can be obtained with forget gate biases
    # initialized to 1 but the hyperparameters of the model would need to be
    # different than reported in the paper.
    lstm_cell = rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
    # Output dropout is applied only while training.
    if is_training and config.keep_prob < 1:
        lstm_cell = rnn_cell.DropoutWrapper(
            lstm_cell, output_keep_prob=config.keep_prob)
    cell = rnn_cell.MultiRNNCell([lstm_cell] * config.num_layers)

    self._initial_state = cell.zero_state(batch_size, tf.float32)

    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, size])
        # Old-style tf.split(axis, num, value): one slice per time step.
        inputs = tf.split(
            1, num_steps, tf.nn.embedding_lookup(embedding, self._input_data))
        # Drop the singleton time axis from every slice.
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    if is_training and config.keep_prob < 1:
        inputs = [
            tf.nn.dropout(input_, config.keep_prob) for input_ in inputs
        ]

    # Simplified version of tensorflow.models.rnn.rnn.py's rnn().
    # This builds an unrolled LSTM for tutorial purposes only.
    # In general, use the rnn() or state_saving_rnn() from rnn.py.
    #
    # The alternative version of the code below is:
    #
    # from tensorflow.models.rnn import rnn
    # outputs, states = rnn.rnn(cell, inputs, initial_state=self._initial_state)
    outputs = []
    states = []
    state = self._initial_state
    with tf.variable_scope("RNN"):
        for time_step, input_ in enumerate(inputs):
            # Variables are created on the first step and reused afterwards.
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            (cell_output, state) = cell(input_, state)
            outputs.append(cell_output)
            states.append(state)

    # Join the per-step outputs along axis 1, then flatten to rows of
    # width `size`.
    output = tf.reshape(tf.concat(1, outputs), [-1, size])
    logits = tf.nn.xw_plus_b(
        output, tf.get_variable("softmax_w", [size, vocab_size]),
        tf.get_variable("softmax_b", [vocab_size]))
    # Every position gets weight 1.
    loss = seq2seq.sequence_loss_by_example(
        [logits], [tf.reshape(self._targets, [-1])],
        [tf.ones([batch_size * num_steps])], vocab_size)
    # Summed over all positions, averaged over the batch only.
    self._cost = cost = tf.div(tf.reduce_sum(loss), batch_size)
    self._final_state = states[-1]

    if not is_training:
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)
    # NOTE(review): reads self.lr while the variable above is stored as
    # self._lr; presumably a property defined on the class exposes it -
    # confirm.
    optimizer = tf.train.GradientDescentOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, is_training, config):
    """Build an LSTM regressor that maps a sequence to one scalar per step.

    Args:
        is_training: when False the method returns after the error and
            summary ops are built; no optimizer ops are created.
        config: configuration object. This method reads ``seq_width``,
            ``batch_size`` (used as the number of time steps),
            ``num_hidden``, ``num_layers`` and, in training mode,
            ``max_grad_norm`` and ``useGDO``.
    """
    seq_width = config.seq_width
    # The configured "batch size" is the number of time steps: the sequence
    # is fed one [1, seq_width] row at a time.
    n_steps = config.batch_size
    num_hidden = config.num_hidden
    num_layers = config.num_layers

    # Placeholders for input, target and effective sequence length.
    self._seq_input = tf.placeholder(tf.float32, [n_steps, seq_width])
    self._seq_target = tf.placeholder(tf.float32, [n_steps, 1])
    self._early_stop = tf.placeholder(tf.int32)

    # The rnn() helper expects a list with one tensor per time step.
    # NOTE(review): reads self.seq_input, presumably a property exposing
    # self._seq_input - confirm.
    inputs = [
        tf.reshape(data, (1, seq_width))
        for data in tf.split(0, n_steps, self.seq_input)
    ]

    initializer = tf.random_uniform_initializer(-.1, .1)

    cell = rnn_cell.LSTMCell(num_hidden, seq_width, initializer=initializer)
    if num_layers > 1:
        cell = rnn_cell.MultiRNNCell([cell] * num_layers)

    # Initial state for a batch of one sequence.
    self._initial_state = cell.zero_state(1, tf.float32)
    outputs, states = rnn(cell,
                          inputs,
                          initial_state=self._initial_state,
                          sequence_length=self._early_stop)
    # Save the final state of the rnn.
    self._final_state = states[-1]

    # Outputs come as a list of tensors, but tf.matmul needs a single one.
    outputs = tf.reshape(tf.concat(1, outputs), [-1, num_hidden])

    # Linear read-out on top of the recurrent outputs.
    W = tf.get_variable('W', [num_hidden, 1])
    b = tf.get_variable('b', [1])
    _output = tf.matmul(outputs, W) + b
    self._output = _output

    # Error: square root of the summed squared differences.
    error = tf.pow(
        tf.reduce_sum(tf.pow(tf.sub(_output, self._seq_target), 2)), .5)
    tf.scalar_summary("error", error)

    self._error = error
    self._merge_summaries_op = tf.merge_all_summaries()

    if not is_training:
        return

    # Learning rate. It must be non-trainable: the previous code passed the
    # string 'False', which is truthy, so the variable was registered as
    # trainable and ended up in tf.trainable_variables() below.
    self._lr = tf.Variable(0., trainable=False, name='lr')
    # Trainable variables for gradient computation.
    tvars = tf.trainable_variables()
    # Gradients clipped by global norm.
    grads, _ = tf.clip_by_global_norm(tf.gradients(self._error, tvars),
                                      config.max_grad_norm)

    # Two options: GradientDescentOptimizer (config.useGDO is true) or
    # AdamOptimizer (config.useGDO is false).
    if config.useGDO:
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
    else:
        optimizer = tf.train.AdamOptimizer(self._lr)

    # Training op.
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))
def main():
    """Build a multi-GPU training graph and run the truncated-BPTT loop.

    Reads all settings from ``get_args()``. One tower per GPU is built with
    shared variables; tower gradients are averaged, clipped to a global norm
    of 5.0 and applied. Checkpoints are written to ``<logdir_root>/train``
    every ``args.ckpt_every`` steps and once more on exit if the last step
    was not saved.
    """
    args = get_args()
    if args.l2_reg_strength == 0:
        args.l2_reg_strength = None
    logdir = os.path.join(args.logdir_root, 'train')
    coord = tf.train.Coordinator()

    # Create inputs
    with tf.name_scope('create_inputs'):
        reader = AudioReader(args.nb_data_dir,
                             args.wb_data_dir,
                             coord,
                             sample_rate=args.sample_rate,
                             sample_size=args.sample_size,
                             silence_threshold=args.silence_threshold)
        nb_audio_batch, wb_audio_batch = reader.dequeue(args.batch_size)

    # Create model
    net = create_model(args)
    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)
    optim = optimizer_factory[args.optimizer](
        learning_rate=args.learning_rate, momentum=args.momentum)

    # Per-GPU inputs and recurrent states. The inputs are non-trainable
    # variables that are overridden through feed_dict at every run.
    tower_grads = []
    losses = []
    wb_input_batch_rnn = []
    nb_input_batch_rnn = []
    train_big_frame_state = []
    train_frame_state = []
    final_big_frame_state = []
    final_frame_state = []
    for i in range(args.num_gpus):
        with tf.device('/gpu:%d' % (i)):
            # Create input placeholders
            nb_input_batch_rnn.append(
                tf.Variable(tf.zeros([net.batch_size, net.seq_len, 1]),
                            trainable=False,
                            name='nb_input_batch_rnn',
                            dtype=tf.float32))
            wb_input_batch_rnn.append(
                tf.Variable(tf.zeros([net.batch_size, net.seq_len, 1]),
                            trainable=False,
                            name='wb_input_batch_rnn',
                            dtype=tf.float32))
            # Create initial states
            train_big_frame_state.append(
                net.big_cell.zero_state(net.batch_size, tf.float32))
            final_big_frame_state.append(
                net.big_cell.zero_state(net.batch_size, tf.float32))
            train_frame_state.append(
                net.cell.zero_state(net.batch_size, tf.float32))
            final_frame_state.append(
                net.cell.zero_state(net.batch_size, tf.float32))

    with tf.variable_scope(tf.get_variable_scope()):
        for i in range(args.num_gpus):
            with tf.device('/gpu:%d' % (i)):
                with tf.name_scope('TOWER_%d' % i) as scope:
                    # Create variables
                    print("Creating model on GPU:%d" % i)
                    loss, final_big_frame_state[i], final_frame_state[i] = \
                        net.loss_HRNN(nb_input_batch_rnn[i],
                                      wb_input_batch_rnn[i],
                                      train_big_frame_state[i],
                                      train_frame_state[i],
                                      l2_reg_strength=args.l2_reg_strength)
                    # Reuse variables for the next tower
                    tf.get_variable_scope().reuse_variables()
                    losses.append(loss)
                    trainable = tf.trainable_variables()
                    gradients = optim.compute_gradients(
                        loss,
                        trainable,
                        aggregation_method=tf.AggregationMethod.
                        EXPERIMENTAL_ACCUMULATE_N)
                    tower_grads.append(gradients)

    # Average the tower gradients, clip by global norm, then apply.
    grad_vars = average_gradients(tower_grads)
    grads, vars = zip(*grad_vars)
    grads_clipped, _ = tf.clip_by_global_norm(grads, 5.0)
    grad_vars = zip(grads_clipped, vars)
    apply_gradient_op = optim.apply_gradients(grad_vars,
                                              global_step=global_step)

    # -----------------------------------------------------------------------
    # Start/continue training
    # -----------------------------------------------------------------------
    writer = tf.summary.FileWriter(logdir)
    writer.add_graph(tf.get_default_graph())
    summaries = tf.summary.merge_all()

    # Configure session
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    sess = tf.Session(config=tf_config)
    init = tf.global_variables_initializer()
    sess.run(init)

    # Load checkpoint
    saver = tf.train.Saver(var_list=tf.trainable_variables(),
                           max_to_keep=args.max_checkpoints)
    try:
        saved_global_step = load(saver, sess, logdir)
        if saved_global_step is None:
            saved_global_step = -1
    except Exception:
        print("Something went wrong while restoring checkpoint.")
        raise

    # Start queue runners
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    reader.start_threads(sess)

    # Train
    step = None
    last_saved_step = saved_global_step
    infe_para = create_gen_wav_para(net)
    try:
        for step in range(saved_global_step + 1, args.num_steps):
            # Initialize cells
            final_big_s = []
            final_s = []
            for g in range(args.num_gpus):
                final_big_s.append(sess.run(net.big_initial_state))
                final_s.append(sess.run(net.initial_state))
            start_time = time.time()

            # Get input batches, one pair per GPU
            nb_inputs_list = []
            wb_inputs_list = []
            for _ in range(args.num_gpus):
                nb_inputs, wb_inputs = sess.run(
                    [nb_audio_batch, wb_audio_batch])
                nb_inputs_list.append(nb_inputs)
                wb_inputs_list.append(wb_inputs)

            loss_sum = 0
            idx_begin = 0
            audio_length = args.sample_size - args.big_frame_size
            bptt_length = args.seq_len - args.big_frame_size
            # Floor division: the result is used as a range() bound, and
            # plain ``/`` yields a float under true division.
            stateful_rnn_length = audio_length // bptt_length
            output_list = [
                summaries, losses, apply_gradient_op, final_big_frame_state,
                final_frame_state
            ]

            for i in range(0, stateful_rnn_length):
                inp_dict = {}
                for g in range(args.num_gpus):
                    # Add seq_len samples as input for truncated BPTT
                    inp_dict[nb_input_batch_rnn[g]] = \
                        nb_inputs_list[g][:, idx_begin:idx_begin+args.seq_len, :]
                    inp_dict[wb_input_batch_rnn[g]] = \
                        wb_inputs_list[g][:, idx_begin:idx_begin+args.seq_len, :]
                    inp_dict[train_big_frame_state[g]] = final_big_s[g]
                    inp_dict[train_frame_state[g]] = final_s[g]
                # NOTE(review): advanced once per window (not once per GPU)
                # so every tower reads the same window -- confirm against
                # the original layout.
                idx_begin += args.seq_len - args.big_frame_size

                # Forward pass
                summary, loss_gpus, _, final_big_s, final_s = \
                    sess.run(output_list, feed_dict=inp_dict)
                writer.add_summary(summary, step)
                for g in range(args.num_gpus):
                    loss_gpu = loss_gpus[g] / stateful_rnn_length
                    loss_sum += loss_gpu / args.num_gpus

            duration = time.time() - start_time
            print("Step {:d}: loss = {:.3f}, ({:.3f} sec/step)".format(
                step, loss_sum, duration))

            if step % args.ckpt_every == 0:
                save(saver, sess, logdir, step)
                last_saved_step = step
                # Generate waveforms every 20 steps
                #if (step) % 20 == 0 and step >= 20:
                # NOTE(review): the original guard is commented out; the
                # call is assumed to run with each checkpoint -- confirm.
                generate_and_save_samples(step, net, infe_para, sess,
                                          nb_inputs_list[0])

    except KeyboardInterrupt:
        print()
    finally:
        # ``step`` stays None when the loop body never ran.
        if step is not None and step > last_saved_step:
            print('Saving model...')
            save(saver, sess, logdir, step)
        coord.request_stop()
        coord.join(threads)
def __init__(self, embedding_size, rnn_size, layer_size, vocab_size,
             attn_size, sequence_length, n_classes, grad_clip,
             learning_rate):
    """Build a bidirectional LSTM classifier with an attention layer.

    Args:
        embedding_size: word embedding dimension.
        rnn_size: hidden state dimension.
        layer_size: number of rnn layers.
        vocab_size: vocabulary size.
        attn_size: attention layer dimension.
        sequence_length: max sequence length.
        n_classes: number of target labels.
        grad_clip: gradient clipping threshold (global norm).
        learning_rate: initial learning rate for Adam.
    """
    self.output_keep_prob = tf.placeholder(tf.float32,
                                           name='output_keep_prob')
    self.input_data = tf.placeholder(tf.int32,
                                     shape=[None, sequence_length],
                                     name='input_data')
    self.targets = tf.placeholder(tf.float32,
                                  shape=[None, n_classes],
                                  name='targets')

    # Forward RNN cell stack.
    with tf.name_scope('fw_rnn'), tf.variable_scope('fw_rnn'):
        print(tf.get_variable_scope().name)
        lstm_fw_cell_list = [
            tf.contrib.rnn.LSTMCell(rnn_size) for _ in range(layer_size)
        ]
        lstm_fw_cell_m = tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.MultiRNNCell(lstm_fw_cell_list),
            output_keep_prob=self.output_keep_prob)

    # Backward RNN cell stack. It must wrap its own cell list; the original
    # wrapped ``lstm_fw_cell_list`` again, so both directions held the same
    # cell objects.
    with tf.name_scope('bw_rnn'), tf.variable_scope('bw_rnn'):
        print(tf.get_variable_scope().name)
        lstm_bw_cell_list = [
            tf.contrib.rnn.LSTMCell(rnn_size) for _ in range(layer_size)
        ]
        lstm_bw_cell_m = tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.MultiRNNCell(lstm_bw_cell_list),
            output_keep_prob=self.output_keep_prob)

    with tf.device('/cpu:0'):
        embedding = tf.Variable(tf.truncated_normal(
            [vocab_size, embedding_size], stddev=0.1),
                                name='embedding')
        inputs = tf.nn.embedding_lookup(embedding, self.input_data)

    # self.input_data shape: (batch_size, sequence_length)
    # inputs shape         : (batch_size, sequence_length, embedding_size)
    # The static bidirectional rnn wants a list of sequence_length tensors,
    # each (batch_size, embedding_size), so: transpose to time-major,
    # flatten, then split along the first axis.
    inputs = tf.transpose(inputs, [1, 0, 2])
    # Flatten to (sequence_length * batch_size, embedding_size). The last
    # dimension is the embedding width; reshaping with rnn_size was only
    # correct when the two sizes happened to be equal.
    inputs = tf.reshape(inputs, [-1, embedding_size])
    # Split into a list whose elements are (batch_size, embedding_size).
    inputs = tf.split(inputs, sequence_length, 0)

    with tf.name_scope('bi_rnn'), tf.variable_scope('bi_rnn'):
        outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(
            lstm_fw_cell_m, lstm_bw_cell_m, inputs, dtype=tf.float32)

    # Attention layer.
    attention_size = attn_size
    with tf.name_scope('attention'), tf.variable_scope('attention'):
        attention_w = tf.Variable(tf.truncated_normal(
            [2 * rnn_size, attention_size], stddev=0.1),
                                  name='attention_w')
        attention_b = tf.Variable(tf.constant(0.1, shape=[attention_size]),
                                  name='attention_b')
        u_list = []
        for t in range(sequence_length):
            u_t = tf.tanh(tf.matmul(outputs[t], attention_w) + attention_b)
            u_list.append(u_t)
        u_w = tf.Variable(tf.truncated_normal([attention_size, 1],
                                              stddev=0.1),
                          name='attention_uw')
        attn_z = []
        for t in range(sequence_length):
            z_t = tf.matmul(u_list[t], u_w)
            attn_z.append(z_t)
        # transform to batch_size * sequence_length
        attn_zconcat = tf.concat(attn_z, axis=1)
        self.alpha = tf.nn.softmax(attn_zconcat)
        # transform to sequence_length * batch_size * 1, same rank as outputs
        alpha_trans = tf.reshape(tf.transpose(self.alpha, [1, 0]),
                                 [sequence_length, -1, 1])
        # outputs shape: (sequence_length, batch_size, 2*rnn_size)
        self.final_output = tf.reduce_sum(outputs * alpha_trans, 0)
        print(self.final_output.shape)

    fc_w = tf.Variable(tf.truncated_normal([2 * rnn_size, n_classes],
                                           stddev=0.1),
                       name='fc_w')
    fc_b = tf.Variable(tf.zeros([n_classes]), name='fc_b')

    # Alternative for classification: use only the last time step.
    #self.final_output = outputs[-1]

    self.logits = tf.matmul(self.final_output, fc_w) + fc_b
    self.prob = tf.nn.softmax(self.logits)

    self.cost = tf.losses.softmax_cross_entropy(self.targets, self.logits)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      grad_clip)

    optimizer = tf.train.AdamOptimizer(learning_rate)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
    self.accuracy = tf.reduce_mean(
        tf.cast(
            tf.equal(tf.argmax(self.targets, axis=1),
                     tf.argmax(self.prob, axis=1)), tf.float32))
def __init__(self,
             word_embedding,
             word_to_idx=None,
             use_glove=True,
             is_training=True,
             dim_feat=2048,
             config=Config(),
             num_input=2):
    """Build the context/candidate classifier graph.

    Args:
        word_embedding: initial embedding matrix, used when ``use_glove``.
        word_to_idx: mapping that must contain ``'<eos>'``; despite the
            ``None`` default it is indexed unconditionally.
        use_glove: initialise the embedding from ``word_embedding`` instead
            of a random uniform initializer.
        is_training: enables dropout (when ``config.dropout_prob < 1``) and
            creates the train op.
        dim_feat: width of the image feature placeholder.
        config: hyper-parameter object (``num_steps``, ``hidden_size``,
            ``vocab_size``, ``embedding_size``, ``num_input``,
            ``use_img_feat``, ``use_lstm``, ``combine_typ``, ``cls_hidden``,
            ``use_residual``, ``dropout_prob``, ``num_layers``,
            ``grad_clip``).
        num_input: accepted for interface compatibility; the value actually
            used is ``config.num_input``.

    Raises:
        Exception: if ``config.use_img_feat == 'concat_bf_lstm'``.
    """
    self.x = tf.placeholder(tf.int32, [None, config.num_steps])
    self.y_ = tf.placeholder(tf.float32, [None, 2])
    self.img_feat = tf.placeholder(tf.float32, [None, dim_feat])
    self.lr = tf.placeholder(tf.float32)
    self._eos = word_to_idx['<eos>']
    # 1.0 at positions holding the <eos> token, 0.0 elsewhere.
    mask = tf.to_float(tf.equal(self.x, self._eos))

    num_steps = config.num_steps
    hidden_size = config.hidden_size
    vocab_size = config.vocab_size
    embedding_size = config.embedding_size
    num_input = config.num_input
    use_img_feat = config.use_img_feat
    use_lstm = config.use_lstm
    combine_typ = config.combine_typ
    cls_hidden = config.cls_hidden
    use_residual = config.use_residual

    img_feat = tf.layers.dense(inputs=self.img_feat,
                               units=hidden_size,
                               activation=None)

    use_dropout = is_training and config.dropout_prob < 1

    def lstm_cell():
        # One basic LSTM cell, wrapped in a residual connection if asked.
        base = tf.contrib.rnn.BasicLSTMCell(hidden_size,
                                            forget_bias=1.0,
                                            state_is_tuple=True)
        return ResidualWrapper(base) if use_residual else base

    def attn_cell():
        # Adds output dropout around the cell while training.
        if use_dropout:
            return tf.contrib.rnn.DropoutWrapper(
                lstm_cell(), output_keep_prob=config.dropout_prob)
        return lstm_cell()

    cell = tf.contrib.rnn.MultiRNNCell(
        [attn_cell() for _ in range(config.num_layers)],
        state_is_tuple=True)

    if use_glove:
        embedding = tf.get_variable(
            "embedding",
            dtype=tf.float32,
            initializer=tf.constant(word_embedding))
    else:
        embedding = tf.get_variable(
            "embedding", [vocab_size, embedding_size],
            initializer=tf.random_uniform_initializer(minval=-1.0,
                                                      maxval=1.0))
    inputs = tf.nn.embedding_lookup(embedding, self.x)

    if use_img_feat == 'concat_bf_lstm':
        # The statements that used to follow this raise (tiling the image
        # feature onto every step) were unreachable and have been removed.
        raise Exception("use_img_feat=concat_bf_lstm not supported")

    if use_dropout:
        inputs = tf.nn.dropout(inputs, config.dropout_prob)

    if use_lstm:
        ta_d_outputs = tf.TensorArray(dtype=tf.float32,
                                      size=num_steps,
                                      dynamic_size=False,
                                      infer_shape=True)
        state = cell.zero_state(tf.shape(inputs)[0], tf.float32)
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                (output, state) = cell(inputs[:, time_step, :], state)
                ta_d_outputs = ta_d_outputs.write(time_step, output)
        # batch_size x seq_length x hidden_size
        ta_d_outputs = tf.transpose(ta_d_outputs.stack(), perm=[1, 0, 2])
        # Apply the mask: only outputs at <eos> positions survive the sum.
        mask = tf.expand_dims(mask, -1)
        mask = tf.tile(mask, tf.stack([1, 1, hidden_size]))
        masked_out = ta_d_outputs * mask
        output = tf.reduce_sum(masked_out, axis=1)
        output_context, output_candidate = tf.split(
            output, num_or_size_splits=num_input, axis=0)
    else:
        inputs = tf.reshape(inputs, [-1, num_steps * embedding_size])
        output_context, output_candidate = tf.split(
            inputs, num_or_size_splits=num_input, axis=0)

    print("-" * 80)
    if use_img_feat == 'concat_af_lstm':
        print(
            "Image feature concatenate after the contextfeature from LSTM")
        imgf_1, imgf_2 = tf.split(img_feat,
                                  num_or_size_splits=num_input,
                                  axis=0)
        output_context = tf.concat([imgf_1, output_context], axis=1)
    elif use_img_feat == 'only_img':
        print("Image Feature Replacing the Context Feature from LSTM")
        imgf_1, imgf_2 = tf.split(img_feat,
                                  num_or_size_splits=num_input,
                                  axis=0)
        output_context = imgf_1
    else:
        print("Not using image feature")
    print("-" * 80)

    # Combining candidate information with context information
    print("-" * 80)
    if combine_typ == 'concat':
        print("Directly concatenate context and candidate feature.")
        output = tf.concat([output_context, output_candidate], axis=1)
    elif combine_typ == 'bilinpool':  # compact bilinear
        print(
            "Use compact bilinear pooling between candidate/context features."
        )
        out_dim = 8192
        output_context = tf.expand_dims(tf.expand_dims(output_context, 1),
                                        1)
        output_candidate = tf.expand_dims(
            tf.expand_dims(output_candidate, 1), 1)
        output = compact_bilinear_pooling(output_context, output_candidate,
                                          out_dim)
        output = tf.reshape(output, [-1, out_dim])  # make static time shape
    else:
        print("Use only the candidate feature.")
        output = output_candidate
    print("-" * 80)

    # Classifier head: ``cls_hidden`` ReLU layers, then a 2-way output.
    # NOTE(review): dropout is assumed to follow every hidden layer (inside
    # the loop) -- confirm against the original layout.
    for _ in range(cls_hidden):
        output = tf.layers.dense(inputs=output,
                                 units=512,
                                 activation=tf.nn.relu)
        if use_dropout:
            output = tf.nn.dropout(output, config.dropout_prob)
    y = tf.layers.dense(inputs=output, units=2, activation=None)
    score = tf.nn.softmax(y, dim=-1, name=None)

    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=self.y_, logits=y))
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(self.y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    self._logits = y
    self._score = score
    self._loss = loss
    self._accuracy = accuracy

    if not is_training:
        return

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars),
                                      config.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.contrib.framework.get_or_create_global_step())
def __init__(self, worker_index, env, render, num_local_steps,
             learning_rate, entropy_regularization, max_gradient_norm,
             discount, summary_writer, summary_update_interval):
    """Create one A3C worker: a global network, a local copy and the ops
    that train the former from the latter's gradients.

    Args:
        worker_index: index of the worker thread running this agent; it is
            also used as the task number in the worker device string.
        env: simulator object the agent acts in.
        render: whether to display the game screen.
        num_local_steps: number of experiences used per worker per update.
        learning_rate: learning rate handed to the Adam optimizer.
        entropy_regularization: weight of the entropy term in the loss.
        max_gradient_norm: global-norm threshold for gradient clipping.
        discount: discount factor for future rewards.
        summary_writer: TensorFlow summary writer.
        summary_update_interval: training steps between summary updates.
    """
    self.worker_index = worker_index
    self.env = env
    self.render = render
    self.num_local_steps = num_local_steps
    self.discount = discount
    self.summary_writer = summary_writer
    self.summary_update_interval = summary_update_interval
    self.num_times_trained = 0

    device_kind = "gpu" if USE_GPU else "cpu"
    worker_device = '/job:thread/task:{}/{}:0'.format(
        worker_index, device_kind)

    # Global parameters; replica_device_setter arguments are
    # (ps_tasks, ps_device, worker_device).
    global_placement = tf.train.replica_device_setter(
        1, '/job:master', worker_device)
    with tf.device(global_placement):
        with tf.variable_scope('global'):
            self.global_network = a3c.PolicyNetwork()
            self.global_step = tf.get_variable(
                'global_step', [],
                tf.int32,
                tf.constant_initializer(0, tf.int32),
                trainable=False)

    # NOTE(review): everything below is assumed to sit inside the worker
    # device block -- confirm against the original layout.
    with tf.device(worker_device):
        with tf.variable_scope('local'):
            self.local_network = a3c.PolicyNetwork()
            self.local_network.global_step = self.global_step

        self.action = tf.placeholder(tf.int32, [None], 'Action')
        self.advantage = tf.placeholder(tf.float32, [None], 'Advantage')
        self.discounted_reward = tf.placeholder(tf.float32, [None],
                                                'Discounted_Reward')

        logits = self.local_network.action_logits

        # Policy loss, part I: advantage-weighted cross-entropy between the
        # policy logits and the chosen actions.
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=self.action)
        policy_loss = tf.reduce_sum(self.advantage * cross_entropy)

        # Policy loss, part II: subtract the weighted entropy, so that a
        # policy spreading probability over several actions is favoured.
        entropy = -tf.reduce_sum(
            tf.nn.softmax(logits) * tf.nn.log_softmax(logits))
        policy_loss -= entropy_regularization * entropy

        # Value loss: l2_loss of (value - discounted reward).
        value_loss = tf.nn.l2_loss(self.local_network.value -
                                   self.discounted_reward)

        self.loss = policy_loss + 0.5 * value_loss

        # Gradients come from the local network and are clipped by global
        # norm ...
        gradients = tf.gradients(self.loss, self.local_network.parameters)
        clipped, _ = tf.clip_by_global_norm(gradients, max_gradient_norm)

        # ... and are then applied to the global network's parameters.
        batch_size = tf.shape(self.local_network.s)[0]
        grads_and_vars = list(zip(clipped, self.global_network.parameters))
        apply_op = tf.train.AdamOptimizer(learning_rate).apply_gradients(
            grads_and_vars)
        bump_step = self.global_step.assign_add(batch_size)
        self.train_step = [apply_op, bump_step]

        # Ops copying every global parameter into its local counterpart.
        param_pairs = zip(self.local_network.parameters,
                          self.global_network.parameters)
        self.reset_local_network = [
            local_param.assign(global_param)
            for local_param, global_param in param_pairs
        ]

        # Per-sample summaries of the loss terms.
        per_sample = (
            ('model/loss', self.loss),
            ('model/policy_loss', policy_loss),
            ('model/value_loss', value_loss),
            ('model/entropy', entropy),
        )
        for tag, total in per_sample:
            tf.summary.scalar(tag, total / tf.to_float(batch_size))
        tf.summary.scalar('model/global_norm',
                          tf.global_norm(self.local_network.parameters))
        tf.summary.scalar('model/gradient_global_norm',
                          tf.global_norm(gradients))
        self.summary_step = tf.summary.merge_all()
def train(train_dir,
          config,
          dataset_fn,
          checkpoints_to_keep=5,
          keep_checkpoint_every_n_hours=1,
          num_steps=None,
          master='',
          num_sync_workers=0,
          num_ps_tasks=0,
          task=0):
    """Train loop.

    Builds the model graph, clips gradients according to
    ``config.hparams.clip_mode`` and hands the train op to
    ``tf.contrib.training.train``.

    Args:
        train_dir: directory for checkpoints and logs; created if missing.
        config: object providing ``model``, ``hparams``, ``data_converter``
            and ``train_examples_path``.
        dataset_fn: zero-argument callable producing the input dataset.
        checkpoints_to_keep: ``max_to_keep`` for the saver.
        keep_checkpoint_every_n_hours: passed through to the saver.
        num_steps: if truthy, stop at this global step.
        master: session master address.
        num_sync_workers: if truthy, wrap the optimizer in
            ``SyncReplicasOptimizer`` with this many replicas.
        num_ps_tasks: number of parameter-server tasks.
        task: task index; task 0 is the chief.

    Raises:
        ValueError: if ``config.hparams.clip_mode`` is neither ``'value'``
            nor ``'global_norm'``.
    """
    tf.gfile.MakeDirs(train_dir)
    is_chief = (task == 0)
    if is_chief:
        _trial_summary(config.hparams, config.train_examples_path, train_dir)

    with tf.Graph().as_default():
        with tf.device(
                tf.train.replica_device_setter(num_ps_tasks,
                                               merge_devices=True)):
            model = config.model
            model.build(config.hparams,
                        config.data_converter.output_depth,
                        is_training=True)

            optimizer = model.train(
                **_get_input_tensors(dataset_fn(), config))

            hooks = []
            if num_sync_workers:
                optimizer = tf.train.SyncReplicasOptimizer(
                    optimizer, num_sync_workers)
                hooks.append(optimizer.make_session_run_hook(is_chief))

            grads, var_list = zip(*optimizer.compute_gradients(model.loss))
            global_norm = tf.global_norm(grads)
            tf.summary.scalar('global_norm', global_norm)

            if config.hparams.clip_mode == 'value':
                clip_value = config.hparams.grad_clip
                clipped_grads = [
                    tf.clip_by_value(grad, -clip_value, clip_value)
                    for grad in grads
                ]
            elif config.hparams.clip_mode == 'global_norm':
                # Below the threshold, clip by global norm; otherwise
                # replace every gradient with zeros.
                clipped_grads = tf.cond(
                    global_norm < config.hparams.grad_norm_clip_to_zero,
                    lambda: tf.clip_by_global_norm(  # pylint:disable=g-long-lambda
                        grads,
                        config.hparams.grad_clip,
                        use_norm=global_norm)[0],
                    lambda: [tf.zeros(tf.shape(grad)) for grad in grads])
            else:
                raise ValueError('Unknown clip_mode: {}'.format(
                    config.hparams.clip_mode))

            train_op = optimizer.apply_gradients(
                zip(clipped_grads, var_list),
                global_step=model.global_step,
                name='train_step')

            logging_dict = {
                'global_step': model.global_step,
                'loss': model.loss
            }
            # Runs once at graph-construction time and prints the tensor
            # objects. The loss must be read from ``logging_dict``; the
            # original indexed ``logging`` by mistake.
            print("logging global step: ", logging_dict['global_step'],
                  " loss at this point: ", logging_dict['loss'], " \n")

            hooks.append(
                tf.train.LoggingTensorHook(logging_dict, every_n_iter=100))
            if num_steps:
                hooks.append(tf.train.StopAtStepHook(last_step=num_steps))

            scaffold = tf.train.Scaffold(saver=tf.train.Saver(
                max_to_keep=checkpoints_to_keep,
                keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours))
            tf.contrib.training.train(train_op=train_op,
                                      logdir=train_dir,
                                      scaffold=scaffold,
                                      hooks=hooks,
                                      save_checkpoint_secs=60,
                                      master=master,
                                      is_chief=is_chief)
def __init__(self, s_size, a_size, scope, trainer):
    """Build a distributional Q-network under variable scope ``scope``.

    Args:
        s_size: width of the flat input placeholder; the input is reshaped
            to [-1, 84, 84, 1] before the convolutions.
        a_size: number of actions.
        scope: variable scope name. Any scope other than ``'global'`` also
            gets loss, gradient and apply-gradient ops.
        trainer: optimizer whose ``apply_gradients`` is used to update the
            variables of the ``'global'`` scope.
    """
    with tf.variable_scope(scope):
        # Support of the value distribution: ``atoms`` evenly spaced
        # points from v_min to v_max.
        self.atoms = 21
        self.v_max = 10.
        self.v_min = -10.
        self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)
        self.z = [
            self.v_min + atom * self.delta_z for atom in range(self.atoms)
        ]

        # Network: three VALID-padded ReLU convolutions, a 512-unit dense
        # layer and a linear head with a_size * atoms outputs.
        self.inputs = tf.placeholder(shape=[None, s_size],
                                     dtype=tf.float32)
        self.imageIn = tf.reshape(self.inputs, shape=[-1, 84, 84, 1])
        self.conv1 = slim.conv2d(activation_fn=tf.nn.relu,
                                 inputs=self.imageIn,
                                 num_outputs=32,
                                 kernel_size=[8, 8],
                                 stride=[4, 4],
                                 padding='VALID')
        self.conv2 = slim.conv2d(activation_fn=tf.nn.relu,
                                 inputs=self.conv1,
                                 num_outputs=64,
                                 kernel_size=[4, 4],
                                 stride=[2, 2],
                                 padding='VALID')
        self.conv3 = slim.conv2d(activation_fn=tf.nn.relu,
                                 inputs=self.conv2,
                                 num_outputs=64,
                                 kernel_size=[3, 3],
                                 stride=[1, 1],
                                 padding='VALID')
        flattened = slim.flatten(self.conv3)
        dense = slim.fully_connected(flattened,
                                     512,
                                     activation_fn=tf.nn.relu)
        self.out = slim.fully_connected(
            dense,
            a_size * self.atoms,
            activation_fn=None,
            weights_initializer=normalized_columns_initializer(0.1),
            biases_initializer=None)
        self.out = tf.reshape(self.out, [-1, a_size, self.atoms])
        # Per-action distribution over atoms, and its expectation.
        self.p = tf.nn.softmax(self.out, dim=2)
        self.Q = tf.reduce_sum(self.z * self.p, axis=2)

        # Only worker networks need loss and gradient-update ops.
        # NOTE(review): this section is assumed to sit inside the variable
        # scope block -- confirm against the original layout.
        if scope != 'global':
            self.m_input = tf.placeholder(shape=[None, self.atoms],
                                          dtype=tf.float32)
            self.actions_p = tf.placeholder(
                shape=[None, a_size, self.atoms], dtype=tf.float32)
            # Select the chosen action's distribution via the mask input.
            self.p_actiona = tf.multiply(self.p, self.actions_p)
            self.p_action = tf.reduce_sum(self.p_actiona, axis=1)
            # log(m) - log(p), with a small constant guarding log(0).
            self.p_alog = -tf.log(self.p_action + 1e-20) + tf.log(
                self.m_input + 1e-20)
            per_sample = tf.reduce_sum(self.m_input * self.p_alog, axis=1)
            self.loss = tf.reduce_mean(per_sample)

            # Gradients w.r.t. this scope's variables, clipped to 40.0.
            worker_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope)
            self.gradients = tf.gradients(self.loss, worker_vars)
            self.var_norms = tf.global_norm(worker_vars)
            clipped, self.grad_norms = tf.clip_by_global_norm(
                self.gradients, 40.0)

            # Apply local gradients to global network
            shared_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
            self.apply_grads = trainer.apply_gradients(
                zip(clipped, shared_vars))
def __init__(self,
             vocab_size,
             batch_size,
             num_epochs,
             check_point_step,
             num_train_samples,
             num_valid_samples,
             num_layers,
             num_hidden_units,
             max_gradient_norm,
             initial_learning_rate=1,
             final_learning_rate=0.001):
    """Build an LSTM language model fed from text-line datasets.

    Args:
        vocab_size: vocabulary size (rows of both embedding matrices).
        batch_size: batch size for the training and validation datasets.
        num_epochs: stored on the instance; not used in this method.
        check_point_step: stored on the instance; not used in this method.
        num_train_samples: stored on the instance; not used in this method.
        num_valid_samples: stored on the instance; not used in this method.
        num_layers: number of stacked LSTM layers.
        num_hidden_units: LSTM size, also used as the embedding width.
        max_gradient_norm: global-norm threshold for gradient clipping.
        initial_learning_rate: starting learning rate.
        final_learning_rate: lower bound for the decayed learning rate.
    """
    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.num_epochs = num_epochs
    self.check_point_step = check_point_step
    self.num_train_samples = num_train_samples
    self.num_valid_samples = num_valid_samples
    self.num_layers = num_layers
    self.num_hidden_units = num_hidden_units
    self.max_gradient_norm = max_gradient_norm
    self.global_step = tf.Variable(0, trainable=False)

    # Dynamic learning rate: multiplied by 0.96 every 150 global steps
    # (staircase), never dropping below final_learning_rate.
    self.learning_rate = tf.train.exponential_decay(initial_learning_rate,
                                                    self.global_step,
                                                    150,
                                                    0.96,
                                                    staircase=True)
    self.learning_rate = tf.cond(
        tf.less(self.learning_rate, final_learning_rate),
        lambda: tf.constant(final_learning_rate),
        lambda: self.learning_rate)

    # Despite its name this value is passed as ``input_keep_prob`` below,
    # i.e. it is the probability of keeping a unit.
    self.dropout_rate = tf.placeholder(tf.float32, name="dropout_rate")

    self.file_name_train = tf.placeholder(tf.string)
    self.file_name_validation = tf.placeholder(tf.string)
    self.file_name_test = tf.placeholder(tf.string)

    def parse(line):
        # A line of whitespace-separated ids becomes (all but the last id,
        # all but the first id): input and next-token target.
        line_split = tf.string_split([line])
        input_seq = tf.string_to_number(line_split.values[:-1],
                                        out_type=tf.int32)
        output_seq = tf.string_to_number(line_split.values[1:],
                                         out_type=tf.int32)
        return input_seq, output_seq

    training_dataset = tf.data.TextLineDataset(
        self.file_name_train).map(parse).shuffle(256).padded_batch(
            self.batch_size, padded_shapes=([None], [None]))
    validation_dataset = tf.data.TextLineDataset(
        self.file_name_validation).map(parse).padded_batch(
            self.batch_size, padded_shapes=([None], [None]))
    test_dataset = tf.data.TextLineDataset(
        self.file_name_test).map(parse).batch(1)

    # One re-initialisable iterator shared by the three datasets.
    iterator = tf.contrib.data.Iterator.from_structure(
        training_dataset.output_types, training_dataset.output_shapes)
    self.input_batch, self.output_batch = iterator.get_next()

    # ``trining_init_op`` (sic) is kept for existing callers; the correctly
    # spelled alias refers to the same op.
    self.trining_init_op = iterator.make_initializer(training_dataset)
    self.training_init_op = self.trining_init_op
    self.validation_init_op = iterator.make_initializer(validation_dataset)
    self.test_init_op = iterator.make_initializer(test_dataset)

    # Input embedding mat
    self.input_embedding_mat = tf.get_variable(
        "input_embedding_mat", [self.vocab_size, self.num_hidden_units],
        dtype=tf.float32)
    self.input_embedded = tf.nn.embedding_lookup(self.input_embedding_mat,
                                                 self.input_batch)

    def make_cell():
        # One LSTM layer with dropout on its inputs.
        lstm = tf.contrib.rnn.LSTMCell(self.num_hidden_units,
                                       state_is_tuple=True)
        return tf.contrib.rnn.DropoutWrapper(
            lstm, input_keep_prob=self.dropout_rate)

    # Every layer gets its own cell object. ``[cell] * num_layers`` put the
    # same object in each layer, which shares (or refuses to build) the
    # weights depending on the TensorFlow version.
    self.cell = tf.contrib.rnn.MultiRNNCell(
        cells=[make_cell() for _ in range(self.num_layers)],
        state_is_tuple=True)

    # Output embedding
    self.output_embedding_mat = tf.get_variable(
        "output_embedding_mat", [self.vocab_size, self.num_hidden_units],
        dtype=tf.float32)
    self.output_embedding_bias = tf.get_variable("output_embedding_bias",
                                                 [self.vocab_size],
                                                 dtype=tf.float32)

    # sign() maps the zero padding id to 0 and positive ids to 1.
    non_zero_weights = tf.sign(self.input_batch)
    self.valid_words = tf.reduce_sum(non_zero_weights)

    # Compute sequence length
    def get_length(non_zero_place):
        real_length = tf.reduce_sum(non_zero_place, 1)
        real_length = tf.cast(real_length, tf.int32)
        return real_length

    batch_length = get_length(non_zero_weights)

    # The shape of outputs is [batch_size, max_length, num_hidden_units]
    outputs, _ = tf.nn.dynamic_rnn(cell=self.cell,
                                   inputs=self.input_embedded,
                                   sequence_length=batch_length,
                                   dtype=tf.float32)

    def output_embedding(current_output):
        return tf.add(
            tf.matmul(current_output,
                      tf.transpose(self.output_embedding_mat)),
            self.output_embedding_bias)

    # To compute the logits
    logits = tf.map_fn(output_embedding, outputs)
    logits = tf.reshape(logits, [-1, vocab_size])

    # Per-token cross-entropy, zeroed at padding positions.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=tf.reshape(self.output_batch, [-1]), logits=logits) \
        * tf.cast(tf.reshape(non_zero_weights, [-1]), tf.float32)
    self.loss = loss

    # Train
    params = tf.trainable_variables()
    opt = tf.train.AdagradOptimizer(self.learning_rate)
    gradients = tf.gradients(self.loss,
                             params,
                             colocate_gradients_with_ops=True)
    clipped_gradients, _ = tf.clip_by_global_norm(gradients,
                                                  self.max_gradient_norm)
    self.updates = opt.apply_gradients(zip(clipped_gradients, params),
                                       global_step=self.global_step)
# Earlier variants, kept for reference:
# finstate = array_ops.concat(1, [le_state, inp_state])
# outputs, finstate = rnn.rnn(neurons_out, outputs_le, finstate, scope="out")
# official seq2seq (perfect regression)
# outputs, finstate = ss.basic_rnn_seq2seq(inputs, targets, neurons)

# L2 loss summed over all (output, target) pairs, then normalised by the
# number of BPTT steps, the batch size and the network size.
_step_losses = [
    tf.nn.l2_loss(target - output)
    for output, target in zip(outputs, targets)
]
loss = tf.add_n(_step_losses) / bptt_steps / batch_size / net_size

# Learning rate lives in a non-trainable variable.
lr = tf.Variable(0.0, trainable=False)

# Gradients of the loss for every trainable variable, clipped to a global
# norm of 5.0.
tvars = tf.trainable_variables()
grads_raw = tf.gradients(loss, tvars)
grads, _ = tf.clip_by_global_norm(grads_raw, 5.0)

# Alternatives that were tried:
# optimizer = tf.train.GradientDescentOptimizer(lr)
# optimizer = tf.train.AdagradOptimizer(lr)
# optimizer = tf.train.RMSPropOptimizer(lr)
# optimizer = tf.train.AdadeltaOptimizer(lr)
optimizer = tf.train.AdamOptimizer(lr)

train_step = optimizer.apply_gradients(zip(grads, tvars))

# Training data is read from a fixed path under $HOME.
_train_data_path = pj(os.environ["HOME"], "Music", "ml",
                      "test_licks.data.npy")
train_data = np.load(_train_data_path)
input_size = train_data.shape[0]
corpus = dispatch_array(train_data, bptt_steps, batch_size)
sfn = 9
def build(self):
    """Build the tagging graph: inputs, encoders, loss and train ops.

    Reads ``self.params`` for ``char_attention``, ``dropout``,
    ``use_crf_loss``, ``lr_method``, ``lr_rate``, ``clip_norm`` and, for
    the momentum optimizer, ``momentum``.

    Raises:
        ValueError: if ``self.params['lr_method']`` names no supported
            optimizer.
    """
    self.word_input_ids = tf.placeholder(tf.int32, [None, None],
                                         name='word_input')
    self.tag_input_ids = tf.placeholder(tf.int32, [None, None],
                                        name='tag_input')
    self.dropout_keep_prob = tf.placeholder(tf.float32,
                                            name='dropout_keep_prob')
    self.sequence_lengths = tf.placeholder(tf.int32, [None],
                                           name='sequence_lengths')
    self.char_input_ids = tf.placeholder(tf.int32, [None, None, None],
                                         name='char_input')
    self.word_lengths = tf.placeholder(tf.int32, [None, None],
                                       name='word_lengths')

    # word embedding
    embedded_words = self._word_embedding(self.word_input_ids)
    self.batch_size = tf.shape(embedded_words)[0]
    self.max_sent_len = tf.shape(embedded_words)[1]

    # char embedding
    embedded_chars = self._char_embedding(self.char_input_ids)
    self.max_char_len = tf.shape(embedded_chars)[2]

    char_output, char_hiddens = self._char_lstm(embedded_chars,
                                                self.word_lengths)
    word_lstm_input = tf.concat([embedded_words, char_output], axis=-1)

    if self.params['char_attention']:
        context, self.batch_alphas = self._char_attention_layer(
            embedded_words, char_hiddens, self.word_lengths)
        word_lstm_input = tf.concat([word_lstm_input, context], axis=-1)

    if self.params['dropout']:
        word_lstm_input = tf.nn.dropout(word_lstm_input,
                                        self.dropout_keep_prob)

    word_bilstm_output = self._word_lstm(word_lstm_input,
                                         self.sequence_lengths)
    self.logits = self._label_prediction(word_bilstm_output)

    with tf.variable_scope('loss') as vs:
        if self.params['use_crf_loss']:
            log_likelihood, self.transition_params = \
                tf.contrib.crf.crf_log_likelihood(self.logits,
                                                  self.tag_input_ids,
                                                  self.sequence_lengths)
            self.word_loss = tf.reduce_mean(-log_likelihood,
                                            name='crf_negloglik_loss')
        else:
            # Softmax loss, averaged over non-padding positions only.
            self.pred_tags = tf.cast(tf.argmax(self.logits, axis=-1),
                                     tf.int32)
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.tag_input_ids)
            mask = tf.sequence_mask(self.sequence_lengths)
            losses = tf.boolean_mask(losses, mask)
            self.word_loss = tf.reduce_mean(losses)
        # Debug output: scope name and the trainable variables under it.
        # Formatted into one string so the output is the same on Python 2
        # and Python 3.
        print('%s %s' % (vs.name,
                         tf.get_collection(
                             tf.GraphKeys.TRAINABLE_VARIABLES,
                             scope=vs.name)))

    self.total_loss = self.word_loss

    # Optimization. One factory per supported method; the lambdas defer
    # reading params['momentum'] until the momentum optimizer is requested.
    lr_method = self.params['lr_method'].lower()
    lr_rate = self.params['lr_rate']
    optimizer_factories = {
        'adam': lambda: tf.train.AdamOptimizer(lr_rate),
        'adagrad': lambda: tf.train.AdagradOptimizer(lr_rate),
        'adadelta': lambda: tf.train.AdadeltaOptimizer(lr_rate),
        'sgd': lambda: tf.train.GradientDescentOptimizer(lr_rate),
        'rmsprop': lambda: tf.train.RMSPropOptimizer(lr_rate),
        'momentum': lambda: tf.train.MomentumOptimizer(
            lr_rate, self.params['momentum']),
    }
    if lr_method not in optimizer_factories:
        # Previously an unknown method left the optimizers undefined and
        # surfaced as a NameError further down.
        raise ValueError('Unknown lr_method: {!r}; expected one of {}'.format(
            self.params['lr_method'], sorted(optimizer_factories)))
    make_optimizer = optimizer_factories[lr_method]
    optimizer_total = make_optimizer()
    optimizer_word = make_optimizer()

    clip_norm = self.params['clip_norm']
    if clip_norm > 0:
        # Clip by global norm before applying, separately for each loss.
        grads, vs = zip(*optimizer_total.compute_gradients(self.total_loss))
        grads, _ = tf.clip_by_global_norm(grads, clip_norm)
        self.total_train_op = optimizer_total.apply_gradients(
            zip(grads, vs))

        grads, vs = zip(*optimizer_word.compute_gradients(self.word_loss))
        grads, _ = tf.clip_by_global_norm(grads, clip_norm)
        self.word_train_op = optimizer_word.apply_gradients(zip(grads, vs))
    else:
        self.total_train_op = optimizer_total.minimize(self.total_loss)
        self.word_train_op = optimizer_word.minimize(self.word_loss)

    return
def train():
    """Train the NER model, scoring the dev set and checkpointing as it goes.

    Registers the hyper-parameter flags, builds the ``NER`` graph with an Adam
    optimizer and global-norm gradient clipping, restores the newest
    checkpoint found under ``./ckpt_3/`` (or initializes every variable when
    there is none), then runs the training loop.  Whenever the global step is
    a multiple of ``evaluate_every`` the dev set is scored batch by batch and
    the model is saved if the summed dev loss is the lowest seen so far.

    Returns:
        None.
    """
    char2id, ner2id, pos2id = load_dict(char_dict="train_data_4/char2id.json",
                                        ner_dict="train_data_4/ner2id.json",
                                        pos_dict="train_data_4/pos2id.json")
    tf.flags.DEFINE_integer("vocab_size_c", len(char2id), "vocabulary size")
    tf.flags.DEFINE_integer("vocab_size_p", len(pos2id), "vocabulary size")
    tf.flags.DEFINE_integer("num_classes", len(ner2id), "number of classes")
    tf.flags.DEFINE_integer("max_num", 384, "max_sentence_num")
    tf.flags.DEFINE_integer(
        "embedding_size_c", 256,
        "Dimensionality of character embedding (default: 200)")
    tf.flags.DEFINE_integer(
        "embedding_size_p", 256,
        "Dimensionality of character embedding (default: 200)")
    tf.flags.DEFINE_integer(
        "hidden_size", 128,
        "Dimensionality of GRU hidden layer (default: 50)")
    tf.flags.DEFINE_integer("batch_size", 256, "Batch Size (default: 64)")
    tf.flags.DEFINE_integer("num_epochs", 10,
                            "Number of training epochs (default: 50)")
    tf.flags.DEFINE_integer("checkpoint_every", 100,
                            "Save model after this many steps (default: 100)")
    tf.flags.DEFINE_integer("num_checkpoints", 3,
                            "Number of checkpoints to store (default: 5)")
    tf.flags.DEFINE_integer("evaluate_every", 300,
                            "evaluate every this many batches")
    tf.flags.DEFINE_float("learning_rate", 0.01, "learning rate")
    tf.flags.DEFINE_float("grad_clip", 5,
                          "grad clip to prevent gradient explode")
    FLAGS = tf.flags.FLAGS

    # NOTE(review): `config` is not defined in this function; presumably a
    # module-level session configuration -- confirm.
    with tf.Session(config=config) as sess:
        ner = NER(vocab_size_c=FLAGS.vocab_size_c,
                  vocab_size_p=FLAGS.vocab_size_p,
                  num_classes=FLAGS.num_classes,
                  embedding_size_c=FLAGS.embedding_size_c,
                  embedding_size_p=FLAGS.embedding_size_p,
                  hidden_size=FLAGS.hidden_size,
                  max_num=FLAGS.max_num)

        # The optimizer is defined outside the model.
        global_step = tf.Variable(0, trainable=False)
        optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)

        # Clip gradients by global norm, as is common for RNNs, so that
        # exploding gradients do not destabilize training.
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(ner.loss, tvars),
                                          FLAGS.grad_clip)
        grads_and_vars = tuple(zip(grads, tvars))
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)

        saver = tf.train.Saver(tf.global_variables(),
                               max_to_keep=FLAGS.num_checkpoints)
        if not os.path.exists('./ckpt_3/'):
            os.makedirs("./ckpt_3/")

        # Restore the model if a checkpoint exists, otherwise initialize.
        ckpt = tf.train.get_checkpoint_state('./ckpt_3/')
        if ckpt:
            print("load saved model:\t", ckpt.model_checkpoint_path)
            # Reuse the saver built above: creating a fresh default Saver
            # here would silently drop the max_to_keep setting for the rest
            # of a resumed run.
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("init model...")
            sess.run(tf.global_variables_initializer())

        def extract(p):
            """Return the (start, end) index spans of tagged runs in ``p``.

            A label of 0 ends the scan, a label of 1 closes any open span,
            and any other label opens a span or, when it differs from the
            previous label, closes the open span and opens a new one.
            A span still open when ``p`` is exhausted is not emitted.
            """
            # presumably 0 is padding and 1 the "outside" tag -- verify
            # against ner2id.
            spans = []
            start = None
            for index, label in enumerate(p):
                if label == 0:
                    if start is not None:
                        spans.append((start, index))
                    break
                elif label == 1:
                    if start is not None:
                        spans.append((start, index))
                        start = None
                elif start is None:
                    start = index
                elif label != p[index - 1]:
                    spans.append((start, index))
                    start = index
            return spans

        def evaluate(viterbi_sequence, Y):
            """Count matching, predicted and gold spans over a batch.

            Returns:
                Tuple ``(TP, P_, R_)``: spans present in both prediction and
                gold, number of predicted spans, number of gold spans.
            """
            TP = 0
            P_ = 0
            R_ = 0
            for p, y in zip(viterbi_sequence, Y):
                pre_ = extract(p)
                # Set membership keeps the span comparison linear.
                tru_ = set(extract(y))
                TP += sum(1 for span in pre_ if span in tru_)
                P_ += len(pre_)
                R_ += len(tru_)
            return TP, P_, R_

        def prf(tp, p_, r_):
            """Turn span counts into (precision, recall, F1); 0 on empty."""
            p = float(tp) / p_ if p_ else 0
            r = float(tp) / r_ if r_ else 0
            f = 2 * p * r / (p + r) if p + r else 0
            return p, r, f

        def train_step(x, pos, y):
            """Run one optimization step, print its metrics, return the step."""
            feed_dict = {
                ner.input_chars: x,
                ner.input_pos: pos,
                ner.output: y,
                ner.is_training: True,
            }
            _, step, predicts_t, cost, _ = sess.run([
                train_op, global_step, ner.viterbi_sequence, ner.loss, ner.acc
            ], feed_dict)
            tp, p_, r_ = evaluate(np.array(predicts_t), y)
            time_str = str(int(time.time()))
            p, r, f = prf(tp, p_, r_)
            print("{}: step {}, loss {}, p {}, r {}, f {}".format(
                time_str, step, cost, p, r, f))
            return step

        def dev_step(x, pos, y, writer=None):
            """Score one dev batch; return ``(loss, TP, predicted, gold)``."""
            feed_dict = {
                ner.input_chars: x,
                ner.input_pos: pos,
                ner.output: y,
                ner.is_training: False,
            }
            step, predicts_d, cost, _ = sess.run(
                [global_step, ner.viterbi_sequence, ner.loss, ner.acc],
                feed_dict)
            tp, p_, r_ = evaluate(np.array(predicts_d), y)
            time_str = str(int(time.time()))
            p, r, f = prf(tp, p_, r_)
            print("+dev+{}: step {}, loss {}, p {}, r {}, f {}".format(
                time_str, step, cost, p, r, f))
            return cost, tp, p_, r_

        # Hard-coded example counts used to derive the number of steps.
        train_example_len = 173109
        dev_example_len = 21639
        num_train_steps = int(train_example_len / FLAGS.batch_size *
                              FLAGS.num_epochs)
        num_dev_steps = int(dev_example_len / FLAGS.batch_size)
        min_loss = 99999

        input_ids_train, input_pos_train, output_types_train = get_input_data(
            "./train_data_4/train_ner.tf_record", FLAGS.batch_size)
        input_ids_dev, input_pos_dev, output_types_dev = get_input_data(
            "./train_data_4/dev_ner.tf_record", FLAGS.batch_size)

        for i in range(num_train_steps):
            # Fetch one training batch.
            input_ids_train_, input_pos_train_, output_types_train_ = sess.run(
                [input_ids_train, input_pos_train, output_types_train])
            step = train_step(input_ids_train_, input_pos_train_,
                              output_types_train_)

            if step % FLAGS.evaluate_every == 0:
                # The dev set is too large for one pass, so it is scored in
                # batches and the counts are accumulated.
                TP = 0
                P_ = 0
                R_ = 0
                total_loss = 0
                for j in range(num_dev_steps):
                    input_ids_dev_, input_pos_dev_, output_types_dev_ = sess.run(
                        [input_ids_dev, input_pos_dev, output_types_dev])
                    loss, tp, p_, r_ = dev_step(input_ids_dev_,
                                                input_pos_dev_,
                                                output_types_dev_)
                    TP += tp
                    P_ += p_
                    R_ += r_
                    total_loss += loss
                p, r, f = prf(TP, P_, R_)
                print("tp:p", TP, p)
                print("p_:r", P_, r)
                print("r_:f", R_, f)

                # Checkpoint only when the summed dev loss improves.
                if total_loss < min_loss:
                    print("save model:\t%f\t>%f\t%f\t>%f" %
                          (total_loss, p, r, f))
                    min_loss = total_loss
                    saver.save(sess, './ckpt_3/ner.ckpt', global_step=step)
def train():
    """Train the very deep character-level CNN with periodic evaluation.

    Builds the input pipelines, model, loss/accuracy summaries and a momentum
    optimizer with global-norm gradient clipping, then trains for
    ``FLAGS.num_epochs`` epochs.  Every ``FLAGS.test_interval`` epochs the
    test set is evaluated, results are written to TensorBoard and to
    ``logs.txt`` under ``FLAGS.saved_path``, the model is saved when the test
    loss improves by more than ``FLAGS.es_min_delta``, and training stops
    early once ``FLAGS.es_patience`` (if positive) epochs pass without such
    an improvement.

    Returns:
        None.
    """
    num_classes = get_num_classes(FLAGS.train_set)
    model = Very_deep_cnn(batch_size=FLAGS.batch_size,
                          num_classes=num_classes,
                          depth=FLAGS.depth,
                          num_embedding=len(FLAGS.alphabet))

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = True

        training_set, num_training_iters = create_dataset(
            FLAGS.train_set, FLAGS.alphabet, FLAGS.max_length,
            FLAGS.batch_size, True)
        test_set, num_test_iters = create_dataset(
            FLAGS.test_set, FLAGS.alphabet, FLAGS.max_length,
            FLAGS.batch_size, False)
        train_iterator = training_set.make_initializable_iterator()
        test_iterator = test_set.make_initializable_iterator()

        # A feedable string handle selects the train or test iterator per run.
        handle = tf.placeholder(tf.string, shape=[])
        is_training = tf.placeholder(tf.bool, name='is_training')
        iterator = tf.data.Iterator.from_string_handle(
            handle, training_set.output_types, training_set.output_shapes)
        texts, labels = iterator.get_next()

        logits = model.forward(texts, is_training)
        loss = model.loss(logits, labels)
        loss_summary = tf.summary.scalar("loss", loss)
        accuracy = model.accuracy(logits, labels)
        accuracy_summary_op = tf.summary.scalar("accuracy", accuracy)
        batch_size = tf.unstack(tf.shape(texts))[0]
        confusion = model.confusion_matrix(logits, labels)

        global_step = tf.Variable(0, name="global_step", trainable=False)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        # Make the training ops depend on the collected update ops.
        with tf.control_dependencies(update_ops):
            lr = tf.train.exponential_decay(
                FLAGS.lr, global_step,
                FLAGS.num_epochs * num_training_iters, 0.96, staircase=True)
            optimizer = tf.train.MomentumOptimizer(lr, FLAGS.momentum)
            gradients, variables = zip(*optimizer.compute_gradients(loss))
            gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
            train_op = optimizer.apply_gradients(zip(gradients, variables),
                                                 global_step=global_step)

        merged = tf.summary.merge([loss_summary, accuracy_summary_op])
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        # Build the handle ops once; creating them inside the epoch loop
        # would add new ops to the graph on every epoch.
        train_handle_op = train_iterator.string_handle()
        test_handle_op = test_iterator.string_handle()

        # TensorBoard logs are always started fresh.
        if os.path.isdir(FLAGS.log_path):
            shutil.rmtree(FLAGS.log_path)
        os.makedirs(FLAGS.log_path)
        # The save directory is kept between runs, so only create it when it
        # is missing; an unconditional makedirs fails on every re-run.
        if not os.path.isdir(FLAGS.saved_path):
            os.makedirs(FLAGS.saved_path)

        best_loss = 1e5
        best_epoch = 0
        # The context manager closes the log file even if training raises.
        with open(FLAGS.saved_path + os.sep + "logs.txt", "w") as output_file:
            output_file.write("Model's parameters: {}".format(
                FLAGS.flag_values_dict()))

            with tf.Session(config=session_conf) as sess:
                train_writer = tf.summary.FileWriter(
                    FLAGS.log_path + os.sep + 'train', sess.graph)
                test_writer = tf.summary.FileWriter(FLAGS.log_path + os.sep +
                                                    'test')
                sess.run(init)

                for epoch in range(FLAGS.num_epochs):
                    sess.run(train_iterator.initializer)
                    sess.run(test_iterator.initializer)
                    train_handle = sess.run(train_handle_op)
                    test_handle = sess.run(test_handle_op)

                    # Training pass: run until the iterator is exhausted.
                    train_iter = 0
                    while True:
                        try:
                            _, tr_loss, tr_accuracy, summary, step = sess.run(
                                [train_op, loss, accuracy, merged,
                                 global_step],
                                feed_dict={
                                    handle: train_handle,
                                    is_training: True
                                })
                            print(
                                "Epoch: {}/{}, Iteration: {}/{}, Loss: {}, Accuracy: {}"
                                .format(epoch + 1, FLAGS.num_epochs,
                                        train_iter + 1, num_training_iters,
                                        tr_loss, tr_accuracy))
                            train_writer.add_summary(summary, step)
                            train_iter += 1
                        except (tf.errors.OutOfRangeError, StopIteration):
                            break

                    if epoch % FLAGS.test_interval == 0:
                        # Evaluation pass: weight each batch by its size so
                        # the means are per-sample.
                        loss_ls = []
                        test_loss_summary = tf.Summary()
                        accuracy_ls = []
                        test_accuracy_summary = tf.Summary()
                        confusion_matrix = np.zeros(
                            [num_classes, num_classes], np.int32)
                        num_samples = 0
                        while True:
                            try:
                                test_loss, test_accuracy, test_confusion, samples = sess.run(
                                    [loss, accuracy, confusion, batch_size],
                                    feed_dict={
                                        handle: test_handle,
                                        is_training: False
                                    })
                                loss_ls.append(test_loss * samples)
                                accuracy_ls.append(test_accuracy * samples)
                                confusion_matrix += test_confusion
                                num_samples += samples
                            except (tf.errors.OutOfRangeError, StopIteration):
                                break

                        mean_test_loss = sum(loss_ls) / num_samples
                        test_loss_summary.value.add(
                            tag='loss', simple_value=mean_test_loss)
                        test_writer.add_summary(test_loss_summary, epoch)
                        mean_test_accuracy = sum(accuracy_ls) / num_samples
                        test_accuracy_summary.value.add(
                            tag='accuracy', simple_value=mean_test_accuracy)
                        test_writer.add_summary(test_accuracy_summary, epoch)

                        output_file.write(
                            "Epoch: {}/{} \nTest loss: {} Test accuracy: {} \nTest confusion matrix: \n{}\n\n"
                            .format(epoch + 1, FLAGS.num_epochs,
                                    mean_test_loss, mean_test_accuracy,
                                    confusion_matrix))
                        print("Epoch: {}/{}, Final loss: {}, Final accuracy: {}".
                              format(epoch + 1, FLAGS.num_epochs,
                                     mean_test_loss, mean_test_accuracy))

                        # Save only on a sufficiently large improvement.
                        if mean_test_loss + FLAGS.es_min_delta < best_loss:
                            best_loss = mean_test_loss
                            best_epoch = epoch
                            saver.save(
                                sess,
                                "{}/char_level_cnn".format(FLAGS.saved_path))

                        # Early stopping is active only for positive patience.
                        if epoch - best_epoch > FLAGS.es_patience > 0:
                            print(
                                "Stop training at epoch {}. The lowest loss achieved is {} at epoch {}"
                                .format(epoch, best_loss, best_epoch))
                            break
def optimize(loss,
             global_step,
             max_grad_norm,
             lr,
             lr_decay,
             sync_replicas=False,
             replicas_to_aggregate=1,
             task_id=0):
    """Build the optimization sub-graph and return its training op.

    Gradients of ``loss`` are taken with respect to every trainable variable.
    Gradients of variables whose op name does not contain ``'embedding'`` are
    clipped jointly by global norm; the remaining gradients are left as they
    are.  The result is applied with Adam under a staircase exponential
    learning-rate decay, optionally wrapped in ``SyncReplicasOptimizer``, and
    exponential moving averages of the trainable variables are maintained.

    Args:
        loss: scalar loss to minimize.
        global_step: integer scalar Variable.
        max_grad_norm: float scalar; global-norm clipping threshold.
        lr: float scalar, initial learning rate.
        lr_decay: float scalar, learning-rate decay rate.
        sync_replicas: bool, whether to use SyncReplicasOptimizer.
        replicas_to_aggregate: int, number of replicas to aggregate when
            using SyncReplicasOptimizer.
        task_id: int, id of the current task; task 0 is treated as chief.

    Returns:
        train_op
    """
    with tf.name_scope('optimization'):
        trainable = tf.trainable_variables()
        raw_grads = tf.gradients(
            loss,
            trainable,
            aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)

        # Split the (gradient, variable) pairs by whether the variable's op
        # name mentions 'embedding'; only the others are clipped.
        embedding_pairs = []
        clip_pairs = []
        for grad, var in zip(raw_grads, trainable):
            if 'embedding' in var.op.name:
                embedding_pairs.append((grad, var))
            else:
                clip_pairs.append((grad, var))

        unclipped, clip_vars = zip(*clip_pairs)
        clipped, _ = tf.clip_by_global_norm(unclipped, max_grad_norm)
        clip_pairs = list(zip(clipped, clip_vars))
        grads_and_vars = embedding_pairs + clip_pairs

        _summarize_vars_and_grads(grads_and_vars)

        # Staircase decay with a decay period of one step.
        lr = tf.train.exponential_decay(lr,
                                        global_step,
                                        1,
                                        lr_decay,
                                        staircase=True)
        tf.summary.scalar('learning_rate', lr)
        opt = tf.train.AdamOptimizer(lr)

        # Moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            0.999, global_step)

        if not sync_replicas:
            # Plain optimizer: the train op waits for both the gradient
            # update and the moving-average update.
            variables_averages_op = variable_averages.apply(trainable)
            apply_gradient_op = opt.apply_gradients(grads_and_vars,
                                                    global_step)
            with tf.control_dependencies(
                [apply_gradient_op, variables_averages_op]):
                train_op = tf.no_op(name='train_op')
            return train_op

        # Synchronous replicas: the wrapper is handed the moving averages.
        opt = tf.train.SyncReplicasOptimizer(
            opt,
            replicas_to_aggregate,
            variable_averages=variable_averages,
            variables_to_average=trainable,
            total_num_replicas=replicas_to_aggregate)
        apply_gradient_op = opt.apply_gradients(grads_and_vars,
                                                global_step=global_step)
        with tf.control_dependencies([apply_gradient_op]):
            train_op = tf.no_op(name='train_op')

        # Register the initialization ops in graph collections.
        tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                             opt.get_chief_queue_runner())
        is_chief = task_id == 0
        if is_chief:
            local_init_op = opt.chief_init_op
            tf.add_to_collection('chief_init_op', opt.get_init_tokens_op())
        else:
            local_init_op = opt.local_step_init_op
        tf.add_to_collection('local_init_op', local_init_op)
        tf.add_to_collection('ready_for_local_init_op',
                             opt.ready_for_local_init_op)
        return train_op
def __init__(self, args, infer=False):
    """Build the recurrent language-model graph.

    Args:
        args: configuration object; this constructor reads ``model``,
            ``device``, ``num_layers``, ``rnn_size``, ``input_keep_prob``,
            ``output_keep_prob``, ``batch_size``, ``seq_length``,
            ``vocab_size`` and ``grad_clip``.
        infer: when true, ``args.batch_size`` and ``args.seq_length`` are
            overwritten with 1, dropout is skipped and the decoder feeds its
            own arg-max prediction back as the next input.

    Raises:
        Exception: if ``args.model`` is not one of rnn, gru, lstm or nas.
    """
    self.args = args
    if infer:
        # Inference runs one symbol at a time.
        args.batch_size = 1
        args.seq_length = 1

    # Cell classes are looked up by name so that only the selected class is
    # ever resolved on the `rnn` module.
    cell_class_names = {
        'rnn': 'BasicRNNCell',
        'gru': 'GRUCell',
        'lstm': 'BasicLSTMCell',
        'nas': 'NASCell',
    }
    if args.model not in cell_class_names:
        raise Exception("model type not supported: {}".format(args.model))
    cell_fn = getattr(rnn, cell_class_names[args.model])

    # NOTE(review): the extent of this device block is assumed to cover the
    # whole graph construction -- confirm against the intended placement.
    with tf.device(args.device):
        use_cell_dropout = not infer and (args.output_keep_prob < 1.0
                                          or args.input_keep_prob < 1.0)
        layers = []
        for _ in range(args.num_layers):
            layer = cell_fn(args.rnn_size)
            if use_cell_dropout:
                layer = rnn.DropoutWrapper(
                    layer,
                    input_keep_prob=args.input_keep_prob,
                    output_keep_prob=args.output_keep_prob)
            layers.append(layer)
        self.cell = cell = rnn.MultiRNNCell(layers, state_is_tuple=True)

        batch_shape = [args.batch_size, args.seq_length]
        self.input_data = tf.placeholder(tf.int32, batch_shape, name='input')
        self.targets = tf.placeholder(tf.int32, batch_shape, name='target')
        self.initial_state = cell.zero_state(args.batch_size, tf.float32)

        with tf.variable_scope('rnnlm'):
            softmax_w = tf.get_variable("softmax_w",
                                        [args.rnn_size, args.vocab_size])
            softmax_b = tf.get_variable("softmax_b", [args.vocab_size])

        embedding = tf.get_variable("embedding",
                                    [args.vocab_size, args.rnn_size])
        embedded = tf.nn.embedding_lookup(embedding, self.input_data)
        # The embedded inputs are dropped out with the *output* keep
        # probability whenever it is non-zero during training.
        if not infer and args.output_keep_prob:
            embedded = tf.nn.dropout(embedded, args.output_keep_prob)

        # One tensor per time step, each with the step axis squeezed away.
        step_inputs = [
            tf.squeeze(piece, [1])
            for piece in tf.split(embedded, args.seq_length, 1)
        ]

        def loop(prev, _):
            # Project the previous output to vocabulary logits and feed the
            # embedding of the arg-max symbol as the next input.
            prev = tf.matmul(prev, softmax_w) + softmax_b
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(embedding, prev_symbol)

        loop_function = loop if infer else None
        outputs, last_state = legacy_seq2seq.rnn_decoder(
            step_inputs,
            self.initial_state,
            cell,
            loop_function=loop_function,
            scope='rnnlm')
        output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])

        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)

        # NOTE(review): args.vocab_size is passed as the fourth positional
        # argument; in tf.contrib.legacy_seq2seq that slot looks like
        # `average_across_timesteps` -- confirm this is intended.
        flat_targets = tf.reshape(self.targets, [-1])
        unit_weights = tf.ones([args.batch_size * args.seq_length])
        loss = legacy_seq2seq.sequence_loss_by_example(
            [self.logits], [flat_targets], [unit_weights], args.vocab_size)

        # Zero out the loss at positions beyond each example's fed length.
        self.word_len = tf.placeholder(tf.int32,
                                       shape=[args.batch_size],
                                       name='word_lengths')
        mask = tf.sequence_mask(self.word_len,
                                args.seq_length,
                                dtype=tf.float32)
        mask = tf.reshape(mask, [-1])
        loss = tf.multiply(mask, loss)

        self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
        self.final_state = last_state

        # Non-trainable learning-rate variable, created at 0.0.
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        raw_grads = tf.gradients(self.cost, tvars)
        grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
        with tf.name_scope('optimizer'):
            optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        # TensorBoard instrumentation.
        tf.summary.histogram('logits', self.logits)
        tf.summary.histogram('loss', loss)
        tf.summary.scalar('train_loss', self.cost)
def __init__(self, is_training, config, ptb_input):
    """Build the stacked-LSTM language model graph.

    Args:
        is_training: when truthy, dropout is enabled (if
            ``config.keep_prob < 1``) and the training ops are created;
            otherwise construction stops after the cost and summaries.
        config: configuration object; this constructor reads
            ``hidden_size``, ``vocab_size``, ``keep_prob``, ``num_layers``
            and ``max_grad_norm``.
        ptb_input: input object providing ``batch_size``, ``num_steps``,
            ``input_data`` and ``targets``.
    """
    self._input = ptb_input
    self._is_training = is_training

    batch_size = ptb_input.batch_size
    num_steps = ptb_input.num_steps  # number of unrolled time steps
    hidden_size = config.hidden_size  # units per LSTM cell
    vocab_size = config.vocab_size  # width of the output layer

    use_dropout = is_training and config.keep_prob < 1

    def make_layer():
        """Return one LSTM cell, wrapped in output dropout when enabled."""
        base = rnn.BasicLSTMCell(hidden_size,
                                 forget_bias=0.0,
                                 reuse=tf.get_variable_scope().reuse)
        if use_dropout:
            return rnn.DropoutWrapper(base,
                                      output_keep_prob=config.keep_prob)
        return base

    # Stack num_layers cells: each layer's output feeds the next layer.
    layers = [make_layer() for _ in range(config.num_layers)]
    cell = rnn.MultiRNNCell(layers)

    # Initial state (one entry per layer).
    self._initial_state = cell.zero_state(batch_size, tf.float32)

    # Inputs: the embedding table and lookup are pinned to the CPU.
    with tf.device('/cpu:0'):
        embedding = tf.get_variable('embedding', (vocab_size, hidden_size),
                                    tf.float32)
        inputs = tf.nn.embedding_lookup(embedding, ptb_input.input_data)
    # presumably inputs is (batch_size, num_steps, hidden_size) -- it is
    # indexed as inputs[:, time_step, :] below.
    if use_dropout:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # Hidden layer: unroll the cell manually, reusing variables after the
    # first step.
    outputs = []
    state = self._initial_state
    with tf.variable_scope('RNN'):
        for time_step in range(num_steps):
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            step_input = inputs[:, time_step, :]
            cell_output, state = cell(step_input, state)
            outputs.append(cell_output)

    # Concatenate the per-step outputs and flatten to rows of hidden_size.
    stacked = tf.concat(outputs, 1)
    outputs_flat = tf.reshape(stacked, (-1, hidden_size))

    # Output layer.
    softmax_w = tf.get_variable('softmax_w', (hidden_size, vocab_size),
                                tf.float32)
    softmax_b = tf.get_variable('sotfmax_b', [vocab_size], tf.float32)
    logits = tf.nn.bias_add(tf.matmul(outputs_flat, softmax_w), softmax_b)

    # Per-position loss with unit weights; the cost is its sum divided by
    # the batch size.
    flat_targets = tf.reshape(ptb_input.targets, [-1])
    unit_weights = tf.ones([batch_size * num_steps])
    loss = legacy_seq2seq.sequence_loss_by_example([logits], [flat_targets],
                                                   [unit_weights])
    self._cost = tf.reduce_sum(loss) / batch_size
    self._final_state = state

    tf.summary.histogram('softmax_w', softmax_w)
    tf.summary.histogram('softmax_b', softmax_b)
    tf.summary.scalar('cost', self._cost)

    # Evaluation-only models need no optimizer.
    if not is_training:
        return

    # Optimization: SGD on globally clipped gradients of all trainable
    # variables.
    self._lr = tf.Variable(0.0, trainable=False)
    trainable_var = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, trainable_var)
    grads, _ = tf.clip_by_global_norm(raw_grads, config.max_grad_norm)
    optimizer = tf.train.GradientDescentOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(
        zip(grads, trainable_var),
        global_step=framework.get_or_create_global_step())

    # Feeding new_lr and running lr_update assigns a new learning rate.
    self.new_lr = tf.placeholder(tf.float32, [], name='new_learing_rate')
    self.lr_update = tf.assign(self.lr, self.new_lr)

    tf.summary.scalar('lr', self._lr)
    self._merge = tf.summary.merge_all()
def __init__(self, source_vocab_size, target_vocab_size, buckets, state_size,
             num_layers, embedding_size, max_gradient, batch_size,
             learning_rate, forward_only=False, dtype=tf.float32):
    """Build a bidirectional-GRU encoder / attention GRU decoder graph.

    Args:
        source_vocab_size: number of rows of the encoder embedding table.
        target_vocab_size: number of rows of the decoder embedding table and
            width of the decoder's output projection.
        buckets: stored on the instance; not otherwise used here.
        state_size: number of units of every GRU cell.
        num_layers: not referenced in this constructor.
        embedding_size: embedding dimensionality for both vocabularies.
        max_gradient: global-norm threshold for gradient clipping.
        batch_size: fixed batch size used in every placeholder shape.
        learning_rate: learning rate passed to the Adadelta optimizer.
        forward_only: when false, the training decoder, loss and update op
            are built (with dropout on all cells); when true, a greedy
            decoder and a single-step decoder for beam search are built.
        dtype: default dtype of the "seq2seq" variable scope.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.global_step = tf.Variable(0, trainable=False, name="global_step")
    self.state_size = state_size

    # Placeholders; every leading dimension is fixed to batch_size.
    self.encoder_inputs = tf.placeholder(tf.int32,
                                         shape=[self.batch_size, None])
    self.decoder_inputs = tf.placeholder(tf.int32,
                                         shape=[self.batch_size, None])
    self.decoder_targets = tf.placeholder(tf.int32,
                                          shape=[self.batch_size, None])
    self.encoder_len = tf.placeholder(tf.int32, shape=[self.batch_size])
    self.decoder_len = tf.placeholder(tf.int32, shape=[self.batch_size])
    # Inputs of the single-step decode: the token to feed and the previous
    # attention vector.
    self.beam_tok = tf.placeholder(tf.int32, shape=[self.batch_size])
    self.prev_att = tf.placeholder(tf.float32,
                                   shape=[self.batch_size, state_size * 2])

    encoder_fw_cell = tf.contrib.rnn.GRUCell(state_size)
    encoder_bw_cell = tf.contrib.rnn.GRUCell(state_size)
    decoder_cell = tf.contrib.rnn.GRUCell(state_size)

    # Output dropout (keep probability 0.5) is applied only when training.
    if not forward_only:
        encoder_fw_cell = tf.contrib.rnn.DropoutWrapper(
            encoder_fw_cell, output_keep_prob=0.50)
        encoder_bw_cell = tf.contrib.rnn.DropoutWrapper(
            encoder_bw_cell, output_keep_prob=0.50)
        decoder_cell = tf.contrib.rnn.DropoutWrapper(decoder_cell,
                                                     output_keep_prob=0.50)

    with tf.variable_scope("seq2seq", dtype=dtype):
        with tf.variable_scope("encoder"):
            # NOTE(review): emb_init is not defined in this method;
            # presumably a module-level initializer -- confirm.
            encoder_emb = tf.get_variable(
                "embedding", [source_vocab_size, embedding_size],
                initializer=emb_init)
            encoder_inputs_emb = tf.nn.embedding_lookup(
                encoder_emb, self.encoder_inputs)
            encoder_outputs, encoder_states = \
                tf.nn.bidirectional_dynamic_rnn(
                    encoder_fw_cell, encoder_bw_cell, encoder_inputs_emb,
                    sequence_length=self.encoder_len, dtype=dtype)

        with tf.variable_scope("init_state"):
            # The concatenated forward/backward final states are mapped to
            # the decoder's initial state by fc_layer.
            init_state = fc_layer(tf.concat(encoder_states, 1), state_size)
            # the shape of bidirectional_dynamic_rnn is weird
            # None for batch_size
            self.init_state = init_state
            self.init_state.set_shape([self.batch_size, state_size])
            # Attention memory: forward and backward outputs concatenated
            # on the last axis.
            self.att_states = tf.concat(encoder_outputs, 2)
            self.att_states.set_shape(
                [self.batch_size, None, state_size * 2])

        with tf.variable_scope("attention"):
            attention = tf.contrib.seq2seq.BahdanauAttention(
                state_size, self.att_states, self.encoder_len)
            decoder_cell = tf.contrib.seq2seq.DynamicAttentionWrapper(
                decoder_cell, attention, state_size * 2)
            # Decoder start state: encoder-derived cell state plus the fed
            # previous attention vector.
            wrapper_state = tf.contrib.seq2seq.DynamicAttentionWrapperState(
                self.init_state, self.prev_att)

        with tf.variable_scope("decoder") as scope:
            decoder_emb = tf.get_variable(
                "embedding", [target_vocab_size, embedding_size],
                initializer=emb_init)
            # Project decoder outputs to target-vocabulary logits.
            decoder_cell = tf.contrib.rnn.OutputProjectionWrapper(
                decoder_cell, target_vocab_size)

            if not forward_only:
                # Training: feed the given decoder inputs at every step.
                decoder_inputs_emb = tf.nn.embedding_lookup(
                    decoder_emb, self.decoder_inputs)
                helper = tf.contrib.seq2seq.TrainingHelper(
                    decoder_inputs_emb, self.decoder_len)
                decoder = tf.contrib.seq2seq.BasicDecoder(
                    decoder_cell, helper, wrapper_state)
                outputs, final_state = \
                    tf.contrib.seq2seq.dynamic_decode(decoder)
                outputs_logits = outputs[0]
                self.outputs = outputs_logits

                # Mask out positions beyond each target length; the loss is
                # summed over all positions and divided by the batch size.
                weights = tf.sequence_mask(self.decoder_len,
                                           dtype=tf.float32)
                loss_t = tf.contrib.seq2seq.sequence_loss(
                    outputs_logits, self.decoder_targets, weights,
                    average_across_timesteps=False,
                    average_across_batch=False)
                self.loss = tf.reduce_sum(loss_t) / self.batch_size

                # Adadelta on gradients clipped by global norm.
                params = tf.trainable_variables()
                opt = tf.train.AdadeltaOptimizer(self.learning_rate,
                                                 epsilon=1e-6)
                gradients = tf.gradients(self.loss, params)
                clipped_gradients, norm = \
                    tf.clip_by_global_norm(gradients, max_gradient)
                self.updates = opt.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step)
                tf.summary.scalar('loss', self.loss)
            else:
                # Inference: no loss is computed.
                self.loss = tf.constant(0)
                with tf.variable_scope("proj") as scope:
                    # NOTE(review): output_fn is not used anywhere below in
                    # this method.
                    output_fn = lambda x: fc_layer(
                        x, target_vocab_size, scope=scope)

                # Greedy decoding starting from ID_GO until ID_EOS.
                st_toks = tf.convert_to_tensor([data_util.ID_GO] *
                                               batch_size, dtype=tf.int32)
                helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    decoder_emb, st_toks, data_util.ID_EOS)
                decoder = tf.contrib.seq2seq.BasicDecoder(
                    decoder_cell, helper, wrapper_state)
                outputs, final_state = \
                    tf.contrib.seq2seq.dynamic_decode(decoder)
                self.outputs = outputs[0]

                # single step decode for beam search
                with tf.variable_scope("decoder", reuse=True):
                    beam_emb = tf.nn.embedding_lookup(
                        decoder_emb, self.beam_tok)
                    self.beam_outputs, self.beam_nxt_state, _, _ = \
                        decoder.step(0, beam_emb, wrapper_state)
                    self.beam_logsoftmax = \
                        tf.nn.log_softmax(self.beam_outputs[0])

    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=0)
    self.summary_merge = tf.summary.merge_all()
def __init__(self,
             source_vocab_size,
             target_vocab_size,
             buckets,
             size,
             num_layers,
             max_gradient_norm,
             batch_size,
             learning_rate,
             learning_rate_decay_factor,
             use_lstm=False,
             num_samples=512,
             forward_only=False,
             scope_name='seq2seq',
             dtype=tf.float32):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded
        accordingly. We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, the decoder feeds back its own previous output
        and outputs are projected for decoding. The gradient and update ops
        are built in either case.
      scope_name: name of the variable scope the model is built in; the saver
        only covers global variables whose name starts with it.
      dtype: the data type to use to store internal variables.
    """
    self.scope_name = scope_name
    with tf.variable_scope(self.scope_name):
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=dtype)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)
        self.dummy_dialogs = []  # [TODO] load dummy sentences

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary
        # size.
        if num_samples > 0 and num_samples < self.target_vocab_size:
            w_t = tf.get_variable("proj_w", [self.target_vocab_size, size],
                                  dtype=dtype)
            w = tf.transpose(w_t)
            b = tf.get_variable("proj_b", [self.target_vocab_size],
                                dtype=dtype)
            output_projection = (w, b)

            def sampled_loss(labels, inputs):
                labels = tf.reshape(labels, [-1, 1])
                # We need to compute the sampled_softmax_loss using 32bit
                # floats to avoid numerical instabilities.
                local_w_t = tf.cast(w_t, tf.float32)
                local_b = tf.cast(b, tf.float32)
                local_inputs = tf.cast(inputs, tf.float32)
                return tf.cast(
                    tf.nn.sampled_softmax_loss(
                        weights=local_w_t,
                        biases=local_b,
                        labels=labels,
                        inputs=local_inputs,
                        num_sampled=num_samples,
                        num_classes=self.target_vocab_size), dtype)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.  (A debugger
        # breakpoint that used to fire for use_lstm=True was removed: it
        # halted every non-interactive run.)
        def single_cell():
            if use_lstm:
                return tf.contrib.rnn.BasicLSTMCell(size)
            return tf.contrib.rnn.GRUCell(size)

        cell = single_cell()
        if num_layers > 1:
            cell = tf.contrib.rnn.MultiRNNCell(
                [single_cell() for _ in range(num_layers)])

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, feed_previous):
            return tf_seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=size,
                output_projection=output_projection,
                feed_previous=feed_previous,
                dtype=dtype)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        # Last bucket is the biggest one.
        for i in range(buckets[-1][0]):
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        for i in range(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(dtype,
                               shape=[None],
                               name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = self.decoder_inputs[1:]

        # Training outputs and losses.
        if forward_only:
            self.outputs, self.losses, self.encoder_state = tf_seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y: seq2seq_f(x, y, True),
                softmax_loss_function=softmax_loss_function)
            # If we use output projection, we need to project outputs for
            # decoding.
            if output_projection is not None:
                for b in range(len(buckets)):
                    self.outputs[b] = [
                        tf.matmul(output, output_projection[0]) +
                        output_projection[1] for output in self.outputs[b]
                    ]
        else:
            self.outputs, self.losses, self.encoder_state = tf_seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y: seq2seq_f(x, y, False),
                softmax_loss_function=softmax_loss_function)

        # Gradients and SGD update operation for training the model.  These
        # are built regardless of forward_only.
        params = tf.trainable_variables()
        self.gradient_norms = []
        self.updates = []
        # One advantage placeholder per bucket (for reinforcement learning);
        # not used by the losses built here.
        self.advantage = [
            tf.placeholder(tf.float32, name="advantage_%i" % i)
            for i in range(len(buckets))
        ]
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for b in range(len(buckets)):
            gradients = tf.gradients(self.losses[b], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(zip(clipped_gradients, params),
                                    global_step=self.global_step))

        # Save only the variables that belong to this model's scope.
        all_variables = [
            k for k in tf.global_variables()
            if k.name.startswith(self.scope_name)
        ]
        self.saver = tf.train.Saver(all_variables)
def __init__(self, sess, config, data_feed, log_dir):
    """Build the graph for a joint dialog-act / sentiment utterance tagger.

    The graph embeds the input token ids, runs them through a recurrent
    network, takes the output at the last valid timestep of each sequence
    and feeds it to two heads created via ``self.fnn``: one producing
    dialog-act logits and one producing sentiment logits. The training
    loss is the sum of both cross-entropies divided by the batch size.

    Args:
        sess: session whose ``graph`` is handed to the summary writer
            (only used when ``log_dir`` is not None).
        config: object providing ``init_lr``, ``lr_decay``, ``embed_size``,
            ``cell_type`` ("gru", "lstm" or "rnn"), ``cell_size``,
            ``keep_prob``, ``num_layer``, ``op`` ("adam", "rmsprop",
            anything else selects SGD) and ``grad_clip``.
        data_feed: object providing ``vocab``, ``feature_size``,
            ``DA_ID`` and ``SENTI_ID``.
        log_dir: directory under which a "train" summary sub-directory
            is created, or None to disable summary writing.

    Raises:
        ValueError: if ``config.cell_type`` is not a known cell type.
    """
    vocab_size = len(data_feed.vocab)
    self.data_feed = data_feed
    # Always define the attribute so callers can test it against None
    # instead of hitting an AttributeError when no log_dir was given.
    self.train_summary_writer = None

    with tf.name_scope("io"):
        self.inputs = tf.placeholder(
            dtype=tf.int32, shape=(None, None), name="input_seq")
        self.input_lens = tf.placeholder(
            dtype=tf.int32, shape=(None, ), name="seq_len")
        self.da_labels = tf.placeholder(
            dtype=tf.int32, shape=(None, ), name="dialog_acts")
        self.senti_labels = tf.placeholder(
            dtype=tf.float32,
            shape=(None, data_feed.feature_size[data_feed.SENTI_ID]),
            name="sentiments")

    # Non-trainable learning rate plus an op that multiplies it by lr_decay.
    self.learning_rate = tf.Variable(float(config.init_lr), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * config.lr_decay)

    # Dynamic dimensions of the (batch, time) input placeholder.
    max_sent_len = array_ops.shape(self.inputs)[1]
    batch_size = array_ops.shape(self.inputs)[0]

    with variable_scope.variable_scope("word-embedding"):
        embedding = tf.get_variable(
            "embedding", [vocab_size, config.embed_size], dtype=tf.float32)
        # Flatten the ids, look them up, then restore the
        # (batch, time, embed_size) layout.
        flat_inputs = tf.reshape(self.inputs, [-1])
        input_embedding = embedding_ops.embedding_lookup(
            embedding, flat_inputs)
        input_embedding = tf.reshape(
            input_embedding, [-1, max_sent_len, config.embed_size])

    with variable_scope.variable_scope("rnn"):
        if config.cell_type == "gru":
            cell = rnn_cell.GRUCell(config.cell_size)
        elif config.cell_type == "lstm":
            cell = rnn_cell.LSTMCell(
                config.cell_size, use_peepholes=False, forget_bias=1.0)
        elif config.cell_type == "rnn":
            cell = rnn_cell.BasicRNNCell(config.cell_size)
        else:
            raise ValueError("unknown RNN type")

        if config.keep_prob < 1.0:
            cell = rnn_cell.DropoutWrapper(
                cell,
                output_keep_prob=config.keep_prob,
                input_keep_prob=config.keep_prob)

        if config.num_layer > 1:
            cell = rnn_cell.MultiRNNCell(
                [cell] * config.num_layer, state_is_tuple=True)

        # sequence_length is passed so the RNN is aware of each
        # sequence's true length; the returned final state is unused here.
        outputs, _ = tf.nn.dynamic_rnn(
            cell,
            input_embedding,
            dtype=tf.float32,
            sequence_length=self.input_lens,
        )

        # Select the output at timestep (length - 1) of every sequence:
        # a one-hot mask over the time axis zeroes all other steps and
        # the sum over that axis collapses it.
        last_step_mask = tf.expand_dims(
            tf.one_hot(self.input_lens - 1, max_sent_len), -1)
        last_outputs = tf.reduce_sum(tf.mul(outputs, last_step_mask), 1)

    self.dialog_acts = self.fnn(
        last_outputs, data_feed.feature_size[data_feed.DA_ID], [100],
        "dialog_act_fnn")
    # NOTE: the scope name "setiment_fnn" is misspelled but kept as-is,
    # since renaming it would change the variable names under that scope.
    self.sentiments = self.fnn(
        last_outputs, data_feed.feature_size[data_feed.SENTI_ID], [100],
        "setiment_fnn")

    # Sum of both cross-entropies, averaged over the batch.
    da_loss = tf.reduce_sum(
        nn_ops.sparse_softmax_cross_entropy_with_logits(
            self.dialog_acts, self.da_labels))
    senti_loss = tf.reduce_sum(
        nn_ops.softmax_cross_entropy_with_logits(
            self.sentiments, self.senti_labels))
    self.loss = da_loss + senti_loss
    self.loss /= tf.to_float(batch_size)

    tf.scalar_summary("entropy_loss", self.loss)
    self.summary_op = tf.merge_all_summaries()

    # List the trainable variables (no weight decay is applied to them).
    tvars = tf.trainable_variables()
    for v in tvars:
        print("Trainable %s" % v.name)

    # optimization
    if config.op == "adam":
        print("Use Adam")
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
    elif config.op == "rmsprop":
        print("Use RMSProp")
        optimizer = tf.train.RMSPropOptimizer(self.learning_rate)
    else:
        print("Use SGD")
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)

    # Clip gradients by their global norm before applying them.
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.loss, tvars), config.grad_clip)
    self.train_ops = optimizer.apply_gradients(zip(grads, tvars))
    self.saver = tf.train.Saver(
        tf.all_variables(), write_version=tf.train.SaverDef.V2)

    if log_dir is not None:
        train_log_dir = os.path.join(log_dir, "train")
        print("Save summary to %s" % log_dir)
        self.train_summary_writer = tf.train.SummaryWriter(
            train_log_dir, sess.graph)