def after_apply(self):
    """Build the ops that refresh the optimizer statistics after an update.

    Creates a new exponential moving averager, computes the squared norm of
    every gradient in ``self._grads`` (each colocated with its variable),
    updates the running averages of those norms, and then collects the ops
    returned by ``curvature_range()``, ``grad_variance()`` and
    ``dist_to_opt()``.

    Returns:
        A single grouped op that runs every statistic update built here.
    """
    self._moving_averager = tf.train.ExponentialMovingAverage(
        decay=self._beta, zero_debias=self._zero_debias)
    assert self._grads is not None and len(self._grads) > 0
    after_apply_ops = []

    # Per-variable g**2, computed on the device of the matching variable.
    self._grad_squared = []
    for v, g in zip(self._tvars, self._grads):
        with ops.colocate_with(v):
            self._grad_squared.append(tf.square(g))
    # Per-variable squared gradient norm.
    self._grad_norm_squared = [
        tf.reduce_sum(grad_squared) for grad_squared in self._grad_squared]

    # The running average of the squared gradient norm is shared by
    # grad_variance() and dist_to_opt().
    avg_op = self._moving_averager.apply(self._grad_norm_squared)
    with tf.control_dependencies([avg_op]):
        self._grad_norm_squared_avg = [
            self._moving_averager.average(val)
            for val in self._grad_norm_squared]
        # Collapse the per-variable lists into scalar totals.
        self._grad_norm_squared = tf.add_n(self._grad_norm_squared)
        self._grad_norm_squared_avg = tf.add_n(self._grad_norm_squared_avg)
    after_apply_ops.append(avg_op)

    # The remaining statistics read the averages, so build them after avg_op.
    with tf.control_dependencies([avg_op]):
        curv_range_ops = self.curvature_range()
        after_apply_ops += curv_range_ops
        grad_var_ops = self.grad_variance()
        after_apply_ops += grad_var_ops
        dist_to_opt_ops = self.dist_to_opt()
        after_apply_ops += dist_to_opt_ops

    return tf.group(*after_apply_ops)
def testTensorArrayReadTwice(self):
    """A second read of an index fails unless clear_after_read is False."""
    with self.test_session(use_gpu=self._use_gpu):
        value = tf.constant([[1.0, -1.0], [10.0, -10.0]])

        # Array built with the default clear_after_read setting: the second
        # read of index 0 is expected to raise.
        clearing_array = tensor_array_ops.TensorArray(
            dtype=tf.float32, tensor_array_name="foo", size=2)
        clearing_written = clearing_array.unpack(value)
        first_read = clearing_written.read(0)
        with tf.control_dependencies([first_read]):
            second_read = clearing_written.read(0)
        expected_error = (
            r"Could not read index 0 twice because it was cleared after a "
            r"previous read \(perhaps try setting clear_after_read = false\?\)")
        with self.assertRaisesOpError(expected_error):
            second_read.eval()

        # Array built with clear_after_read=False: the second read succeeds.
        keeping_array = tensor_array_ops.TensorArray(
            dtype=tf.float32,
            tensor_array_name="foo",
            size=2,
            clear_after_read=False)
        keeping_written = keeping_array.unpack(value)
        first_read = keeping_written.read(0)
        with tf.control_dependencies([first_read]):
            second_read = keeping_written.read(0)
        self.assertAllEqual([1.0, -1.0], second_read.eval())
def moving_average(value, window):
    """Return the mean of `value` over the most recent `window` evaluations.

    State is held in a FIFO queue of capacity `window` plus two non-trainable
    variables (running total and number of evaluations). On the first
    evaluation (empty queue) the queue is filled with `window` zero tensors
    and both variables are reset.

    Args:
        value: Tensor to average; it is converted to float32.
        window: Number of most recent values to average over.

    Returns:
        Tensor holding the running total divided by
        ``min(evaluations, window) + EPSILON``.
    """
    value = tf.to_float(value)
    value_shape = value.get_shape()

    zeros_for_queue = tf.zeros(tf.TensorShape(window).concatenate(value_shape))
    zeros_for_total = tf.zeros(value_shape)
    zero_count = tf.constant(0, dtype=tf.float32)

    history = tf.FIFOQueue(window, [tf.float32], shapes=[value_shape])
    running_total = tf.Variable(zeros_for_total, trainable=False)
    seen = tf.Variable(zero_count, trainable=False)

    def _fill_and_reset():
        # Pre-fill the queue so every later step can dequeue one element.
        return tf.group(
            history.enqueue_many(zeros_for_queue),
            running_total.assign(zeros_for_total),
            seen.assign(zero_count))

    def _already_initialised():
        return tf.no_op()

    init = tf.cond(
        tf.equal(history.size(), 0), _fill_and_reset, _already_initialised)

    with tf.control_dependencies([init]):
        # Slide the window: add the new value, drop the oldest one.
        new_total = running_total + value - history.dequeue()
        new_seen = seen + 1
        averaged = new_total / (tf.minimum(new_seen, window) + EPSILON)

        store_state = [
            history.enqueue([value]),
            running_total.assign(new_total),
            seen.assign(new_seen),
        ]
        with tf.control_dependencies(store_state):
            return tf.identity(averaged)
def update_parameters(self, loss):
    """Build the training step for `loss` and store it in ``self.step``.

    Optionally adds a norm penalty over all trainable variables, clips each
    gradient element-wise to ``[-grad_clip, grad_clip]``, applies the
    gradients after the ops in ``tf.GraphKeys.UPDATE_OPS`` and, when
    parameter averaging is enabled, updates ``self.ema`` after the step.
    Finally logs the global and trainable variables and their total size.

    Args:
        loss: Scalar loss tensor to minimise.
    """
    if self.regularization_constant != 0:
        # Sum of the (non-squared) L2 norms of every trainable variable.
        l2_norm = tf.reduce_sum(
            [tf.sqrt(tf.reduce_sum(tf.square(param)))
             for param in tf.trainable_variables()])
        loss = loss + self.regularization_constant * l2_norm

    optimizer = self.get_optimizer(self.learning_rate_var, self.beta1_decay_var)
    grads = optimizer.compute_gradients(loss)
    # compute_gradients returns (None, var) for variables the loss does not
    # depend on; tf.clip_by_value cannot take None, so pass those through.
    clipped = [
        (g if g is None
         else tf.clip_by_value(g, -self.grad_clip, self.grad_clip), v_)
        for g, v_ in grads
    ]

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        step = optimizer.apply_gradients(clipped, global_step=self.global_step)

    if self.enable_parameter_averaging:
        # Create the averaging op under the dependency on `step` so that the
        # averages are computed from the freshly updated parameters.
        with tf.control_dependencies([step]):
            maintain_averages_op = self.ema.apply(tf.trainable_variables())
            self.step = tf.group(maintain_averages_op)
    else:
        self.step = step

    logging.info('all parameters:')
    logging.info(pp.pformat(
        [(var.name, shape(var)) for var in tf.global_variables()]))

    logging.info('trainable parameters:')
    logging.info(pp.pformat(
        [(var.name, shape(var)) for var in tf.trainable_variables()]))

    logging.info('trainable parameter count:')
    # Built-in sum: calling np.sum on a generator is deprecated in NumPy.
    logging.info(str(
        sum(np.prod(shape(var)) for var in tf.trainable_variables())))
def get_run_op():
    """Build a chain of square matmul layers spread over the GPUs.

    A non-trainable ``data`` variable of shape
    ``[batch_size // num_cuts, hidden_size]`` is fed ``num_cuts`` times
    through ``num_layers`` layers. Each layer owns one
    ``[hidden_size, hidden_size]`` weight, is placed on GPU
    ``i * num_gpus // num_layers``, and processes its slices one after
    another through control dependencies.

    NOTE: the input/output projections, the loss and the optimizer are not
    built here; only the forward matmuls are.

    Returns:
        Tuple ``(init_op, train_op)``; ``train_op`` is a no-op that depends
        on every output of the last layer.
    """
    # Floor division keeps the slice size an integer on Python 3 as well;
    # it is used as a tensor dimension below.
    slice_size = FLAGS.batch_size // FLAGS.num_cuts
    print('Slice size:{}'.format(slice_size))

    data = None
    # Index 0 is a placeholder op so that slice k lives at index k + 1.
    last_fc = [tf.no_op()]
    with tf.device('/gpu:0'):
        data = tf.get_variable(
            name='data',
            shape=[slice_size, FLAGS.hidden_size],
            trainable=False)
    for i in xrange(FLAGS.num_cuts):
        last_fc.append(data)

    for i in xrange(FLAGS.num_layers):
        dev = '/gpu:%d' % (i * FLAGS.num_gpus // FLAGS.num_layers)
        with tf.device(dev), scopes.arg_scope([variables.variable], device=dev):
            tmp_fc = [tf.no_op()]
            with tf.variable_scope('fc%d' % i):
                w = tf.get_variable(
                    name='w',
                    shape=[FLAGS.hidden_size, FLAGS.hidden_size],
                    trainable=True)
                for k in xrange(FLAGS.num_cuts):
                    # Serialise the slices of this layer: slice k waits for
                    # slice k - 1.
                    with tf.control_dependencies([tmp_fc[-1]]):
                        tmp_fc.append(tf.matmul(last_fc[k + 1], w))
            last_fc = tmp_fc
            if i == FLAGS.num_layers - 1:
                with tf.control_dependencies(last_fc):
                    train_op = tf.no_op()

    init_op = tf.initialize_all_variables()
    return init_op, train_op
def train(total_loss, global_step):
    """Build the SGD training op with a staircase-decayed learning rate.

    Args:
        total_loss: Loss tensor to minimise.
        global_step: Variable counting training steps; it drives the
            learning-rate decay and is incremented when gradients are applied.

    Returns:
        A no-op named "train" that depends on the gradient application.
    """
    num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
    decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

    lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                    global_step,
                                    decay_steps,
                                    LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)
    tf.scalar_summary("learning_rate", lr)

    loss_averages_op = _add_loss_summaries(total_loss)

    # Compute gradients only after the loss summaries have been updated.
    with tf.control_dependencies([loss_averages_op]):
        opt = tf.train.GradientDescentOptimizer(lr)
        grads = opt.compute_gradients(total_loss)

    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    for var in tf.trainable_variables():
        tf.histogram_summary(var.op.name, var)

    for grad, var in grads:
        # Compare against None explicitly: the truth value of a Tensor is
        # not defined and `if grad:` raises a TypeError.
        if grad is not None:
            tf.histogram_summary(var.op.name + "/gradients", grad)

    # NOTE: moving averages of the trainable variables are not tracked here.
    with tf.control_dependencies([apply_gradient_op]):
        train_op = tf.no_op(name="train")

    return train_op
def _outputs_with_release(self, handle, inputs, outputs):
    """Release the ComputeSession before any output can be fetched.

    Every entry of `outputs` is replaced in place by an identity op that
    depends on the release op, and the release op itself depends on all of
    the original outputs.

    Args:
        handle: Handle to the ComputeSession that all computation so far has
            depended on. It is released here.
        inputs: Dict of nodes that are passed through without added
            dependencies.
        outputs: Dict of nodes whose access must trigger the release. This
            dict is modified in place.

    Returns:
        A dict holding the input nodes, the wrapped output nodes and a 'run'
        entry that only performs the release.
    """
    with tf.control_dependencies(outputs.values()):
        with tf.name_scope('ComputeSession'):
            release_op = dragnn_ops.release_session(handle)
        run_op = tf.group(release_op, name='run')
        for key in outputs:
            with tf.control_dependencies([release_op]):
                released = tf.identity(outputs[key], name=key)
            outputs[key] = released

    all_nodes = inputs.copy()
    all_nodes.update(outputs)

    # Alias for running without collecting outputs, e.g. during training.
    all_nodes['run'] = run_op
    return all_nodes
def backward_grads(self, y, dy, training=True):
    """Reconstruct the block inputs and compute gradients by hand.

    Args:
        y: Pair ``(y1, y2)`` of block outputs.
        dy: Pair ``(dy1, dy2)`` of gradients with respect to the outputs.
        training: Forwarded to ``self.f`` and ``self.g``.

    Returns:
        Tuple ``(x, dx, grads)`` where ``x = (x1, x2)`` are the reconstructed
        inputs, ``dx = (dx1, dx2)`` the gradients with respect to them, and
        ``grads`` the gradients for the trainable variables of ``self.f``
        followed by those of ``self.g``.
    """
    dy1, dy2 = dy
    y1, y2 = y

    # Differentiate g at y1, seeded with dy2.
    with tf.GradientTape() as g_tape:
        g_tape.watch(y1)
        gy1 = self.g(y1, training=training)
    g_grads = g_tape.gradient(
        gy1, [y1] + self.g.trainable_variables, output_gradients=dy2)
    dg = g_grads[1:]
    dx1 = dy1 + g_grads[0]

    # This doesn't affect eager execution, but improves memory efficiency
    # with graphs.
    with tf.control_dependencies(dg + [dx1]):
        x2 = y2 - gy1

    # Differentiate f at the reconstructed x2, seeded with dx1.
    with tf.GradientTape() as f_tape:
        f_tape.watch(x2)
        fx2 = self.f(x2, training=training)
    f_grads = f_tape.gradient(
        fx2, [x2] + self.f.trainable_variables, output_gradients=dx1)
    df = f_grads[1:]
    dx2 = dy2 + f_grads[0]

    # Same behavior as above.
    with tf.control_dependencies(df + [dx2]):
        x1 = y1 - fx2

    return (x1, x2), (dx1, dx2), df + dg
def loop_body(i):
    """Add 1 to var_a, then add var_a to var_b, then return ``i + 1``.

    The three ops are chained with control dependencies so they run in that
    order.
    """
    bump_a = tf.assign_add(var_a, 1, name="a_add")
    with tf.control_dependencies([bump_a]):
        bump_b = tf.assign_add(var_b, var_a, name="b_add")
    with tf.control_dependencies([bump_b]):
        next_i = tf.add(i, 1, name="i_add")
    return next_i
def batch_norm(value, is_train=True, name='batch_norm', epsilon=1e-5,
               momentum=0.9):
    """Batch-normalise `value` over axes 0, 1 and 2.

    In training mode the batch moments are used for normalisation, an
    exponential moving average of them is updated, and the averaged values
    are copied into the ``moving_mean`` / ``moving_variance`` variables. In
    inference mode those two variables are used instead of batch moments.

    Args:
        value: Input tensor; the last dimension is the channel dimension.
            The training branch reduces over axes [0, 1, 2].
        is_train: Python bool selecting the training or inference branch.
        name: Variable scope name (opened with ``reuse=tf.AUTO_REUSE``).
        epsilon: Small constant added to the variance in both branches.
        momentum: Decay of the exponential moving average.

    Returns:
        The normalised tensor.
    """
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        ema = tf.train.ExponentialMovingAverage(decay=momentum)
        shape = value.get_shape().as_list()[-1]
        beta = bias('beta', [shape], bias_start=0.0)
        gamma = bias('gamma', [shape], bias_start=1.0)

        if is_train:
            batch_mean, batch_variance = tf.nn.moments(
                value, [0, 1, 2], name='moments')

            moving_mean = bias('moving_mean', [shape], 0.0, False)
            moving_variance = bias('moving_variance', [shape], 1.0, False)

            ema_apply_op = ema.apply([batch_mean, batch_variance])
            with tf.control_dependencies([ema_apply_op]):
                # Read the averages only after they have been refreshed, so
                # the moving statistics never lag a step behind.
                assign_mean = moving_mean.assign(
                    ema.average(batch_mean).read_value())
                assign_variance = moving_variance.assign(
                    ema.average(batch_variance).read_value())
                mean, variance = \
                    tf.identity(batch_mean), tf.identity(batch_variance)

            with tf.control_dependencies([assign_mean, assign_variance]):
                # Use the caller's epsilon, as the inference branch does.
                return tf.nn.batch_normalization(
                    value, mean, variance, beta, gamma, epsilon)
        else:
            mean = bias('moving_mean', [shape], 0.0, False)
            variance = bias('moving_variance', [shape], 1.0, False)

            return tf.nn.batch_normalization(
                value, mean, variance, beta, gamma, epsilon)
def train(self, total_loss):
    """Build the Adam training op with loss and variable moving averages.

    Args:
        total_loss: Loss tensor to minimise.

    Returns:
        A no-op named 'train' that depends on the gradient application and
        on the update of the variable moving averages.
    """
    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
    losses = tf.get_collection('losses')
    loss_averages_op = loss_averages.apply(losses + [total_loss])

    for loss in losses + [total_loss]:
        tf.scalar_summary(loss.op.name + ' (raw)', loss)

    # Compute gradients only after the loss averages have been updated.
    with tf.control_dependencies([loss_averages_op]):
        opt = tf.train.AdamOptimizer()
        grads = opt.compute_gradients(total_loss)

    apply_gradient_op = opt.apply_gradients(grads)

    # Histograms for the variables and their gradients.
    for var in tf.trainable_variables():
        tf.histogram_summary(var.op.name, var)

    for grad, var in grads:
        if grad is not None:
            tf.histogram_summary(var.op.name + '/gradients', grad)

    # Track the moving averages of all trainable variables. The averaging op
    # is created under a dependency on the gradient application so that it
    # always averages the updated values.
    variable_averages = tf.train.ExponentialMovingAverage(
        Recognizer.MOVING_AVERAGE_DECAY)
    with tf.control_dependencies([apply_gradient_op]):
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())

    with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
        train_op = tf.no_op(name='train')

    return train_op
def build_eval_graph(self):
    """Create the accumulators, update ops and summaries used in evaluation.

    Sets ``total_loss``, ``total_correct`` and ``example_count`` (variables
    kept out of every collection), the derived ``mean_loss`` and
    ``accuracy`` tensors, the ``eval_reset`` and ``eval_step`` ops, and the
    merged ``summaries`` op.
    """
    def _accumulator():
        # Float counter that is not added to any graph collection.
        return tf.Variable(0.0, trainable=False, collections=[])

    # Running totals over the batches of one evaluation pass.
    self.total_loss = _accumulator()
    self.total_correct = _accumulator()
    self.example_count = _accumulator()

    # Means derived from the totals.
    self.mean_loss = self.total_loss / self.example_count
    self.accuracy = self.total_correct / self.example_count

    # Ops that add one batch to the totals.
    add_loss = self.total_loss.assign_add(self.model.total_loss)
    correct_in_batch = tf.reduce_sum(
        tf.cast(self.model.correct_predictions, "float"))
    add_correct = self.total_correct.assign_add(correct_in_batch)
    add_examples = self.example_count.assign_add(self.model.batch_size)

    # Re-runs the initializers of all totals. Should be called before
    # starting the evaluation of a data set.
    initializers = [
        self.total_loss.initializer,
        self.total_correct.initializer,
        self.example_count.initializer,
    ]
    with tf.control_dependencies(initializers):
        self.eval_reset = tf.no_op()

    # Accumulates one batch. Should be called for each batch in the
    # evaluation set.
    with tf.control_dependencies([add_loss, add_correct, add_examples]):
        self.eval_step = tf.no_op()

    # Summaries
    loss_summary = tf.scalar_summary("mean_loss", self.mean_loss)
    accuracy_summary = tf.scalar_summary("accuracy", self.accuracy)
    self.summaries = tf.merge_summary([loss_summary, accuracy_summary])
def _apply(self, grad, var, indices=None):
    """Build the update op for one variable.

    Slot "m" is updated as ``beta1 * m + (1 - beta1) * grad`` and slot "v"
    as ``beta2 * v + (1 - beta2) * (grad - m)**2``. The variable is then
    decreased by ``lr * grad * min(v / ((grad - m)**2 + epsilon), 1)``.

    Args:
        grad: Gradient values for `var`.
        var: Variable to update.
        indices: Optional indices, forwarded to the ``_assign_add``,
            ``_gather`` and ``_assign_sub`` helpers.

    Returns:
        An op grouping the variable update and the update of slot "m".
    """
    dtype = var.dtype.base_dtype
    lr = tf.cast(self._learning_rate_tensor, dtype)
    m = self.get_slot(var, "m")
    v = self.get_slot(var, "v")
    beta1_t = tf.cast(self._beta1_t, dtype)
    beta2_t = tf.cast(self._beta2_t, dtype)
    epsilon_t = tf.cast(self._epsilon_t, dtype)

    # m_t = beta1 * m + (1 - beta1) * g_t
    grad_contribution = grad * (1 - beta1_t)
    m_decayed = tf.assign(m, m * beta1_t, use_locking=self._use_locking)
    with tf.control_dependencies([m_decayed]):
        m_t = self._assign_add(m, updates=grad_contribution, indices=indices)
    m_gathered = self._gather(m_t, indices=indices)

    # Also see tf.nn.moments.
    variance = tf.squared_difference(grad, m_gathered)

    # v_t = beta2 * v + (1 - beta2) * variance
    variance_contribution = variance * (1 - beta2_t)
    v_decayed = tf.assign(v, v * beta2_t, use_locking=self._use_locking)
    with tf.control_dependencies([v_decayed]):
        v_t = self._assign_add(
            v, updates=variance_contribution, indices=indices)
    v_gathered = self._gather(v_t, indices=indices)

    # Scale the step by the averaged-to-current variance ratio, capped at 1.
    factor = v_gathered / (variance + epsilon_t)
    update = lr * grad * tf.minimum(factor, 1.0)
    var_update = self._assign_sub(ref=var, updates=update, indices=indices)

    return tf.group(var_update, m_t)
def testAssertIntegerForm(self):
    """assert_integer_form accepts integer-valued input and rejects others."""
    # Every component is integer valued; the assertion must pass.
    integer_valued = [1., 5, 10, 15, 20]
    # Each of these has a non-integer first component and must be rejected.
    non_integer_inputs = [
        [1.1, 5, 10, 15, 20],
        # First component isn't less than float32.eps = 1e-7 away from 1.
        [1.0001, 5, 10, 15, 20],
        # This shouldn't be detected as an integer.
        [1e-8, 5, 10, 15, 20],
    ]
    with self.test_session():
        passing_assert = distribution_util.assert_integer_form(integer_valued)
        with tf.control_dependencies([passing_assert]):
            tf.identity(integer_valued).eval()

        for candidate in non_integer_inputs:
            with self.assertRaisesOpError("x has non-integer components"):
                failing_assert = distribution_util.assert_integer_form(
                    candidate)
                with tf.control_dependencies([failing_assert]):
                    tf.identity(candidate).eval()
def _define_step(self, done, score, summary):
    """Combine the operations of a phase.

    Submits the scores flagged as done to a streaming mean, reads and clears
    that mean when ``self._report`` is true, and advances ``self._step`` by
    the number of scores received.

    Args:
        done: Tensor indicating whether the current score can be used.
        score: Tensor holding the current, possibly intermediate, score.
        summary: Tensor holding the summary string to pass through.

    Returns:
        Tuple of summary tensor, mean score, new global step, and the number
        of steps made. The mean score is zero on non-reporting steps.
    """
    # Promote scalars to vectors of length one.
    if done.shape.ndims == 0:
        done = done[None]
    if score.shape.ndims == 0:
        score = score[None]

    score_mean = streaming_mean.StreamingMean((), tf.float32)

    with tf.control_dependencies([done, score, summary]):
        done_indices = tf.where(done)[:, 0]
        done_score = tf.gather(score, done_indices)
        submit_score = tf.cond(
            tf.reduce_any(done),
            lambda: score_mean.submit(done_score),
            tf.no_op)

    with tf.control_dependencies([submit_score]):
        # `float` is called with no arguments and therefore yields 0.0.
        mean_score = tf.cond(self._report, score_mean.clear, float)
        steps_made = tf.shape(score)[0]
        next_step = self._step.assign_add(steps_made)

    with tf.control_dependencies([mean_score, next_step]):
        return tf.identity(summary), mean_score, next_step, steps_made
def _training(self):
    """Train on the episodes held in memory, then clear the memory.

    Asserts that the memory index equals ``update_every``, normalises the
    observations and rewards, runs the update steps, adjusts the penalty,
    clears the memory and resets its index, and finally summarises the
    trainable variables. These stages are chained with control dependencies.

    Returns:
        Summary tensor merging the update, penalty and weight summaries.
    """
    with tf.name_scope('training'):
        memory_is_full = tf.assert_equal(
            self._memory_index, self._config.update_every)
        with tf.control_dependencies([memory_is_full]):
            episodes = self._memory.data()
        (observ, action, old_mean, old_logstd, reward), length = episodes

        # Every episode must contain at least one step.
        with tf.control_dependencies([tf.assert_greater(length, 0)]):
            length = tf.identity(length)

        observ = self._observ_filter.transform(observ)
        reward = self._reward_filter.transform(reward)

        update_summary = self._perform_update_steps(
            observ, action, old_mean, old_logstd, reward, length)

        with tf.control_dependencies([update_summary]):
            penalty_summary = self._adjust_penalty(
                observ, old_mean, old_logstd, length)

        with tf.control_dependencies([penalty_summary]):
            clear_memory = tf.group(
                self._memory.clear(), self._memory_index.assign(0))

        with tf.control_dependencies([clear_memory]):
            weight_summary = utility.variable_summaries(
                tf.trainable_variables(), self._config.weight_summaries)
            summaries = [update_summary, penalty_summary, weight_summary]
            return tf.summary.merge(summaries)
def train(total_loss, global_step):
    """Build the Adam training op with a fixed learning rate.

    Args:
        total_loss: Loss tensor to minimise.
        global_step: Variable counting training steps; incremented when the
            gradients are applied and passed to the moving average as
            ``num_updates``.

    Returns:
        A no-op named 'train' that depends on the gradient application and
        on the update of the variable moving averages.
    """
    # The learning rate is fixed (no decay schedule).
    lr = INITIAL_LEARNING_RATE

    loss_averages_op = _add_loss_summaries(total_loss)

    # Compute gradients.
    with tf.control_dependencies([loss_averages_op]):
        opt = tf.train.AdamOptimizer(lr)
        grads = opt.compute_gradients(total_loss)

    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # Add histograms for trainable variables.
    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name, var)

    # Add histograms for gradients.
    for grad, var in grads:
        if grad is not None:
            tf.summary.histogram(var.op.name + '/gradients', grad)

    # Track the moving averages of all trainable variables. The averaging op
    # is created under a dependency on the gradient application so that it
    # always averages the updated values.
    variable_averages = tf.train.ExponentialMovingAverage(
        MOVING_AVERAGE_DECAY, global_step)
    with tf.control_dependencies([apply_gradient_op]):
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())

    with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
        train_op = tf.no_op(name='train')

    return train_op
def optimize(self, learning_rate, train_layers, global_step, source_centroid,
             target_centroid):
    """Build the training op for the conv1/conv2/fc1/fc2 variables.

    Sets ``self.Gregloss`` (5e-4 times the mean L2 loss of the selected
    weights) and ``self.F_loss`` (loss + Gregloss + global_step * Semanticloss
    + global_step * G_loss). Weights/gammas are trained with momentum SGD at
    ``learning_rate`` and biases/betas at twice that rate, both after the ops
    in ``tf.GraphKeys.UPDATE_OPS``. The moving centroids are overwritten once
    both optimizers have run.

    Args:
        learning_rate: Base learning rate.
        train_layers: Only printed here; it does not select variables.
        global_step: Multiplier for the semantic and G losses.
        source_centroid: New value for ``self.source_moving_centroid``.
        target_centroid: New value for ``self.target_moving_centroid``.

    Returns:
        An op grouping both centroid assignments; running it also runs the
        two optimizers.
    """
    print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
    print(train_layers)

    trained_scopes = ('conv1', 'conv2', 'fc1', 'fc2')
    var_list = []
    for v in tf.trainable_variables():
        name_parts = v.name.split('/')
        # Variables whose name has no scope component cannot match; skip
        # them rather than failing with an IndexError.
        if len(name_parts) > 1 and name_parts[1] in trained_scopes:
            var_list.append(v)

    self.Gregloss = 5e-4 * tf.reduce_mean(
        [tf.nn.l2_loss(x) for x in var_list if 'weights' in x.name])

    new_weights = [
        v for v in var_list if 'weights' in v.name or 'gamma' in v.name]
    new_biases = [
        v for v in var_list if 'biases' in v.name or 'beta' in v.name]
    print('==============new_weights=======================')
    print(new_weights)
    print('==============new_biases=======================')
    print(new_biases)

    self.F_loss = (self.loss + self.Gregloss
                   + global_step * self.Semanticloss
                   + global_step * self.G_loss)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    print('+++++++++++++++ batch norm update ops +++++++++++++++++')
    print(update_ops)

    with tf.control_dependencies(update_ops):
        train_op3 = tf.train.MomentumOptimizer(
            learning_rate * 1.0, 0.9).minimize(
                self.F_loss, var_list=new_weights)
        train_op4 = tf.train.MomentumOptimizer(
            learning_rate * 2.0, 0.9).minimize(
                self.F_loss, var_list=new_biases)

    # Overwrite the moving centroids only after both optimizers have run.
    with tf.control_dependencies([train_op3, train_op4]):
        update_sc = self.source_moving_centroid.assign(source_centroid)
        update_tc = self.target_moving_centroid.assign(target_centroid)

    return tf.group(update_sc, update_tc)
def _define_experience(self, agent_indices, observ, action, reward):
    """Implement the branch of experience() entered during training.

    Updates the observation and reward filters, appends the transition batch
    to ``self._episodes`` for the given agents, and builds a summary that is
    only populated when ``self._should_log`` is true.

    Returns:
        Summary tensor; the empty string when logging is disabled.
    """
    update_filters = tf.summary.merge([
        self._observ_filter.update(observ),
        self._reward_filter.update(reward),
    ])

    with tf.control_dependencies([update_filters]):
        if self._config.train_on_agent_action:
            # NOTE: Doesn't seem to change much.
            action = self._last_action
        last_mean = tf.gather(self._last_mean, agent_indices)
        last_logstd = tf.gather(self._last_logstd, agent_indices)
        batch = (observ, action, last_mean, last_logstd, reward)
        append = self._episodes.append(batch, agent_indices)

    with tf.control_dependencies([append]):
        norm_observ = self._observ_filter.transform(observ)
        norm_reward = tf.reduce_mean(self._reward_filter.transform(reward))

        def _logged_summary():
            return tf.summary.merge([
                update_filters,
                self._observ_filter.summary(),
                self._reward_filter.summary(),
                tf.summary.scalar('memory_size', self._memory_index),
                tf.summary.histogram('normalized_observ', norm_observ),
                tf.summary.histogram('action', self._last_action),
                tf.summary.scalar('normalized_reward', norm_reward),
            ])

        # `str` is called with no arguments and yields the empty string.
        summary = tf.cond(self._should_log, _logged_summary, str)
        return summary
def train(total_loss, global_step, learning_rate=INITIAL_LEARNING_RATE):
    """Build the momentum-SGD training op.

    Args:
        total_loss: Loss tensor to minimise.
        global_step: Variable counting training steps; it drives the
            learning-rate decay and is incremented when gradients are applied.
        learning_rate: Initial learning rate before staircase decay.

    Returns:
        A no-op named "train" that depends on the gradient application.
    """
    # DECAY_STEPS is the number of steps between two decays.
    decayed_lr = tf.train.exponential_decay(
        learning_rate,
        global_step,
        DECAY_STEPS,
        LEARNING_RATE_DECAY_FACTOR,
        staircase=True)
    tf.scalar_summary('learning_rate', decayed_lr)

    # Compute the gradient step once the loss is available.
    with tf.control_dependencies([total_loss]):
        optimizer = tf.train.MomentumOptimizer(decayed_lr, momentum=0.95)
        grads_and_vars = optimizer.compute_gradients(total_loss)

    # Gradient clipping, if wanted, would be applied at this point.
    apply_gradient_op = optimizer.apply_gradients(
        grads_and_vars, global_step=global_step)

    for gradient, variable in grads_and_vars:
        if gradient is None:
            continue
        print("Found gradients for: ", variable.op.name)
        tf.histogram_summary(variable.op.name + "/gradients", gradient)

    with tf.control_dependencies([apply_gradient_op]):
        train_op = tf.no_op(name="train")

    return train_op
def update(self, value):
    """Update the running mean and variance-sum estimates with `value`.

    Args:
        value: Batch or single value tensor. A batch dimension is prepended
            when the rank of `value` equals the rank of the stored mean.

    Returns:
        Summary tensor produced by ``self._summary`` for the mean of `value`.
    """
    with tf.name_scope(self._name + '/update'):
        if value.shape.ndims == self._mean.shape.ndims:
            # Add a batch dimension if necessary.
            value = value[None, ...]
        batch_count = tf.shape(value)[0]

        # Everything below reads the count after the batch has been added.
        with tf.control_dependencies([self._count.assign_add(batch_count)]):
            step = tf.cast(self._count, tf.float32)
            mean_delta = tf.reduce_sum(value - self._mean[None, ...], 0)
            new_mean = self._mean + mean_delta / step
            # When the count is not above one, the first value is the mean.
            new_mean = tf.cond(
                self._count > 1, lambda: new_mean, lambda: value[0])
            var_delta = (
                (value - self._mean[None, ...])
                * (value - new_mean[None, ...]))
            new_var_sum = self._var_sum + tf.reduce_sum(var_delta, 0)

        # Store the new estimates only once both have been computed.
        with tf.control_dependencies([new_mean, new_var_sum]):
            update = (
                self._mean.assign(new_mean),
                self._var_sum.assign(new_var_sum),
            )

        with tf.control_dependencies(update):
            if value.shape.ndims == 1:
                value = tf.reduce_mean(value)
            return self._summary('value', tf.reduce_mean(value))
def train(total_loss, global_step):
    """Create an Adam optimizer and apply it to all trainable variables.

    Also maintains moving averages of the individual losses, the total loss
    and all trainable variables.

    Args:
        total_loss: Total loss from loss().
        global_step: Integer Variable counting the number of training steps
            processed.

    Returns:
        train_op: a no-op named 'train' that depends on the gradient
            application and on the update of the variable moving averages.
    """
    # Compute the moving average of all individual losses and the total loss.
    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
    losses = tf.get_collection('losses')
    loss_averages_op = loss_averages.apply(losses + [total_loss])

    # Compute gradients only after the loss averages have been updated.
    with tf.control_dependencies([loss_averages_op]):
        opt = tf.train.AdamOptimizer(FLAGS.learning_rate)
        grads = opt.compute_gradients(total_loss)

    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # Track the moving averages of all trainable variables. The averaging op
    # is created under a dependency on the gradient application so that it
    # always averages the updated values.
    variable_averages = tf.train.ExponentialMovingAverage(0.9999, global_step)
    with tf.control_dependencies([apply_gradient_op]):
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())

    with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
        train_op = tf.no_op(name='train')

    return train_op
def append(self, transitions, rows=None):
    """Append one transition to each of the selected memory rows.

    Each buffer is written at ``(row, current length of row)``; afterwards
    the length of every selected row is incremented.

    Args:
        transitions: Tuple of transition quantities with batch dimension.
        rows: Episodes to append to, defaults to all.

    Returns:
        Operation that increments the row lengths, run after all writes.
    """
    if rows is None:
        rows = tf.range(self._capacity)
    assert rows.shape.ndims == 1

    rows_in_range = tf.assert_less(
        rows, self._capacity, message='capacity exceeded')
    with tf.control_dependencies([rows_in_range]):
        room_left = tf.assert_less(
            tf.gather(self._length, rows), self._max_length,
            message='max length exceeded')

    write_ops = []
    with tf.control_dependencies([room_left]):
        for buffer_, elements in zip(self._buffers, transitions):
            # The write position of each row is its current length.
            timestep = tf.gather(self._length, rows)
            indices = tf.stack([rows, timestep], 1)
            write_ops.append(
                tf.scatter_nd_update(buffer_, indices, elements))

    with tf.control_dependencies(write_ops):
        # One-hot rows summed over the batch: how often each row was written.
        one_hot_rows = tf.one_hot(rows, self._capacity, dtype=tf.int32)
        episode_mask = tf.reduce_sum(one_hot_rows, 0)
        return self._length.assign_add(episode_mask)
def testCaching(self):
    """Confirm caching of control output is recalculated between calls."""
    a = tf.constant(1)
    b = tf.constant(2)
    with tf.control_dependencies([a]):
        c = tf.constant(42)

    # Counts how often each subscribed value was observed.
    shared = {}

    def sub(t):
        shared[t] = shared.get(t, 0) + 1
        return t

    a = subscribe.subscribe(a, lambda t: tf.py_func(sub, [t], [t.dtype]))

    # `d` is created only after the first subscription was made.
    with tf.control_dependencies([b]):
        d = tf.constant(11)

    # If it was using outdated cached control_outputs then
    # evaling would not trigger the new subscription.
    b = subscribe.subscribe(b, lambda t: tf.py_func(sub, [t], [t.dtype]))

    with self.test_session() as sess:
        c_out = sess.run([c])
        d_out = sess.run([d])

    # assertEqual: the assertEquals alias is deprecated and was removed in
    # Python 3.12.
    self.assertEqual(c_out, [42])
    self.assertEqual(d_out, [11])
    self.assertEqual(shared, {2: 1, 1: 1})
def body(i, xs_copy, logprob_prev, grads_prev):
    """Run one loop iteration: leapfrog, then accept or reject the proposal.

    Relies on names from the enclosing scope (``xs``, ``epsilon``, ``lmin``,
    ``lmax``, ``logprob_grads_fn`` and the ``_``-prefixed helpers).

    Args:
        i: Loop counter.
        xs_copy: State carried over from the previous iteration.
        logprob_prev: Log-probability carried over from the previous
            iteration.
        grads_prev: Gradients carried over from the previous iteration.

    Returns:
        Tuple ``(i + 1, xs_out_copy, logprob_out, grads_out)`` to be carried
        into the next iteration.
    """
    ps_init = _init_ps(xs_copy)
    ps = _update_ps(ps_init, grads_prev, epsilon, coeff=+0.5)
    # Number of leapfrog iterations, drawn uniformly between lmin and lmax.
    max_iters = tf.random_uniform((), minval=lmin, maxval=lmax, dtype=tf.int32)

    dep_list = _flat([max_iters], ps, ps_init)
    with tf.control_dependencies(dep_list):
        leapfrog_result = _leapfrog_step(xs, ps, epsilon, max_iters, logprob_grads_fn)
        proceed, xs_new, ps_new, logprob_new, grads_new = leapfrog_result
        # From here on dep_list holds the leapfrog outputs; both branches
        # below wait for all of them.
        dep_list = _flat([proceed], [logprob_new], xs_new, ps_new, grads_new)

        def standard_proposal():
            with tf.control_dependencies(dep_list):
                return _reject_accept_proposal(
                    xs_new, xs_copy, ps_new, ps_init,
                    logprob_new, logprob_prev,
                    grads_new, grads_prev, epsilon)

        def premature_reject():
            # Returns values derived from the previous state only.
            with tf.control_dependencies(dep_list):
                return _premature_reject(
                    xs_copy, logprob_prev, grads_prev)

        xs_out, logprob_out, grads_out = tf.cond(proceed,
                                                 standard_proposal,
                                                 premature_reject,
                                                 strict=True)

        # Write the selected state back, then copy it once the assignments
        # have run.
        xs_assign = _assign_variables(xs, xs_out)
        with tf.control_dependencies(xs_assign):
            xs_out_copy = _copy_variables(xs_assign)
            # NOTE(review): this depends on the incoming xs_copy, not on
            # xs_out_copy; confirm that is intended.
            with tf.control_dependencies(xs_copy):
                return i + 1, xs_out_copy, logprob_out, grads_out
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Apply gradients and tune the hyperparameters with YellowFin.

    Pairs whose gradient is None are dropped. The remaining gradients are
    clipped by global norm when a clip threshold is set, then applied by the
    momentum optimizer. After that the YellowFin variables are prepared, the
    YellowFin op is run and the step counter is incremented, each stage
    depending on the previous one.

    Args:
        grads_and_vars: List of (gradient, variable) pairs as returned by
            compute_gradients().
        global_step: Optional Variable to increment by one after the
            variables have been updated.
        name: Optional name for the momentum update operation.

    Returns:
        A group of the momentum update op, the variable preparation op, the
        YellowFin op and the step increment op.
    """
    usable_pairs = [(g, t) for g, t in grads_and_vars if g is not None]
    self._grad, self._vars = zip(*usable_pairs)

    # Variable update with momentum.
    with tf.variable_scope("apply_updates"):
        # Clip only when a threshold was configured.
        if self._clip_thresh_var is not None:
            self._grad, _ = tf.clip_by_global_norm(
                self._grad, self._clip_thresh_var)
        apply_grad_op = self._momentum_optimizer.apply_gradients(
            zip(self._grad, self._vars),
            global_step=global_step,
            name=name)

    # Begin lr and mu tuning.
    with tf.variable_scope("prepare_yellowFin_variables"):
        # Ideally this would only wait for the clipped gradients, but
        # control_dependencies does not support the indexed slices used for
        # sparse gradients. Waiting for the whole update instead might be
        # slightly slower due to less parallelization.
        with tf.control_dependencies([apply_grad_op]):
            prepare_variables_op = self._prepare_variables()

    with tf.variable_scope("yellowfin"):
        with tf.control_dependencies([prepare_variables_op]):
            yellowfin_op = self._yellowfin()

    # Update the YellowFin step variable.
    with tf.control_dependencies([yellowfin_op]):
        self._increment_step_op = tf.assign_add(self._step, 1).op

    return tf.group(
        apply_grad_op,
        prepare_variables_op,
        yellowfin_op,
        self._increment_step_op)
def test_train_skip_train_if_max_step_already_saved(self):
    """Training twice to max_steps=10 leaves the saved step at 10."""

    def _train_and_check_saved_step():
        # Build a fresh graph, train up to max_steps and check the
        # checkpointed global step.
        with tf.Graph().as_default() as g, self.test_session(g):
            with tf.control_dependencies(self._build_inference_graph()):
                train_op = tf.assign_add(
                    tf.contrib.framework.get_global_step(), 1)
            learn.graph_actions._monitored_train(  # pylint: disable=protected-access
                g,
                output_dir=self._output_dir,
                train_op=train_op,
                loss_op=tf.constant(2.0),
                max_steps=10)
            step = checkpoints.load_variable(
                self._output_dir,
                tf.contrib.framework.get_global_step().name)
            self.assertEqual(10, step)

    # First run trains from scratch; the second run starts from the
    # checkpoint the first one left in the same output directory.
    _train_and_check_saved_step()
    _train_and_check_saved_step()
def replace(self, episodes, length, rows=None):
    """Overwrite whole episodes in the selected memory rows.

    Args:
        episodes: Tuple of transition quantities with batch and time
            dimensions.
        length: Batch of sequence lengths.
        rows: Episodes to replace, defaults to all.

    Returns:
        Operation that stores the new lengths, run after all buffers have
        been overwritten.
    """
    if rows is None:
        rows = tf.range(self._capacity)
    assert rows.shape.ndims == 1

    rows_in_range = tf.assert_less(
        rows, self._capacity, message='capacity exceeded')
    with tf.control_dependencies([rows_in_range]):
        length_fits = tf.assert_less_equal(
            length, self._max_length, message='max length exceeded')

    with tf.control_dependencies([length_fits]):
        replace_ops = [
            tf.scatter_update(buffer_, rows, elements)
            for buffer_, elements in zip(self._buffers, episodes)
        ]

    with tf.control_dependencies(replace_ops):
        return tf.scatter_update(self._length, rows, length)
def _dist_to_opt(self):
    """Build the ops that estimate the distance to the optimum.

    Sets ``self._grad_norm``, ``self._grad_norm_avg``, ``self._d_t`` and
    ``self._dist_to_opt_avg``.

    Returns:
        List with the two moving-average update ops created here (for the
        gradient norm and for the distance estimate).
    """
    # Running average of the norm of the gradient.
    self._grad_norm = tf.sqrt(self._grad_norm_squared)
    norm_avg_op = self._moving_averager.apply([self._grad_norm])
    with tf.control_dependencies([norm_avg_op]):
        self._grad_norm_avg = self._moving_averager.average(self._grad_norm)
        # Single iteration distance estimation: averaged norm divided by
        # the averaged squared norm.
        self._d_t = self._grad_norm_avg / self._grad_norm_squared_avg

    # Running average of the distance.
    dist_avg_op = self._moving_averager.apply([self._d_t])
    with tf.control_dependencies([dist_avg_op]):
        self._dist_to_opt_avg = tf.identity(
            self._moving_averager.average(self._d_t))
        if self._sparsity_debias:
            self._dist_to_opt_avg /= tf.sqrt(self._sparsity_avg)

    return [norm_avg_op, dist_avg_op]  # D_t
def get_best(self, n):
    """Return the indices and values of the n highest scores in the TopN.

    Args:
        n: Number of entries requested.

    Returns:
        Tuple ``(ids, scores)`` gathered from the shortlist.
    """

    def refresh_shortlist():
        """Rebuild the shortlist from the highest scores in id_to_score."""
        top_scores, top_ids = tf.nn.top_k(
            self.id_to_score, self.shortlist_size)
        lowest_top_score = tf.reduce_min(top_scores)
        # Count the entries whose score is above the float32 minimum.
        is_valid = tf.greater(top_scores, tf.float32.min)
        valid_count = tf.reduce_sum(tf.to_int32(is_valid))
        # Slot 0 of each shortlist tensor holds metadata: the length in
        # sl_ids and the smallest score in sl_scores.
        store_ids = self.sl_ids.assign(
            tf.to_int64(tf.concat_v2([[valid_count], top_ids], 0)))
        store_scores = self.sl_scores.assign(
            tf.concat_v2([[lowest_top_score], top_scores], 0))
        self.last_ops = [store_ids, store_scores]
        return tf.group(store_ids, store_scores)

    # We only need to refresh the shortlist if n is greater than the
    # current shortlist size (which is stored in sl_ids[0]).
    with tf.control_dependencies(self.last_ops):
        maybe_refresh = tf.cond(
            n > self.sl_ids[0], refresh_shortlist, tf.no_op)

    with tf.control_dependencies([maybe_refresh]):
        shortlist_length = tf.to_int32(self.sl_ids[0])
        topk_values, topk_indices = tf.nn.top_k(
            self.sl_scores, tf.minimum(n, shortlist_length))
        # topk_indices index into the shortlist; translate them into
        # indices into id_to_score.
        gathered_indices = tf.gather(self.sl_ids, topk_indices)
        return gathered_indices, topk_values
def box_voting(selected_boxes, pool_boxes, iou_thresh=0.5):
    """Refines selected boxes by score-weighted voting over a pool of boxes.

    Implements box voting from 'Object detection via a multi-region &
    semantic segmentation-aware CNN model', Gidaris and Komodakis, ICCV 2015.

    For every box B in selected_boxes, the set S consists of the boxes in
    pool_boxes whose IOU with B is strictly greater than iou_thresh. B's
    location becomes the score-weighted mean location of S, and B's score
    becomes the plain mean of the scores in S.

    Two runtime assertions are attached to the computation: every selected
    box must match at least one pool box, and all pool scores must be
    non-negative.

    Args:
      selected_boxes: BoxList holding a subset of the boxes in pool_boxes,
        typically the survivors of non max suppression.
      pool_boxes: BoxList holding a (possibly redundant) set of boxes; must
        carry a 'scores' field.
      iou_thresh: (float scalar) IOU threshold used to match boxes in
        selected_boxes against boxes in pool_boxes.

    Returns:
      BoxList with the voted locations and scores, one entry per box in
      selected_boxes; extra fields are copied over from selected_boxes.

    Raises:
      ValueError: if iou_thresh is outside [0, 1], if selected_boxes or
        pool_boxes is not a BoxList, or if pool_boxes has no scores field.
    """
    if not 0.0 <= iou_thresh <= 1.0:
        raise ValueError('iou_thresh must be between 0 and 1')
    if not isinstance(selected_boxes, box_list.BoxList):
        raise ValueError('selected_boxes must be a BoxList')
    if not isinstance(pool_boxes, box_list.BoxList):
        raise ValueError('pool_boxes must be a BoxList')
    if not pool_boxes.has_field('scores'):
        raise ValueError('pool_boxes must have a \'scores\' field')

    # Indicator matrix: entry (i, j) is 1.0 when pool box j votes for
    # selected box i.
    overlaps = iou(selected_boxes, pool_boxes)
    is_match = tf.cast(tf.greater(overlaps, iou_thresh), dtype=tf.float32)
    matches_per_box = tf.reduce_sum(is_match, 1)

    # TODO(kbanoop): Handle the case where some boxes in selected_boxes do
    # not match to any boxes in pool_boxes. For such boxes without any
    # matches, we should return the original boxes without voting.
    has_match_check = tf.Assert(
        tf.reduce_all(tf.greater(matches_per_box, 0)),
        ['Each box in selected_boxes must match with at least one box '
         'in pool_boxes.'])

    pool_scores = tf.expand_dims(pool_boxes.get_field('scores'), 1)
    non_negative_check = tf.Assert(
        tf.reduce_all(tf.greater_equal(pool_scores, 0)),
        ['Scores must be non negative.'])

    with tf.control_dependencies([non_negative_check, has_match_check]):
        score_totals = tf.matmul(is_match, pool_scores)

    voted_scores = tf.reshape(score_totals, [-1]) / matches_per_box
    weighted_location_sums = tf.matmul(is_match,
                                       pool_boxes.get() * pool_scores)
    voted_boxes = box_list.BoxList(weighted_location_sums / score_totals)
    _copy_extra_fields(voted_boxes, selected_boxes)
    voted_boxes.add_field('scores', voted_scores)
    return voted_boxes
def sac_n_step(env_name, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
               steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99,
               polyak=0.995, lr=1e-3, alpha=0.2, n_step=5, batch_size=100, start_steps=10000,
               without_delay_train=False, max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """
    Soft actor-critic training loop using an n-step backup for the Q target.

    Args:
        env_name (str): Environment id handed to ``gym.make``; two instances
            are created, one for training and one for testing.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q2(x, pi(x)).
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``.
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to SAC. The dict is copied before the
            ``action_space`` entry is added, so the caller's dict is left
            untouched.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.)

        n_step (int): Number of steps in the Q backup. It is passed to
            ``replay_buffer.sample_batch_n_step``, fed to the graph through
            ``n_step_ph``, and used to slice the done matrix.

        batch_size (int): Minibatch size for SGD. Also used as the static
            row count when slicing the done matrix in the Q backup.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        without_delay_train (bool): If True, one update is run after every
            environment step once ``t > batch_size``. If False, ``ep_len``
            updates are run at the end of each trajectory.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function. Currently unused because
            model saving is disabled in the epoch wrap-up.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = gym.make(env_name), gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture.
    # Work on a copy: writing into the argument would mutate the caller's
    # dict, or the shared default dict when no argument was supplied.
    ac_kwargs = dict(ac_kwargs)
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph. Rewards and done flags are fed as
    # matrices whose second dimension is left unspecified.
    x_ph, a_ph, x2_ph = core.placeholders(obs_dim, act_dim, obs_dim)
    r_ph = tf.placeholder(dtype=tf.float32, shape=(None, None))
    d_ph = tf.placeholder(dtype=tf.float32, shape=(None, None))
    n_step_ph = tf.placeholder(dtype=tf.float32, shape=())

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in
                       ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n') % var_counts)

    # Targets for Q and V regression. The Q target is the discounted sum of
    # the rewards, masked by the first n_step done columns, plus the
    # bootstrapped target value masked by done column n_step.
    q_backup = tf.stop_gradient(
        tf.reduce_sum(tf.multiply(tf.pow(gamma, tf.range(0, n_step_ph))
                                  * (1 - tf.slice(d_ph, [0, 0], [batch_size, n_step])), r_ph), axis=1)
        + gamma ** n_step_ph * (1 - tf.reshape(tf.slice(d_ph, [0, n_step], [batch_size, 1]), [-1])) * v_targ)
    v_backup = tf.stop_gradient(q1_pi - alpha * logp_pi)

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v) ** 2)
    value_loss = q1_loss + v_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('main/v')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    # Loop variables are named so they do not shadow the v_targ tensor.
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(targ_var, polyak * targ_var + (1 - polyak) * main_var)
                                  for main_var, targ_var in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    step_ops = [pi_loss, q1_loss, v_loss, q1, v, logp_pi,
                train_pi_op, train_value_op, target_update]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(targ_var, main_var)
                            for main_var, targ_var in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph},
                          outputs={'mu': mu, 'pi': pi, 'q1': q1, 'v': v})

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0]

    def test_agent(n=10):
        for _ in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    def run_update():
        # One gradient step on an n-step batch; the diagnostics are logged.
        batch = replay_buffer.sample_batch_n_step(batch_size, n_step=n_step)
        feed_dict = {x_ph: batch['obs1'],
                     x2_ph: batch['obs2'],
                     a_ph: batch['acts'],
                     r_ph: batch['rews'],
                     d_ph: batch['done'],
                     n_step_ph: n_step
                     }
        outs = sess.run(step_ops, feed_dict)
        logger.store(LossPi=outs[0], LossQ1=outs[1],
                     LossV=outs[2], Q1Vals=outs[3],
                     VVals=outs[4], LogPi=outs[5])

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if t > batch_size and without_delay_train:
            run_update()

        if d or (ep_len == max_ep_len):
            # Perform all SAC updates at the end of the trajectory.
            # This is a slight difference from the SAC specified in the
            # original paper.
            if not without_delay_train:
                for _ in range(ep_len):
                    run_update()

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # NOTE: model saving is disabled here, so save_freq has no effect.

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
def train():
    """Build the training graph and run the training loop.

    The graph is built on the GPU selected by ``FLAGS.GPU``. The training
    loss is ``network.total_loss(..., loss_type='DICE')`` plus the L2 term
    returned by the forward pass. At every print interval the loss values
    (scaled by 1e6) are printed and summaries are written; at every
    checkpoint interval a checkpoint named after the current epoch is saved
    under ``FLAGS.train_dir + FLAGS.RunInfo``.
    """

    # Makes this the default graph where all ops will be added
    with tf.Graph().as_default(), tf.device('/gpu:' + str(FLAGS.GPU)):

        # Load the images and labels.
        data, _ = network.inputs(skip=True)

        # Define phase of training
        phase_train = tf.placeholder(tf.bool)

        # Perform the forward pass:
        logits, l2loss = network.forward_pass_res(data['image_data'], phase_train=phase_train)

        # Calculate loss
        SCE_loss = network.total_loss(logits, data['label_data'], loss_type='DICE')

        # Add the L2 regularization loss
        loss = tf.add(SCE_loss, l2loss, name='TotalLoss')

        # Update the moving average batch norm ops
        extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        # Retrieve the training operation with the applied gradients
        with tf.control_dependencies(extra_update_ops):
            train_op = network.backward_pass(loss)

        # -------------------  Housekeeping functions  ----------------------

        # Merge the summaries
        all_summaries = tf.summary.merge_all()

        # Initialize variables operation
        var_init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

        # Restore moving average of the variables
        var_ema = tf.train.ExponentialMovingAverage(FLAGS.moving_avg_decay)

        # Define variables to restore
        var_restore = var_ema.variables_to_restore()

        # Initialize the saver
        saver = tf.train.Saver(var_restore, max_to_keep=4)

        # -------------------  Session Initializer  ----------------------

        # Set the intervals. Both intervals are used as modulo divisors
        # below, so they are clamped to at least one step to avoid a
        # ZeroDivisionError when the configured value truncates to zero.
        steps_per_epoch = FLAGS.epoch_size / FLAGS.batch_size
        max_steps = int(steps_per_epoch * FLAGS.num_epochs)
        print_interval = max(1, int(steps_per_epoch * FLAGS.print_interval))
        checkpoint_interval = max(1, int(steps_per_epoch * FLAGS.checkpoint_interval))
        print('Max Steps: %s, Print Interval: %s, Checkpoint: %s'
              % (max_steps, print_interval, checkpoint_interval))

        # Allow memory placement growth
        config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as mon_sess:

            # Initialize the variables
            mon_sess.run(var_init)

            # Initialize the handle to the summary writer in our training directory
            summary_writer = tf.summary.FileWriter(FLAGS.train_dir + FLAGS.RunInfo, mon_sess.graph)

            # Accumulated training time and number of steps it covers,
            # both reset at every print interval.
            timer = 0
            timed_steps = 0

            try:
                # Use slim to handle queues:
                with slim.queues.QueueRunners(mon_sess):

                    for i in range(max_steps):

                        # Run and time an iteration
                        start = time.time()
                        mon_sess.run(train_op, feed_dict={phase_train: True})
                        timer += (time.time() - start)
                        timed_steps += 1

                        # Calculate current epoch
                        epoch = int((i * FLAGS.batch_size) / FLAGS.epoch_size)

                        # Console and Tensorboard print interval
                        if i % print_interval == 0:

                            # First retrieve the loss values
                            l2, sce, tot = mon_sess.run([l2loss, SCE_loss, loss],
                                                        feed_dict={phase_train: True})
                            tot *= 1e6
                            l2 *= 1e6
                            sce *= 1e6

                            # Average time per step since the last print;
                            # divide by the steps actually timed so the
                            # very first report (one step) is not skewed.
                            elapsed = timer / timed_steps
                            timer = 0
                            timed_steps = 0

                            # Now print the loss values
                            print('-' * 70)
                            print('Epoch: %s, Time: %.1f sec, L2 Loss (ppm): %.4f, Prediction Loss (ppm): %.4f, Total Loss (ppm): %.4f, Eg/s: %.4f, Seconds Per: %.4f'
                                  % (epoch, elapsed, l2, sce, tot,
                                     FLAGS.batch_size / elapsed, elapsed / FLAGS.batch_size))

                            # Run a session to retrieve our summaries
                            summary = mon_sess.run(all_summaries, feed_dict={phase_train: True})

                            # Add the summaries to the protobuf for Tensorboard
                            summary_writer.add_summary(summary, i)

                        if i % checkpoint_interval == 0:

                            print('-' * 70, '\nSaving... GPU: %s, File:%s' % (FLAGS.GPU, FLAGS.RunInfo[:-1]))

                            # Define the checkpoint file name and path
                            checkpoint_name = ('Epoch_%s' % epoch)
                            checkpoint_file = os.path.join(FLAGS.train_dir + FLAGS.RunInfo, checkpoint_name)

                            # Save the checkpoint
                            saver.save(mon_sess, checkpoint_file)
            finally:
                # Flush pending events and release the event file even when
                # training stops on an exception.
                summary_writer.close()
def main(_):
    """Train the model, validate and checkpoint periodically, then test.

    Builds the model and training graph from FLAGS, optionally resumes from
    FLAGS.start_checkpoint, runs the training loop with the per-phase
    learning rates, evaluates on the validation set every
    FLAGS.eval_step_interval steps (and on the last step), saves a
    checkpoint every FLAGS.save_step_interval steps (and on the last step),
    and finally logs the confusion matrix and accuracy on the testing set.

    Args:
      _: unused.

    Raises:
      Exception: if --how_many_training_steps and --learning_rate do not
        contain the same number of comma-separated entries.
    """
    # We want to see all the logging messages for this tutorial.
    tf.logging.set_verbosity(tf.logging.INFO)

    # Start a new TensorFlow session.
    sess = tf.InteractiveSession()

    # Begin by making sure we have the training data we need. If you already have
    # training data of your own, use `--data_url= ` on the command line to avoid
    # downloading.
    model_settings = models.prepare_model_settings(
        len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
        FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
        FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
    audio_processor = input_data.AudioProcessor(
        FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
        FLAGS.unknown_percentage,
        FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
        FLAGS.testing_percentage, model_settings)
    # NOTE(review): open question from the original author -- what exactly is
    # fingerprint_size? It is a value produced by the model settings; the
    # model settings and the audio processor are both defined above.
    fingerprint_size = model_settings['fingerprint_size']
    label_count = model_settings['label_count']
    # Range to randomly shift the training audio by in time.
    # Time shifting is applied to the training audio; it can be thought of as
    # a preprocessing step (original author's note: similar in spirit to
    # pitch shifting).
    time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)
    # Figure out the learning rates for each training phase. Since it's often
    # effective to have high learning rates at the start of training, followed by
    # lower levels towards the end, the number of steps and learning rates can be
    # specified as comma-separated lists to define the rate at each stage. For
    # example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
    # will run 13,000 training loops in total, with a rate of 0.001 for the first
    # 10,000, and 0.0001 for the final 3,000.
    training_steps_list = list(
        map(int, FLAGS.how_many_training_steps.split(',')))
    learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
    if len(training_steps_list) != len(learning_rates_list):
        raise Exception(
            '--how_many_training_steps and --learning_rate must be equal length '
            'lists, but are %d and %d long instead' % (len(training_steps_list),
                                                       len(learning_rates_list)))

    fingerprint_input = tf.placeholder(
        tf.float32, [None, fingerprint_size], name='fingerprint_input')

    # Arguments: the input, the model settings, and the architecture the
    # model will use (conv, ...).
    logits, dropout_prob = models.create_model(
        fingerprint_input,
        model_settings,
        FLAGS.model_architecture,
        is_training=True)

    # Define loss and optimizer
    ground_truth_input = tf.placeholder(
        tf.int64, [None], name='groundtruth_input')

    # Optionally we can add runtime checks to spot when NaNs or other symptoms of
    # numerical errors start occurring during training.
    control_dependencies = []
    if FLAGS.check_nans:
        checks = tf.add_check_numerics_ops()
        control_dependencies = [checks]

    # Create the back propagation and training evaluation machinery in the graph.
    with tf.name_scope('cross_entropy'):
        cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
            labels=ground_truth_input, logits=logits)
    tf.summary.scalar('cross_entropy', cross_entropy_mean)
    with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
        learning_rate_input = tf.placeholder(
            tf.float32, [], name='learning_rate_input')
        train_step = tf.train.GradientDescentOptimizer(
            learning_rate_input).minimize(cross_entropy_mean)
    predicted_indices = tf.argmax(logits, 1)
    correct_prediction = tf.equal(predicted_indices, ground_truth_input)
    confusion_matrix = tf.confusion_matrix(
        ground_truth_input, predicted_indices, num_classes=label_count)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', evaluation_step)

    global_step = tf.train.get_or_create_global_step()
    increment_global_step = tf.assign(global_step, global_step + 1)

    saver = tf.train.Saver(tf.global_variables())

    # Merge all the summaries and write them out to /tmp/retrain_logs (by default)
    merged_summaries = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                         sess.graph)
    validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/validation')

    tf.global_variables_initializer().run()

    # Training starts at step 1 unless a checkpoint supplies the step count.
    start_step = 1

    if FLAGS.start_checkpoint:
        models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
        start_step = global_step.eval(session=sess)

    tf.logging.info('Training from step: %d ', start_step)

    # Save graph.pbtxt.
    tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                         FLAGS.model_architecture + '.pbtxt')

    # Save list of words.
    with gfile.GFile(
        os.path.join(FLAGS.train_dir, FLAGS.model_architecture + '_labels.txt'),
        'w') as f:
        f.write('\n'.join(audio_processor.words_list))
    # The main roles of the tf.gfile module are:
    # 1. To provide an API that is close to Python's file objects.
    # 2. To provide an implementation based on TensorFlow's C++ FileSystem API.
    # The C++ FileSystem API supports multiple file system implementations,
    # including local files, Google Cloud Storage, and HDFS. TensorFlow uses
    # these implementations for saving and loading checkpoints, writing
    # TensorBoard logs, and accessing training data.
    # However, if all of your files are local, you can use the regular Python
    # file API without any problem: gfile is needed for less-conventional
    # filesystems, and otherwise the usual Python API is fine.

    # Training loop.
    training_steps_max = np.sum(training_steps_list)
    # Note on xrange vs. range: they differ in data type and in behavior.
    # xrange is more memory-efficient when the range is large, because it does
    # not load all of its values into memory at once but produces each value
    # as it is accessed. The list convenience methods are not available, but
    # for sequential or index-based access xrange() is far more
    # memory-efficient.
    for training_step in xrange(start_step, training_steps_max + 1):
        # Figure out what the current learning rate is.
        training_steps_sum = 0
        for i in range(len(training_steps_list)):
            training_steps_sum += training_steps_list[i]
            if training_step <= training_steps_sum:
                learning_rate_value = learning_rates_list[i]
                break
        # Pull the audio samples we'll use for training.
        train_fingerprints, train_ground_truth = audio_processor.get_data(
            FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
            FLAGS.background_volume, time_shift_samples, 'training', sess)
        # Run the graph with this batch of training data.
        train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
            [
                merged_summaries, evaluation_step, cross_entropy_mean, train_step,
                increment_global_step
            ],
            feed_dict={
                fingerprint_input: train_fingerprints,
                ground_truth_input: train_ground_truth,
                learning_rate_input: learning_rate_value,
                dropout_prob: 0.5
            })
        train_writer.add_summary(train_summary, training_step)
        tf.logging.info('Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' %
                        (training_step, learning_rate_value, train_accuracy * 100,
                         cross_entropy_value))
        is_last_step = (training_step == training_steps_max)
        if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
            set_size = audio_processor.set_size('validation')
            total_accuracy = 0
            total_conf_matrix = None
            for i in xrange(0, set_size, FLAGS.batch_size):
                validation_fingerprints, validation_ground_truth = (
                    audio_processor.get_data(FLAGS.batch_size, i, model_settings, 0.0,
                                             0.0, 0, 'validation', sess))
                # Run a validation step and capture training summaries for TensorBoard
                # with the `merged` op.
                validation_summary, validation_accuracy, conf_matrix = sess.run(
                    [merged_summaries, evaluation_step, confusion_matrix],
                    feed_dict={
                        fingerprint_input: validation_fingerprints,
                        ground_truth_input: validation_ground_truth,
                        dropout_prob: 1.0
                    })
                validation_writer.add_summary(validation_summary, training_step)
                # The last batch may be smaller than FLAGS.batch_size, so each
                # batch accuracy is weighted by its own size.
                batch_size = min(FLAGS.batch_size, set_size - i)
                total_accuracy += (validation_accuracy * batch_size) / set_size
                if total_conf_matrix is None:
                    total_conf_matrix = conf_matrix
                else:
                    total_conf_matrix += conf_matrix
            tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
            tf.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' %
                            (training_step, total_accuracy * 100, set_size))

        # Save the model checkpoint periodically.
        if (training_step % FLAGS.save_step_interval == 0 or
            training_step == training_steps_max):
            checkpoint_path = os.path.join(FLAGS.train_dir,
                                           FLAGS.model_architecture + '.ckpt')
            tf.logging.info('Saving to "%s-%d"', checkpoint_path, training_step)
            saver.save(sess, checkpoint_path, global_step=training_step)

    # Final evaluation on the testing set, accumulated batch by batch the
    # same way as the validation pass above.
    set_size = audio_processor.set_size('testing')
    tf.logging.info('set_size=%d', set_size)
    total_accuracy = 0
    total_conf_matrix = None
    for i in xrange(0, set_size, FLAGS.batch_size):
        test_fingerprints, test_ground_truth = audio_processor.get_data(
            FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess)
        test_accuracy, conf_matrix = sess.run(
            [evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: test_fingerprints,
                ground_truth_input: test_ground_truth,
                dropout_prob: 1.0
            })
        batch_size = min(FLAGS.batch_size, set_size - i)
        total_accuracy += (test_accuracy * batch_size) / set_size
        if total_conf_matrix is None:
            total_conf_matrix = conf_matrix
        else:
            total_conf_matrix += conf_matrix
    tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
    tf.logging.info('Final test accuracy = %.1f%% (N=%d)' % (total_accuracy * 100,
                                                             set_size))
def train(self):
    """Train the GAN, validating and checkpointing after every epoch.

    For each epoch the discriminator is updated on every batch, followed by
    the generator (and the encoder as well when ``F.badGAN`` is set). After
    the batches, the current model is saved, the validation patches are
    predicted and recomposed into whole volumes, per-class F1 scores are
    printed, the model is saved to ``F.best_checkpoint_dir`` whenever the
    sum of the F1 scores of classes 2 and 3 improves, and the epoch's
    average losses are appended to text files in the working directory.
    """

    # Instantiate the dataset class
    data = dataset_badGAN(num_classes=F.num_classes, extraction_step=self.extraction_step,
                          number_images_training=F.number_train_images, batch_size=F.batch_size,
                          patch_shape=self.patch_shape,
                          number_unlab_images_training=F.number_train_unlab_images,
                          data_directory=F.data_directory)

    # Optimizer operations
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        d_optim = tf.train.AdamOptimizer(F.learning_rate_D, beta1=F.beta1D)\
            .minimize(self.d_loss, var_list=self.d_vars)
        g_optim = tf.train.AdamOptimizer(F.learning_rate_G, beta1=F.beta1G)\
            .minimize(self.g_loss, var_list=self.g_vars)
        if F.badGAN:
            e_optim = tf.train.AdamOptimizer(F.learning_rate_E, beta1=F.beta1E)\
                .minimize(self.g_loss, var_list=self.e_vars)

    tf.global_variables_initializer().run()

    # Load checkpoints if required
    if F.load_chkpt:
        try:
            load_model(F.checkpoint_dir, self.sess, self.saver)
            print("\n [*] Checkpoint loaded succesfully!")
        except Exception:
            # Best effort: continue from the freshly initialized variables.
            # Catching Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit propagate.
            print("\n [!] Checkpoint loading failed!")
    else:
        print("\n [*] Checkpoint load not required.")

    # Load the validation data
    patches_val, labels_val_patch, labels_val = preprocess_dynamic_lab(
        F.data_directory, F.num_classes, self.extraction_step, self.patch_shape,
        F.number_train_images, validating=F.training, testing=F.testing,
        num_images_testing=F.number_test_images)

    predictions_val = np.zeros((patches_val.shape[0], self.patch_shape[0],
                                self.patch_shape[1], self.patch_shape[2]),
                               dtype="uint8")
    # Best validation score seen so far (sum of F1 for classes 2 and 3).
    max_par = 0.0

    for epoch in range(int(F.epoch)):
        idx = 0
        batch_iter_train = data.batch_train()
        total_val_loss = 0
        total_train_loss_CE = 0
        total_train_loss_UL = 0
        total_train_loss_FK = 0
        total_gen_FMloss = 0

        for patches_lab, patches_unlab, labels in batch_iter_train:
            # Network update
            sample_z_gen = np.random.uniform(
                -1, 1, [F.batch_size, F.noise_dim]).astype(np.float32)

            # Full feed: used for the discriminator step and for the loss
            # evaluations below.
            feed_dict = {self.patches_lab: patches_lab,
                         self.patches_unlab: patches_unlab,
                         self.z_gen: sample_z_gen,
                         self.labels: labels,
                         self.phase: True}
            # Generator/encoder steps are fed without the labeled inputs.
            gen_feed_dict = {self.patches_unlab: patches_unlab,
                             self.z_gen: sample_z_gen,
                             self.phase: True}

            _ = self.sess.run(d_optim, feed_dict=feed_dict)

            if F.badGAN:
                _, _ = self.sess.run([e_optim, g_optim], feed_dict=gen_feed_dict)
            else:
                _ = self.sess.run(g_optim, feed_dict=gen_feed_dict)

            # Evaluate losses for plotting/printing purposes
            d_loss_lab = self.d_loss_lab.eval(feed_dict)
            d_loss_unlab_true = self.true_loss.eval(feed_dict)
            d_loss_unlab_fake = self.fake_loss.eval(feed_dict)
            g_loss_fm = self.g_loss_fm.eval(feed_dict)

            total_train_loss_CE = total_train_loss_CE + d_loss_lab
            total_train_loss_UL = total_train_loss_UL + d_loss_unlab_true
            total_train_loss_FK = total_train_loss_FK + d_loss_unlab_fake
            total_gen_FMloss = total_gen_FMloss + g_loss_fm

            idx += 1
            if F.badGAN:
                vi_loss = self.vi_loss.eval(feed_dict)
                print(("Epoch:[%2d] [%4d/%4d] Labeled loss:%.2e Unlabeled loss:%.2e Fake loss:%.2e Generator FM loss:%.8f Generator VI loss:%.8f\n")
                      % (epoch, idx, data.num_batches, d_loss_lab, d_loss_unlab_true,
                         d_loss_unlab_fake, g_loss_fm, vi_loss))
            else:
                print(("Epoch:[%2d] [%4d/%4d] Labeled loss:%.2e Unlabeled loss:%.2e Fake loss:%.2e Generator loss:%.8f \n")
                      % (epoch, idx, data.num_batches, d_loss_lab, d_loss_unlab_true,
                         d_loss_unlab_fake, g_loss_fm))

        # Save the current model
        save_model(F.checkpoint_dir, self.sess, self.saver)

        avg_train_loss_CE = total_train_loss_CE / (idx * 1.0)
        avg_train_loss_UL = total_train_loss_UL / (idx * 1.0)
        avg_train_loss_FK = total_train_loss_FK / (idx * 1.0)
        avg_gen_FMloss = total_gen_FMloss / (idx * 1.0)

        print('\n\n')

        # Only whole batches are validated; a trailing partial batch of
        # patches is left out.
        total_batches = int(patches_val.shape[0] / F.batch_size)
        print("Total number of batches for validation: ", total_batches)

        # Prediction of validation patches
        for batch in range(total_batches):
            batch_start = batch * F.batch_size
            batch_end = (batch + 1) * F.batch_size
            patches_feed = patches_val[batch_start:batch_end, :, :, :, :]
            labels_feed = labels_val_patch[batch_start:batch_end, :, :, :]
            feed_dict = {self.patches_lab: patches_feed,
                         self.labels: labels_feed,
                         self.phase: False}
            preds = self.Val_output.eval(feed_dict)
            val_loss = self.d_loss_lab.eval(feed_dict)

            predictions_val[batch_start:batch_end, :, :, :] = preds
            print(("Validated Patch:[%8d/%8d]") % (batch, total_batches))
            total_val_loss = total_val_loss + val_loss

        # To compute average patchwise validation loss (cross entropy loss)
        avg_val_loss = total_val_loss / (total_batches * 1.0)

        print("All validation patches Predicted")
        print("Shape of predictions_val, min and max:", predictions_val.shape,
              np.min(predictions_val), np.max(predictions_val))

        # To stitch back the patches into an entire image.
        # NOTE(review): 144 x 192 x 256 is hard-coded here and in the
        # reshapes below -- presumably the volume size of the dataset.
        val_image_pred = recompose3D_overlap(predictions_val, 144, 192, 256,
                                             self.extraction_step[0],
                                             self.extraction_step[1],
                                             self.extraction_step[2])
        val_image_pred = val_image_pred.astype('uint8')

        print("Shape of Predicted Output Groundtruth Images:", val_image_pred.shape,
              np.unique(val_image_pred),
              np.unique(labels_val),
              np.mean(val_image_pred), np.mean(labels_val))

        pred2d = np.reshape(val_image_pred, (val_image_pred.shape[0] * 144 * 192 * 256))
        lab2d = np.reshape(labels_val, (labels_val.shape[0] * 144 * 192 * 256))

        # For printing the validation results. `labels` is passed by keyword:
        # recent scikit-learn releases reject it as a positional argument.
        F1_score = f1_score(lab2d, pred2d, labels=[0, 1, 2, 3], average=None)
        print("Validation Dice Coefficient.... ")
        print("Background:", F1_score[0])
        print("CSF:", F1_score[1])
        print("GM:", F1_score[2])
        print("WM:", F1_score[3])

        # To save the best model
        if (max_par < (F1_score[2] + F1_score[3])):
            max_par = (F1_score[2] + F1_score[3])
            save_model(F.best_checkpoint_dir, self.sess, self.saver)
            print("Best checkpoint updated from validation results.")

        # To save the losses for plotting
        print("Average Validation Loss:", avg_val_loss)
        loss_logs = (
            ('Val_loss_GAN.txt', avg_val_loss),
            ('Train_loss_CE.txt', avg_train_loss_CE),
            ('Train_loss_UL.txt', avg_train_loss_UL),
            ('Train_loss_FK.txt', avg_train_loss_FK),
            ('Train_loss_FM.txt', avg_gen_FMloss),
        )
        for log_name, loss_value in loss_logs:
            with open(log_name, 'a') as f:
                f.write('%.2e \n' % loss_value)
    return
def __init__(self, config, name):
    """Build the graph for one phase of the model and publish its ops.

    Args:
        config: Configuration object. This method reads
            ``config.learning_rate``, ``config.batch_size`` and
            ``config.file_names[name]``.
        name: One of ``'training'``, ``'validation'`` or ``'test'``. It is
            used as the outer name scope and selects which statistics are
            built.

    Attributes set:
        name, is_training, config, batch: bookkeeping and the input batch
            returned by ``self._gen_batch_fn()``.
        reinitializable_iter_for_dataset: set to ``None`` for non-training
            phases only.
        op_dict: dict with keys ``tb_summary``,
            ``update_op_after_each_batch`` and
            ``statistics_after_each_epoch``; the training phase adds
            ``training_op`` and a ``parameter`` entry in ``tb_summary``.

    Raises:
        AssertionError: if ``name`` is not one of the three phase names.
    """
    # NOTE(review): the source of this method had lost its line breaks; the
    # nesting of the name/variable scopes below was reconstructed from the
    # statement order - confirm against version control.
    assert name in ('validation', 'training', 'test')
    self.name = name
    logging.debug('{} - model - initialize'.format(self.name))
    self.is_training = self.name == 'training'
    self.config = config

    if not self.is_training:
        self.reinitializable_iter_for_dataset = None
    self.batch = self._gen_batch_fn()  # generate mini-batch

    def _precision_recall_f1(tps, fps, fns):
        # The 1e-7 terms keep the ratios finite when a denominator is zero.
        precision = tps / (tps + fps + 1e-7)
        recall = tps / (tps + fns + 1e-7)
        f1 = 2. * precision * recall / (precision + recall + 1e-7)
        return precision, recall, f1

    with tf.name_scope(self.name):
        with tf.variable_scope('full_conv', reuse=tf.AUTO_REUSE):
            logits_stereo = self._nn_model_fn()

        # The network output holds two halves stacked along axis 0 (named
        # left/right here); flatten them together, then split them apart.
        logits_stereo_flattened = flatten_maybe_padded_sequences(
            maybe_padded_sequences=logits_stereo,
            lengths=tf.tile(input=self.batch['num_frames'], multiples=[2]))
        logits_left_flattened, logits_right_flattened = tf.split(
            value=logits_stereo_flattened, num_or_size_splits=2, axis=0)
        logits_minor_flattened = tf.minimum(logits_left_flattened, logits_right_flattened)
        logits_larger_flattened = tf.maximum(logits_left_flattened, logits_right_flattened)

        labels_bool_flattened = flatten_maybe_padded_sequences(
            maybe_padded_sequences=self.batch['label'],
            lengths=self.batch['num_frames'])
        negated_labels_bool_flattened = tf.logical_not(labels_bool_flattened)
        labels_float_flattened = tf.cast(x=labels_bool_flattened, dtype=tf.float32)

        # Positive labels are scored with the smaller of the two logits,
        # negative labels with the larger one.
        logits_mono_flattened = tf.where(
            tf.equal(labels_bool_flattened, True),
            logits_minor_flattened,
            logits_larger_flattened)
        loss = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=labels_float_flattened, logits=logits_mono_flattened)
        loss = tf.reduce_mean(loss)

        if self.is_training:
            _update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            # An empty list adds no control dependencies, so one code path
            # covers both the "has update ops" and "has none" cases.
            with tf.control_dependencies(_update_ops):
                training_op = tf.train.AdamOptimizer(self.config.learning_rate).minimize(loss)

        pred_labels_flattened = tf.greater(logits_left_flattened + logits_right_flattened, 0.)
        negated_pred_labels_flattened = tf.logical_not(pred_labels_flattened)

        # individual and ensemble statistics for test and validation
        if not self.is_training:
            with tf.name_scope('individual_and_ensemble_stats'):
                with tf.variable_scope('{}_local_vars'.format(self.name), reuse=tf.AUTO_REUSE):
                    individual_tps_fps_tns_fns_var = tf.get_variable(
                        name='individual_tps_fps_tns_fns',
                        shape=[len(self.config.file_names[self.name]), 4],
                        dtype=tf.int32,
                        initializer=tf.zeros_initializer,
                        trainable=False,
                        collections=[tf.GraphKeys.LOCAL_VARIABLES]
                    )
                    acc_loss_var = tf.get_variable(
                        name='acc_loss',
                        shape=[],
                        dtype=tf.float32,
                        initializer=tf.zeros_initializer,
                        trainable=False,
                        collections=[tf.GraphKeys.LOCAL_VARIABLES]
                    )
                    batch_counter_var = tf.get_variable(
                        name='batch_counter',
                        shape=[],
                        dtype=tf.int32,
                        initializer=tf.zeros_initializer,
                        trainable=False,
                        collections=[tf.GraphKeys.LOCAL_VARIABLES]
                    )

                loop_var_proto = collections.namedtuple(
                    'loop_var_proto',
                    ['sample_idx', 'batch_size', 'preds', 'negated_preds',
                     'labels', 'negated_labels', 'lengths', 'me_ids'])

                def cond_fn(loop_var):
                    return tf.less(loop_var.sample_idx, loop_var.batch_size)

                def body_fn(loop_var):
                    # Sample i occupies [start_pos, end_pos) of the flattened
                    # tensors, where start_pos is the sum of earlier lengths.
                    start_pos = tf.reduce_sum(loop_var.lengths[:loop_var.sample_idx])
                    end_pos = start_pos + loop_var.lengths[loop_var.sample_idx]
                    cur_preds, negated_cur_preds, cur_labels, negated_cur_labels = [
                        value[start_pos:end_pos]
                        for value in [loop_var.preds, loop_var.negated_preds,
                                      loop_var.labels, loop_var.negated_labels]]

                    tps = tf.logical_and(cur_preds, cur_labels)
                    fps = tf.logical_and(cur_preds, negated_cur_labels)
                    tns = tf.logical_and(negated_cur_preds, negated_cur_labels)
                    fns = tf.logical_and(negated_cur_preds, cur_labels)
                    tps, fps, tns, fns = [
                        tf.reduce_sum(tf.cast(value, tf.int32))
                        for value in [tps, fps, tns, fns]]

                    me_id = loop_var.me_ids[loop_var.sample_idx]
                    stats_var = individual_tps_fps_tns_fns_var
                    _new_value = stats_var[me_id] + tf.convert_to_tensor([tps, fps, tns, fns])
                    _update_stats = tf.scatter_update(
                        stats_var, me_id, _new_value, use_locking=True)
                    # Tie the counter increment to the scatter update so the
                    # update runs whenever the loop advances.
                    with tf.control_dependencies([_update_stats]):
                        sample_idx = loop_var.sample_idx + 1
                    return [loop_var._replace(sample_idx=sample_idx)]

                sample_idx = tf.constant(0, dtype=tf.int32)
                cur_batch_size = tf.shape(self.batch['num_frames'])[0]
                loop_var = loop_var_proto(
                    sample_idx=sample_idx,
                    batch_size=cur_batch_size,
                    preds=pred_labels_flattened,
                    negated_preds=negated_pred_labels_flattened,
                    labels=labels_bool_flattened,
                    negated_labels=negated_labels_bool_flattened,
                    lengths=self.batch['num_frames'],
                    me_ids=self.batch['me_id']
                )
                final_sample_idx = tf.while_loop(
                    cond=cond_fn,
                    body=body_fn,
                    loop_vars=[loop_var],
                    parallel_iterations=self.config.batch_size,
                    back_prop=False,
                    return_same_structure=True
                )[0].sample_idx

                individual_tps_fps_tns_fns_float = tf.cast(individual_tps_fps_tns_fns_var, tf.float32)
                tps, fps, _, fns = tf.unstack(individual_tps_fps_tns_fns_float, axis=1)
                me_wise_precisions, me_wise_recalls, me_wise_f1s = _precision_recall_f1(tps, fps, fns)
                me_wise_prfs = tf.stack([me_wise_precisions, me_wise_recalls, me_wise_f1s], axis=1)
                assert me_wise_prfs.shape.as_list() == [len(self.config.file_names[self.name]), 3]
                average_me_wise_prf = tf.reduce_mean(me_wise_prfs, axis=0)
                assert average_me_wise_prf.shape.as_list() == [3]

                # ensemble stats
                ensemble_tps_fps_tns_fns = tf.reduce_sum(individual_tps_fps_tns_fns_var, axis=0)
                tps, fps, _, fns = tf.unstack(tf.cast(ensemble_tps_fps_tns_fns, tf.float32))
                en_precision, en_recall, en_f1 = _precision_recall_f1(tps, fps, fns)

                batch_counter_update_op = tf.assign_add(batch_counter_var, 1)
                acc_loss_update_op = tf.assign_add(acc_loss_var, loss)
                ensemble_prf_and_loss = tf.convert_to_tensor(
                    [en_precision, en_recall, en_f1,
                     acc_loss_var / tf.cast(batch_counter_var, tf.float32)])

                update_op_after_each_batch = tf.group(
                    final_sample_idx, batch_counter_update_op, acc_loss_update_op,
                    name='grouped update ops to be run after each batch'.replace(' ', '_'))
                stats_after_each_epoch = dict(
                    individual_tps_fps_tns_fns=individual_tps_fps_tns_fns_var,
                    individual_prfs=me_wise_prfs,
                    ensemble_tps_fps_tns_fns=ensemble_tps_fps_tns_fns,
                    ensemble_prf_and_loss=ensemble_prf_and_loss,
                    average_prf=average_me_wise_prf
                )

        # ensemble stats for training
        if self.is_training:
            with tf.name_scope('ensemble_stats'):
                with tf.variable_scope('{}_local_vars'.format(self.name), reuse=tf.AUTO_REUSE):
                    ensemble_tps_fps_tns_fns_var = tf.get_variable(
                        name='ensemble_tps_fps_tns_fns',
                        shape=[4],
                        dtype=tf.int32,
                        initializer=tf.zeros_initializer,
                        trainable=False,
                        collections=[tf.GraphKeys.LOCAL_VARIABLES]
                    )
                    acc_loss_var = tf.get_variable(
                        name='acc_loss',
                        shape=[],
                        dtype=tf.float32,
                        initializer=tf.zeros_initializer,
                        trainable=False,
                        collections=[tf.GraphKeys.LOCAL_VARIABLES]
                    )
                    batch_counter_var = tf.get_variable(
                        name='batch_counter',
                        shape=[],
                        dtype=tf.int32,
                        initializer=tf.zeros_initializer,
                        trainable=False,
                        collections=[tf.GraphKeys.LOCAL_VARIABLES]
                    )

                tps = tf.logical_and(pred_labels_flattened, labels_bool_flattened)
                fps = tf.logical_and(pred_labels_flattened, negated_labels_bool_flattened)
                tns = tf.logical_and(negated_pred_labels_flattened, negated_labels_bool_flattened)
                fns = tf.logical_and(negated_pred_labels_flattened, labels_bool_flattened)
                tps, fps, tns, fns = [
                    tf.reduce_sum(tf.cast(value, tf.int32))
                    for value in [tps, fps, tns, fns]]
                ensemble_tps_fps_tns_fns_update_op = tf.assign_add(
                    ensemble_tps_fps_tns_fns_var, tf.convert_to_tensor([tps, fps, tns, fns]))
                acc_loss_update_op = tf.assign_add(acc_loss_var, loss)
                batch_counter_update_op = tf.assign_add(batch_counter_var, 1)

                ensemble_tps_fps_tns_fns_float = tf.cast(ensemble_tps_fps_tns_fns_var, tf.float32)
                tps, fps, _, fns = tf.unstack(ensemble_tps_fps_tns_fns_float)
                ensemble_precision, ensemble_recall, ensemble_f1 = _precision_recall_f1(tps, fps, fns)
                ensemble_loss = acc_loss_var / tf.cast(batch_counter_var, tf.float32)
                ensemble_prf_and_loss = tf.convert_to_tensor(
                    [ensemble_precision, ensemble_recall, ensemble_f1, ensemble_loss])

                update_op_after_each_batch = tf.group(
                    batch_counter_update_op, ensemble_tps_fps_tns_fns_update_op, acc_loss_update_op)
                stats_after_each_epoch = dict(
                    ensemble_tps_fps_tns_fns=ensemble_tps_fps_tns_fns_var,
                    ensemble_prf_and_loss=ensemble_prf_and_loss
                )

        # define tensorboard summaries
        with tf.name_scope('tensorboard_summary'):
            with tf.name_scope('statistics'):
                list_of_summaries = []
                # Every phase reports the ensemble numbers.
                with tf.name_scope('ensemble'):
                    p, r, f, lo = tf.unstack(stats_after_each_epoch['ensemble_prf_and_loss'])
                    items_for_summary = dict(precision=p, recall=r, f1=f, average_loss=lo)
                    # items() works on both Python 2 and 3; iteritems() is
                    # Python-2-only.
                    for item_name, item_value in items_for_summary.items():
                        list_of_summaries.append(tf.summary.scalar(item_name, item_value))
                # Only validation/test have per-recording statistics.
                if not self.is_training:
                    with tf.name_scope('individual'):
                        p, r, f = tf.unstack(stats_after_each_epoch['average_prf'])
                        items_for_summary = dict(precision=p, recall=r, f1=f)
                        for item_name, item_value in items_for_summary.items():
                            list_of_summaries.append(tf.summary.scalar(item_name, item_value))
                statistical_summary = tf.summary.merge(list_of_summaries)

            with tf.name_scope('images'):
                image_summary_length = int(6 * 16000 // 512)
                labels_uint8 = self.batch['label'][:, :image_summary_length, :]
                labels_uint8 = tf.cast(labels_uint8, tf.uint8) * 255
                assert labels_uint8.dtype == tf.uint8
                labels_uint8 = labels_uint8[..., None]

                _logits_left = tf.split(value=logits_stereo, num_or_size_splits=2, axis=0)[0]
                logits_prob_uint8 = tf.sigmoid(_logits_left[:, :image_summary_length, :])
                logits_prob_uint8 = tf.cast(logits_prob_uint8 * 255., tf.uint8)
                logits_prob_uint8 = logits_prob_uint8[..., None]

                # Channel 0: labels, channel 1: predicted probability,
                # channel 2: zeros.
                images = tf.concat(
                    [labels_uint8, logits_prob_uint8, tf.zeros_like(labels_uint8)], axis=-1)
                images = tf.transpose(images, [0, 2, 1, 3])
                images.set_shape([None, 88, image_summary_length, 3])
                image_summary = tf.summary.image('images', images)

            if self.is_training:
                with tf.name_scope('params'):
                    var_summary_dict = dict()
                    for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                        var_summary_dict[var.op.name] = tf.summary.histogram(var.op.name, var)
                    # Pass a real list: on Python 3 values() is a view object.
                    param_summary = tf.summary.merge(list(var_summary_dict.values()))

    if self.is_training:
        op_dict = dict(
            training_op=training_op,
            tb_summary=dict(statistics=statistical_summary,
                            image=image_summary,
                            parameter=param_summary),
            update_op_after_each_batch=update_op_after_each_batch,
            statistics_after_each_epoch=stats_after_each_epoch
        )
    else:
        op_dict = dict(
            tb_summary=dict(statistics=statistical_summary, image=image_summary),
            update_op_after_each_batch=update_op_after_each_batch,
            statistics_after_each_epoch=stats_after_each_epoch
        )
    self.op_dict = op_dict
def build_trainer(self, child_model):
    """Create the controller's reward, loss and training ops.

    The reward is the child model's shuffled-validation accuracy. With
    ``self.use_critic`` the loss uses an advantage computed from a learned
    value function, which gets its own Adam training op; otherwise a
    moving-average baseline is subtracted from the reward.

    Sets ``valid_acc``, ``reward``, ``baseline``, ``loss``, ``train_step``,
    ``train_op``, ``lr``, ``grad_norm`` and ``optimizer`` on ``self``.
    """
    # actor
    child_model.build_valid_rl()
    num_correct = tf.to_float(child_model.valid_shuffle_acc)
    num_examples = tf.to_float(child_model.batch_size)
    self.valid_acc = num_correct / num_examples
    self.reward = self.valid_acc

    if not self.use_critic:
        # moving-average baseline
        self.sample_log_probs = tf.reduce_sum(self.sample_log_probs)
        self.baseline = tf.Variable(0.0, dtype=tf.float32, trainable=False)
        baseline_step = (1 - self.bl_dec) * (self.baseline - self.reward)
        update_baseline = tf.assign_sub(self.baseline, baseline_step)
        # Reading the reward through identity() forces the baseline update
        # to run whenever the reward is evaluated.
        with tf.control_dependencies([update_baseline]):
            self.reward = tf.identity(self.reward)
        self.loss = self.sample_log_probs * (self.reward - self.baseline)
    else:
        # learned critic
        stacked_hidden = tf.concat(self.all_h, axis=0)
        predicted_value = tf.matmul(stacked_hidden, self.w_critic)
        advantage = predicted_value - self.reward
        critic_loss = tf.reduce_sum(advantage**2)
        self.baseline = tf.reduce_mean(predicted_value)
        self.loss = -tf.reduce_mean(self.sample_log_probs * advantage)

        critic_step = tf.Variable(
            0, dtype=tf.int32, trainable=False, name="critic_train_step")
        critic_ops = get_train_ops(
            critic_loss,
            [self.w_critic],
            critic_step,
            clip_mode=None,
            lr_init=1e-3,
            lr_dec_start=0,
            lr_dec_every=int(1e9),
            optim_algo="adam",
            sync_replicas=False)
        critic_train_op, _, _, _ = critic_ops

    self.train_step = tf.Variable(
        0, dtype=tf.int32, trainable=False, name="train_step")

    # Train every variable under this controller's name except the critic
    # weights, which are handled by the critic's own op.
    controller_vars = []
    for candidate in tf.trainable_variables():
        if not candidate.name.startswith(self.name):
            continue
        if "w_critic" in candidate.name:
            continue
        controller_vars.append(candidate)

    print("-" * 80)
    for candidate in controller_vars:
        print(candidate)

    train_settings = dict(
        clip_mode=self.clip_mode,
        grad_bound=self.grad_bound,
        l2_reg=self.l2_reg,
        lr_init=self.lr_init,
        lr_dec_start=self.lr_dec_start,
        lr_dec_every=self.lr_dec_every,
        lr_dec_rate=self.lr_dec_rate,
        optim_algo=self.optim_algo,
        sync_replicas=self.sync_replicas,
        num_aggregate=self.num_aggregate,
        num_replicas=self.num_replicas)
    train_outputs = get_train_ops(
        self.loss, controller_vars, self.train_step, **train_settings)
    self.train_op, self.lr, self.grad_norm, self.optimizer = train_outputs

    if self.use_critic:
        self.train_op = tf.group(self.train_op, critic_train_op)
def main(argv=None):
    """Train the aggregation model on features from EAST and flow from PWC-Net.

    The function builds a multi-GPU training graph around
    ``model_gru_agg.tower_loss``, constructs an EAST model and a PWC-Net
    model in 'test' mode, and then loops for ``FLAGS.max_steps`` steps.
    Each step runs the EAST model to get feature maps, runs PWC-Net to get
    flow maps between consecutive frames, and feeds both to the training
    graph. Checkpoints and summaries are written to
    ``FLAGS.checkpoint_path``.

    Args:
        argv: Unused; present for ``tf.app.run``-style entry points.
    """
    # NOTE(review): the source of this function had lost its line breaks.
    # Statements that followed a bare '#' (eval_config, nn_opts['controller'])
    # are treated as live code here - confirm against version control.
    m_cfg = sys_cfg()
    config = get_config(FLAGS)
    config.batch_size = FLAGS.batch_size_per_gpu * FLAGS.num_gpus
    config.num_layers = 3
    config.num_steps = 5
    #
    eval_config = get_config(FLAGS)
    eval_config.batch_size = 2
    eval_config.num_layers = 3
    eval_config.num_steps = 5
    #============================ I. Model options ==============================#
    #>>>>>>>>>>>>>>>for PWCnet module network
    nn_opts = deepcopy(_DEFAULT_PWCNET_VAL_OPTIONS)
    # Compare by value: 'is' tests object identity and is not a reliable
    # string comparison.
    if FLAGS.flownet_type == 'small':
        nn_opts['use_dense_cx'] = False
        nn_opts['use_res_cx'] = False
        nn_opts['pyr_lvls'] = 6
        nn_opts['flow_pred_lvl'] = 2
        nn_opts['ckpt_path'] = '/work/cascades/lxiaol9/ARC/PWC/checkpoints/pwcnet-sm-6-2-multisteps-chairsthingsmix/pwcnet.ckpt-592000'  # Model to eval
    else:
        nn_opts['use_dense_cx'] = True
        nn_opts['use_res_cx'] = True
        nn_opts['pyr_lvls'] = 6
        nn_opts['flow_pred_lvl'] = 2
        nn_opts['ckpt_path'] = '/work/cascades/lxiaol9/ARC/PWC/checkpoints/pwcnet-lg-6-2-multisteps-chairsthingsmix/pwcnet.ckpt-595000'
    nn_opts['verbose'] = True
    nn_opts['batch_size'] = 32  # This is Batch_size per GPU(16*4/2/2 = 16)
    nn_opts['use_tf_data'] = False  # Don't use tf.data reader for this simple task
    nn_opts['gpu_devices'] = ['/device:GPU:0', '/device:GPU:1']  #
    nn_opts['controller'] = '/device:CPU:0'  # Evaluate on CPU or GPU?
    nn_opts['adapt_info'] = (1, 436, 1024, 2)
    nn_opts['x_shape'] = [2, 512, 512, 3]  # image pairs input shape [2, H, W, 3]
    nn_opts['y_shape'] = [512, 512, 2]  # u,v flows output shape [H, W, 2]
    #>>>>>>>>>>>>>>>> For EAST module network
    east_opts = {
        'verbose': True,
        'ckpt_path': FLAGS.pretrained_model_path,
        'batch_size': 40,
        'batch_size_per_gpu': 20,
        'gpu_devices': ['/device:GPU:0', '/device:GPU:1'],
        # controller device to put the model's variables on (usually, /cpu:0 or /gpu:0 -> try both!)
        'controller': '/device:CPU:0',
        'x_dtype': tf.float32,  # input image type
        'x_shape': [512, 512, 3],  # input image shape [H, W, 3]
        'y_score_shape': [128, 128, 1],  # score map shape
        'y_geometry_shape': [128, 128, 5],  # geometry map shape
        'x_mask_shape': [128, 128, 1]
    }
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu_list
    # Start from an empty checkpoint directory unless we are restoring.
    if not tf.gfile.Exists(FLAGS.checkpoint_path):
        tf.gfile.MkDir(FLAGS.checkpoint_path)
    elif not FLAGS.restore:
        tf.gfile.DeleteRecursively(FLAGS.checkpoint_path)
        tf.gfile.MkDir(FLAGS.checkpoint_path)
    #=============================== II. building graph for east + agg =================================#
    # 1.1 Input placeholders
    batch_size = FLAGS.batch_size_per_gpu * FLAGS.num_gpus
    len_seq = FLAGS.num_steps
    # input_images = tf.placeholder(tf.float32, shape=[batch_size*len_seq, 512, 512, 3], name='input_images')
    input_feat_maps = tf.placeholder(
        tf.float32, shape=[batch_size, len_seq, 128, 128, 32], name='input_feature_maps')
    input_flow_maps = tf.placeholder(
        tf.float32, shape=[batch_size, len_seq - 1, 128, 128, 2], name='input_flow_maps')
    input_score_maps = tf.placeholder(
        tf.float32, shape=[batch_size, len_seq, 128, 128, 1], name='input_score_maps')
    # RBOX geometry uses 5 channels, any other geometry 8.
    geo_channels = 5 if FLAGS.geometry == 'RBOX' else 8
    input_geo_maps = tf.placeholder(
        tf.float32, shape=[batch_size, len_seq, 128, 128, geo_channels], name='input_geo_maps')
    input_training_masks = tf.placeholder(
        tf.float32, shape=[batch_size, len_seq, 128, 128, 1], name='input_training_masks')
    # 1.2 lr & opt
    global_step = tf.get_variable(
        'global_step', [], initializer=tf.constant_initializer(0), trainable=False)
    learning_rate = tf.train.exponential_decay(
        FLAGS.learning_rate, global_step, decay_steps=10000, decay_rate=0.8, staircase=True)
    opt = tf.train.AdamOptimizer(learning_rate)
    # 1.3 add summary
    tf.summary.scalar('learning_rate', learning_rate)
    # tf.summary.image('input_images', input_images[2:20:5, :, :, :])
    # 1.4 build graph in tf
    # input_images_split = tf.split(input_images, FLAGS.num_gpus)
    input_feature_split = tf.split(input_feat_maps, FLAGS.num_gpus)
    input_score_maps_split = tf.split(input_score_maps, FLAGS.num_gpus)
    input_geo_maps_split = tf.split(input_geo_maps, FLAGS.num_gpus)
    input_training_masks_split = tf.split(input_training_masks, FLAGS.num_gpus)
    input_flow_maps_split = tf.split(input_flow_maps, FLAGS.num_gpus)
    tower_grads = []
    reuse_variables = None
    gpus = list(range(len(FLAGS.gpu_list.split(','))))
    for i, gpu_id in enumerate(gpus):
        with tf.device('/gpu:%d' % gpu_id):
            with tf.name_scope('model_%d' % gpu_id) as scope:
                iis = input_feature_split[i]
                ifms = input_flow_maps_split[i]
                isms = input_score_maps_split[i]
                igms = input_geo_maps_split[i]
                itms = input_training_masks_split[i]
                # model changed to recurrent one, we only need the recurrent loss returned
                total_loss, model_loss = model_gru_agg.tower_loss(
                    iis, ifms, isms, igms, itms,
                    gpu_id=gpu_id, config=config, reuse_variables=reuse_variables)
                # Only the update ops of the last tower survive this loop;
                # total_loss and model_loss likewise refer to the last tower.
                batch_norm_updates_op = tf.group(
                    *tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope))
                # Towers after the first share the variables of the first.
                reuse_variables = True
                # tvar1 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='tiny_embed')
                # tvar2 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='pred_module')
                # tvars = tvar1 + tvar2
                grads = opt.compute_gradients(total_loss)
                tower_grads.append(grads)
    # 1.5 gradient parsering
    grads = average_gradients(tower_grads)
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
    # 1.6 get training operations
    summary_op = tf.summary.merge_all()
    # variable_averages = tf.train.ExponentialMovingAverage(
    #     FLAGS.moving_average_decay, global_step)
    # variables_averages_op = variable_averages.apply(tf.trainable_variables())
    with tf.control_dependencies([apply_gradient_op, batch_norm_updates_op]):
        train_op = tf.no_op(name='train_op')
    # 1.8 Saver & Session & Restore
    saver = tf.train.Saver(tf.global_variables())
    # sv = tf.train.Supervisor()
    summary_writer = tf.summary.FileWriter(FLAGS.checkpoint_path, tf.get_default_graph())
    init = tf.global_variables_initializer()
    g = tf.get_default_graph()
    with g.as_default():
        config1 = tf.ConfigProto()
        config1.gpu_options.allow_growth = True
        config1.allow_soft_placement = True
        sess1 = tf.Session(config=config1)
        if FLAGS.restore:
            print('continue training from previous checkpoint')
            ckpt = FLAGS.prev_checkpoint_path
            saver.restore(sess1, ckpt)
        else:
            sess1.run(init)
            # var_list1 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='multi_rnn_cell')
            # var_list2 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='pred_module')
            # var_list_part1 = var_list1 + var_list2
            # saver_alter1 = tf.train.Saver({v.op.name: v for v in var_list_part1})
            # # var_list3 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='tiny_embed')
            # # var_list4 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='pred_module')
            # # var_list_part2 = var_list3 + var_list4
            # # saver_alter2 = tf.train.Saver({v.op.name: v for v in var_list_part2})
            # print('continue training from previous weights')
            # ckpt1 = FLAGS.prev_checkpoint_path
            # print('Restore from {}'.format(ckpt1))
            # saver_alter1.restore(sess1, ckpt1)
            # # print('continue training from previous Flow weights')
            # # ckpt2 = FLAGS.prev_checkpoint_path
            # # print('Restore from {}'.format(ckpt2))
            # # saver_alter2.restore(sess1, ckpt2)
    #============================= III. Other necessary componets before training =============================#
    print("Step 1: AGG model has been reconstructed")
    GPUtil.showUtilization()
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>> EAST model >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> #
    east_net = model_flow_east.EAST(mode='test', options=east_opts)
    print("Step 2: EAST model has been reconstructed")
    GPUtil.showUtilization()
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>> PWCnet model >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>#
    nn = ModelPWCNet(mode='test', options=nn_opts)
    print("Step 3: PWC model has been reconstructed")
    GPUtil.showUtilization()
    train_data_generator = icdar_light.get_batch_seq(
        num_workers=FLAGS.num_readers, config=config, is_training=True)
    # val_data_generator = icdar.get_batch_seq(num_workers=FLAGS.num_readers, config=eval_config, is_training=False)
    start = time.time()
    #============================= IV. Training over Steps(!!!)================================================#
    print("Now we're starting training!!!")
    for step in range(FLAGS.max_steps):
        #>>>>>>>>>>>>> data
        if FLAGS.mode == "debug":
            # Synthetic all-ones batch: frames, score maps, geometry maps,
            # training masks.
            data = [
                np.ones((config.batch_size, FLAGS.num_steps, 512, 512, 3), dtype=np.float32),
                np.ones((batch_size, len_seq, 128, 128, 1), dtype=np.float32),
                np.ones((batch_size, len_seq, 128, 128, 5), dtype=np.float32),
                np.ones((batch_size, len_seq, 128, 128, 1), dtype=np.float32),
            ]
        else:
            data = next(train_data_generator)
        if step < 3:
            print("Data ready!!!")
        east_feed = np.reshape(data[0], [-1, 512, 512, 3])
        # Consecutive frame pairs: frames 0..3 are targets, 1..4 are sources.
        target_frame = np.reshape(
            np.array(data[0])[:, 0:4, :, :, :], [-1, 512, 512, 3])
        source_frame = np.reshape(
            np.array(data[0])[:, 1:5, :, :, :], [-1, 512, 512, 3])
        flow_feed = np.concatenate(
            (source_frame[:, np.newaxis, :, :, :],
             target_frame[:, np.newaxis, :, :, :]),
            axis=1)
        # >>>>>>>>>>>>>>>>>>>>>>>>>>> feature extraction with EAST >>>>>>>>>>>>>>>>>>>>>>>> #
        rounds = int(east_feed.shape[0] / east_opts['batch_size'])
        feature_stack = []
        flow_maps_stack = []
        for r in range(rounds):
            east_batch = east_feed[r * east_opts['batch_size']:
                                   (r + 1) * east_opts['batch_size'], :, :, :]
            feature_stack.append(
                east_net.sess.run(
                    [east_net.y_hat_test_tnsr],
                    feed_dict={east_net.x_tnsr: east_batch})[0][0])
        feature_maps = np.concatenate(feature_stack, axis=0)
        feature_maps_reshape = np.reshape(
            feature_maps, [-1, config.num_steps, 128, 128, 32])
        #>>>>>>>>>>>>>>> flow estimation with PWCnet
        # x: [batch_size,2,H,W,3] uint8; x_adapt: [batch_size,2,H,W,3] float32
        x_adapt, x_adapt_info = nn.adapt_x(flow_feed)
        if x_adapt_info is not None:
            y_adapt_info = (x_adapt_info[0], x_adapt_info[2], x_adapt_info[3], 2)
        else:
            y_adapt_info = None
        mini_batch = nn_opts['batch_size'] * nn.num_gpus
        rounds = int(flow_feed.shape[0] / mini_batch)
        for r in range(rounds):
            feed_dict = {
                nn.x_tnsr: x_adapt[r * mini_batch:(r + 1) * mini_batch, :, :, :, :]
            }
            y_hat = nn.sess.run(nn.y_hat_test_tnsr, feed_dict=feed_dict)
            if FLAGS.mode == "debug":
                print("Step 5: now finish running one round of PWCnet for flow estimation")
                GPUtil.showUtilization()
            y_hats, _ = nn.postproc_y_hat_test(
                y_hat, y_adapt_info)  # suppose to be [batch, height, width, 2]
            # Subsample every 4th pixel and divide the flow values by 4.
            flow_maps_stack.append(y_hats[:, 1::4, 1::4, :] / 4)
        flow_maps = np.concatenate(flow_maps_stack, axis=0)
        print("flow maps has shape ", flow_maps.shape[:])
        flow_maps = np.reshape(flow_maps, [-1, FLAGS.num_steps - 1, 128, 128, 2])
        #>>>>>>>>>>>>>>> running training session
        train_feed = {
            input_feat_maps: feature_maps_reshape,
            input_score_maps: data[1],
            input_geo_maps: data[2],
            input_training_masks: data[3],
            input_flow_maps: flow_maps
        }
        with g.as_default():
            ml, tl, _ = sess1.run([model_loss, total_loss, train_op], feed_dict=train_feed)
        if FLAGS.mode == "debug":
            print("Step 6: running one round on training!!!")
            GPUtil.showUtilization()
        if np.isnan(tl):
            print('Loss diverged, stop training')
            break
        if step % 10 == 0:
            avg_time_per_step = (time.time() - start) / 10
            avg_examples_per_second = (
                10 * FLAGS.batch_size_per_gpu * len(gpus)) / (time.time() - start)
            start = time.time()
            print(
                'Step {:06d}, model loss {:.4f}, total loss {:.4f}, {:.2f} seconds/step, {:.2f} examples/second'
                .format(step, ml, tl, avg_time_per_step, avg_examples_per_second))
        if step % FLAGS.save_checkpoint_steps == 0:
            saver.save(sess1, FLAGS.checkpoint_path + 'model.ckpt', global_step=global_step)
        if step % FLAGS.save_summary_steps == 0:
            # NOTE(review): fetching train_op here applies a second update on
            # the same batch - confirm this is intended.
            _, tl, summary_str = sess1.run(
                [train_op, total_loss, summary_op], feed_dict=train_feed)
            summary_writer.add_summary(summary_str, global_step=step)
def k_m_tf(defect_tensor,
           clusters,
           max_iters,
           summaries_dir,
           stage_str,
           name_str,
           go_to_max=False):
    """Cluster the rows of ``defect_tensor`` with k-means and log summaries.

    Args:
        defect_tensor: 2-D array indexed as ``[row, column]``; each row is
            one point. It is fed into the graph on every iteration.
        clusters: Number of clusters.
        max_iters: Maximum number of update iterations.
        summaries_dir: Root directory for the summary writer.
        stage_str: Sub-directory of ``summaries_dir``.
        name_str: Run name, used as the leaf directory under ``kmeans``.
        go_to_max: When true, always run ``max_iters`` iterations instead
            of stopping once the assignments stop changing.

    Returns:
        ``(centers, assignments)``: the final values of the centroid and
        cluster-assignment variables. Assignments are stored as float32
        cluster indices.

    Side effects:
        Writes summaries under
        ``summaries_dir/stage_str/kmeans/name_str``, prints timing and the
        cluster size distribution, and resets the default graph.
    """
    length = len(defect_tensor[:, 0])
    num_clus = clusters
    MAX_ITERS = max_iters
    tiles = len(defect_tensor[0, :])
    start = time.time()
    sess = tf.InteractiveSession()

    with tf.name_scope('input'):
        points = tf.Variable(tf.random_uniform([length, tiles]), dtype=tf.float32)
    with tf.name_scope('cluster_assigns'):
        cluster_assignments = tf.Variable(tf.zeros([length], dtype=tf.float32))
    with tf.name_scope('cents'):
        centroids = tf.Variable(
            tf.random_crop(points.initialized_value(), [num_clus, tiles]),
            dtype=tf.float32)
    # centroids = tf.Print(centroids,[centroids], summarize = 16, message = 'centroids')

    # Replicate to N copies of each centroid and K copies of each
    # point, then subtract and compute the sum of squared distances.
    with tf.name_scope('Replicate'):
        rep_centroids = tf.reshape(
            tf.tile(centroids, [length, 1]), [length, num_clus, tiles])
        # rep_centroids = tf.Print(rep_centroids,[tf.shape(rep_centroids)],message='shape_rep_centroids')
        rep_points = tf.reshape(
            tf.tile(points, [1, num_clus]), [length, num_clus, tiles])
    with tf.name_scope('Sum_squares'):
        squares = tf.square(rep_points - rep_centroids)
        # `squares` is already squared; summing it directly gives the squared
        # distance (squaring again would produce a 4th-power distance).
        sum_squares = tf.reduce_sum(squares, reduction_indices=2)
        squares_1d = tf.scalar_summary('sum_squares', tf.reduce_mean(sum_squares))
    # sum_squares = tf.Print(sum_squares,[sum_squares], summarize = 40, message = 'sum_squares')
    # sum_squares = tf.Print(sum_squares,[tf.shape(sum_squares)], summarize = 16, message = 'sum_squares_shape')

    # Use argmin to select the lowest-distance point
    with tf.name_scope('argmin'):
        best_centroids = tf.argmin(sum_squares, 1)
    # best_centroids = tf.Print(best_centroids,[best_centroids], summarize = 40, message = ' best_cents')
    did_assignments_change = tf.reduce_any(
        tf.not_equal(tf.cast(best_centroids, tf.float32), cluster_assignments))

    ## This part exists for counting purposes, since I can't simply access the count in the means part
    with tf.name_scope('counting'):
        const_1d = {}
        num_1d = {}
        found_1d = {}
        scalar_1d = {}
        # Sized from the input rather than a fixed row count so any number
        # of points works.
        for i in range(0, num_clus):
            const_1d[i] = tf.constant(i, shape=[length, 1], dtype=tf.int64)
        for i in range(0, num_clus):
            num_1d[i] = tf.equal(tf.reshape(best_centroids, [length, 1]), const_1d[i])
            found_1d[i] = tf.reduce_sum(tf.cast(num_1d[i], tf.int32))
            found_1d[i] = tf.expand_dims(found_1d[i], -1)
            scalar_1d[i] = tf.scalar_summary(str(i), tf.squeeze(found_1d[i]))
        found_tensor = tf.concat(0, [found_1d[i] for i in range(0, num_clus)])
        distro = tf.histogram_summary('Distribution', found_tensor)

    ## calculate the means at the indices of best_centroids.
    with tf.name_scope('means'):
        total = tf.unsorted_segment_sum(points, best_centroids, num_clus)
        count = tf.unsorted_segment_sum(tf.ones_like(points), best_centroids, num_clus)
        # count = tf.Print(count, [tf.shape(count)])
        means = total / count
        # Empty clusters divide 0 by 0; replace the resulting NaNs with 0.
        means = tf.select(tf.is_nan(means), tf.ones_like(means) * 0, means)
        means_1d = tf.scalar_summary('means', tf.reduce_mean(means))
    # means = tf.Print(means,[means],summarize = 16, message = 'MEANS')
    # means = tf.Print(means,[tf.shape(means)], message = 'm_shape')

    # Do not write to the assigned clusters variable until after
    # computing whether the assignments have changed - hence with_dependencies
    with tf.name_scope('Do_updates'):
        with tf.control_dependencies([did_assignments_change]):
            do_updates = tf.group(
                centroids.assign(means),
                cluster_assignments.assign(tf.cast(best_centroids, tf.float32)))

    changed = True
    iters = 0

    # Merge summaries
    scalar_summary = tf.merge_summary(
        [scalar_1d[i] for i in range(0, num_clus)])
    other_summary = tf.merge_summary([means_1d, squares_1d])
    histogram_summary = tf.merge_summary([distro])
    writer = tf.train.SummaryWriter(
        summaries_dir + '/' + stage_str + '/kmeans/' + name_str, sess.graph)
    init = tf.initialize_all_variables()
    sess.run(init)

    # loop
    # check for assignment changes and assign new based on new means. If assignments didnt change, stop.
    while changed and iters < MAX_ITERS:
        iters += 1
        # NOTE(review): run_options/run_metadata are never passed to
        # sess.run, so the metadata written below is empty - confirm whether
        # tracing was meant to be enabled.
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        [changed, _, histogram_sum_run, scalar_sum_run, other_sum_run] = sess.run(
            [did_assignments_change, do_updates, histogram_summary,
             scalar_summary, other_summary],
            feed_dict={points: defect_tensor})
        writer.add_run_metadata(run_metadata, 'step%03d' % iters)
        writer.add_summary(histogram_sum_run, iters)
        writer.add_summary(scalar_sum_run, iters)
        writer.add_summary(other_sum_run, iters)
        # The per-iteration evaluations of found_1d[1], [3] and [4] were
        # removed: their results were never used and the fixed indices fail
        # for fewer than five clusters.
        if go_to_max:
            changed = True
    writer.close()

    [centers, assignments] = sess.run([centroids, cluster_assignments])
    end = time.time()
    print("Found in %.2f seconds" % (end - start), iters, "iterations")
    print('Distribution:', sess.run(found_tensor, feed_dict={points: defect_tensor}))
    # Close the session before discarding the graph it was built on.
    sess.close()
    tf.reset_default_graph()
    return centers, assignments
def train(submit_config: dnnlib.SubmitConfig, iteration_count: int,
          eval_interval: int, minibatch_size: int, learning_rate: float,
          ramp_down_perc: float, noise: dict, validation_config: dict,
          train_tfrecords: str, noise2noise: bool):
    """Build the multi-GPU training graph and run the training loop.

    Args:
        submit_config: run configuration; `num_gpus`, `run_dir` and `run_id`
            are read from it.
        iteration_count: number of training iterations to run.
        eval_interval: every `eval_interval` iterations sample images are
            saved, the validation set is evaluated, summaries are written and
            a snapshot is stored.
        minibatch_size: batch size handed to `create_dataset`.
        learning_rate: base learning rate fed to `compute_ramped_down_lrate`.
        ramp_down_perc: ramp-down setting fed to `compute_ramped_down_lrate`.
        noise: keyword arguments for `dnnlib.util.call_func_by_name`, which
            builds the noise augmenter.
        validation_config: keyword arguments for `ValidationSet.load`.
        train_tfrecords: training data location handed to `create_dataset`.
        noise2noise: if True the loss is a difference of two
            `fourier_ring_correlation` terms computed from the noisy pair;
            otherwise it is the mean squared error against the clean target.
    """
    noise_augmenter = dnnlib.util.call_func_by_name(**noise)
    validation_set = ValidationSet(submit_config)
    validation_set.load(**validation_config)

    # Create a run context (hides low level details, exposes simple API to manage the run)
    ctx = dnnlib.RunContext(submit_config, config)

    # Initialize TensorFlow graph and session using good default settings
    tfutil.init_tf(config.tf_config)

    dataset_iter = create_dataset(train_tfrecords, minibatch_size,
                                  noise_augmenter.add_train_noise_tf)

    # Construct the network using the Network helper class and a function defined in config.net_config
    with tf.device("/gpu:0"):
        net = tflib.Network(**config.net_config)

    # Optionally print layer information
    net.print_layers()

    print('Building TensorFlow graph...')
    with tf.name_scope('Inputs'), tf.device("/cpu:0"):
        lrate_in = tf.compat.v1.placeholder(tf.float32, name='lrate_in', shape=[])

        noisy_input, noisy_target, clean_target = dataset_iter.get_next()
        # Split [?, 3, 256, 256] across num_gpus over axis 0 (i.e. the batch)
        noisy_input_split = tf.split(noisy_input, submit_config.num_gpus)
        noisy_target_split = tf.split(noisy_target, submit_config.num_gpus)
        print(len(noisy_input_split), noisy_input_split)
        clean_target_split = tf.split(clean_target, submit_config.num_gpus)

    # Define the loss function using the Optimizer helper class, this will take care of multi GPU
    opt = tflib.Optimizer(learning_rate=lrate_in, **config.optimizer_config)

    # Radial masks consumed by fourier_ring_correlation (image size 256, binning = 3).
    radii = np.arange(128).reshape(128, 1)
    radial_masks = np.apply_along_axis(radial_mask, 1, radii, 128, 128,
                                       np.arange(0, 256), np.arange(0, 256), 20)
    print("RN SHAPE!!!!!!!!!!:", radial_masks.shape)
    radial_masks = np.expand_dims(radial_masks, 1)  # (128, 1, 256, 256)
    radial_masks = radial_masks.astype(np.complex64)
    radial_masks = tf.expand_dims(radial_masks, 1)

    rn = tf.compat.v1.placeholder_with_default(radial_masks, [128, None, 1, 256, 256])
    rn_split = tf.split(rn, submit_config.num_gpus, axis=1)

    # Spatial frequencies normalised to [0, 1].
    freq_nyq = int(np.floor(int(256) / 2.0))
    spatial_freq = radii.astype(np.float32) / freq_nyq
    spatial_freq = spatial_freq / max(spatial_freq)

    for gpu in range(submit_config.num_gpus):
        with tf.device("/gpu:%d" % gpu):
            net_gpu = net if gpu == 0 else net.clone()

            denoised_1 = net_gpu.get_output_for(noisy_input_split[gpu])
            denoised_2 = net_gpu.get_output_for(noisy_target_split[gpu])
            print(noisy_input_split[gpu].get_shape(), rn_split[gpu].get_shape())

            if noise2noise:
                meansq_error = fourier_ring_correlation(
                    noisy_target_split[gpu], denoised_1, rn_split[gpu],
                    spatial_freq) - fourier_ring_correlation(
                        noisy_target_split[gpu] - denoised_2,
                        noisy_input_split[gpu] - denoised_1, rn_split[gpu],
                        spatial_freq)
            else:
                # Supervised loss against the clean target.  The original code
                # referenced `denoised`, which is not bound at graph-building
                # time; the output for the noisy input is `denoised_1`.
                meansq_error = tf.reduce_mean(
                    tf.square(clean_target_split[gpu] - denoised_1))

            # Create an autosummary that will average over all GPUs
            with tf.control_dependencies([autosummary("Loss", meansq_error)]):
                opt.register_gradients(meansq_error, net_gpu.trainables)

    train_step = opt.apply_updates()

    # Create a log file for Tensorboard
    summary_log = tf.compat.v1.summary.FileWriter(submit_config.run_dir)
    summary_log.add_graph(tf.compat.v1.get_default_graph())

    # Looked up once: indexing a tensor adds slice ops to the graph, so doing
    # this inside the loop would grow the graph at every evaluation step.
    input_elem_dtype = noisy_input[0][0].dtype

    print('Training...')
    time_maintenance = ctx.get_time_since_last_update()
    ctx.update(loss='run %d' % submit_config.run_id, cur_epoch=0,
               max_epoch=iteration_count)

    # The actual training loop
    for i in range(iteration_count):
        # Whether to stop the training or not should be asked from the context
        if ctx.should_stop():
            break

        # Dump training status
        if i % eval_interval == 0:
            time_train = ctx.get_time_since_last_update()
            time_total = ctx.get_time_since_start()
            print("DEBUG TRAIN!", noisy_input.dtype, input_elem_dtype)

            # Evaluate 'x' to draw a batch of inputs
            [source_mb, target_mb] = tfutil.run([noisy_input, clean_target])
            denoised_mb = net.run(source_mb)
            save_image(submit_config, denoised_mb[0], "img_{0}_y_pred.tif".format(i))
            save_image(submit_config, target_mb[0], "img_{0}_y.tif".format(i))
            save_image(submit_config, source_mb[0], "img_{0}_x_aug.tif".format(i))

            validation_set.evaluate(net, i, noise_augmenter.add_validation_noise_np)

            print(
                'iter %-10d time %-12s sec/eval %-7.1f sec/iter %-7.2f maintenance %-6.1f'
                % (autosummary('Timing/iter', i),
                   dnnlib.util.format_time(
                       autosummary('Timing/total_sec', time_total)),
                   autosummary('Timing/sec_per_eval', time_train),
                   autosummary('Timing/sec_per_iter', time_train / eval_interval),
                   autosummary('Timing/maintenance_sec', time_maintenance)))

            dnnlib.tflib.autosummary.save_summaries(summary_log, i)
            ctx.update(loss='run %d' % submit_config.run_id, cur_epoch=i,
                       max_epoch=iteration_count)
            time_maintenance = ctx.get_last_update_interval() - time_train

            save_snapshot(submit_config, net, str(i))

        # One optimisation step with the ramped-down learning rate.
        lrate = compute_ramped_down_lrate(i, iteration_count, ramp_down_perc,
                                          learning_rate)
        tfutil.run([train_step], {lrate_in: lrate})

    print("Elapsed time: {0}".format(
        util.format_time(ctx.get_time_since_start())))
    save_snapshot(submit_config, net, 'final')

    # Summary log and context should be closed at the end
    summary_log.close()
    ctx.close()
# Graph construction, optimizer setup and (optional) checkpoint restore.
# NOTE(review): `net_input`, `net_output`, `num_classes`, `model_mode`, `dp`,
# `sess` and `args` are defined outside this fragment.
train_state = True
val_state = False

# Build the network; its output is used as the logits of the loss below.
network = build_nukev8(inputs=net_input, num_classes=num_classes, is_training=model_mode, dropout_p=dp)

# Total loss = mean softmax cross-entropy + sum of all registered
# regularization losses.
reg_loss = tf.add_n(tf.losses.get_regularization_losses())
net_output = tf.cast(net_output, tf.float64)
loss = tf.reduce_mean(
    tf.losses.softmax_cross_entropy(logits=network, onehot_labels=net_output)) + reg_loss

# Ops collected under UPDATE_OPS are made a control dependency of the
# training op, so they run on every optimisation step.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
print('finish update ops')
with tf.control_dependencies(update_ops):
    optimizer = tf.train.RMSPropOptimizer(
        learning_rate=0.0001, decay=0.995).minimize(
            loss, var_list=[var for var in tf.trainable_variables()])

saver = tf.train.Saver(max_to_keep=1000)
sess.run(tf.global_variables_initializer())

# If a pre-trained ResNet is required, load the weights.
# This must be done AFTER the variables are initialized with sess.run(tf.global_variables_initializer())

# Load a previous checkpoint if desired
model_checkpoint_name = "./latest_model" + ".ckpt"
if args.continue_training:
    print('Loaded latest model checkpoint')
    #saver.restore(sess, "079_trnlss_loss:0.228071_iou:0.695515_momentum_gnorm_/model.ckpt")
    saver.restore(sess, model_checkpoint_name)
def _random_crop(image_list, crop_height, crop_width): """Crops the given list of images. The function applies the same crop to each image in the list. This can be effectively applied when there are multiple image inputs of the same dimension such as: image, depths, normals = _random_crop([image, depths, normals], 120, 150) Args: image_list: a list of image tensors of the same dimension but possibly varying channel. crop_height: the new height. crop_width: the new width. Returns: the image_list with cropped images. Raises: ValueError: if there are multiple image inputs provided with different size or the images are smaller than the crop dimensions. """ if not image_list: raise ValueError('Empty image_list.') # Compute the rank assertions. rank_assertions = [] for i in range(len(image_list)): image_rank = tf.rank(image_list[i]) rank_assert = tf.Assert( tf.equal(image_rank, 3), ['Wrong rank for tensor %s [expected] [actual]', image_list[i].name, 3, image_rank]) rank_assertions.append(rank_assert) with tf.control_dependencies([rank_assertions[0]]): image_shape = tf.shape(image_list[0]) image_height = image_shape[0] image_width = image_shape[1] crop_size_assert = tf.Assert( tf.logical_and( tf.greater_equal(image_height, crop_height), tf.greater_equal(image_width, crop_width)), ['Crop size greater than the image size.']) asserts = [rank_assertions[0], crop_size_assert] for i in range(1, len(image_list)): image = image_list[i] asserts.append(rank_assertions[i]) with tf.control_dependencies([rank_assertions[i]]): shape = tf.shape(image) height = shape[0] width = shape[1] height_assert = tf.Assert( tf.equal(height, image_height), ['Wrong height for tensor %s [expected][actual]', image.name, height, image_height]) width_assert = tf.Assert( tf.equal(width, image_width), ['Wrong width for tensor %s [expected][actual]', image.name, width, image_width]) asserts.extend([height_assert, width_assert]) # Create a random bounding box. 
# # Use tf.random_uniform and not numpy.random.rand as doing the former would # generate random numbers at graph eval time, unlike the latter which # generates random numbers at graph definition time. with tf.control_dependencies(asserts): max_offset_height = tf.reshape(image_height - crop_height + 1, []) with tf.control_dependencies(asserts): max_offset_width = tf.reshape(image_width - crop_width + 1, []) offset_height = tf.random_uniform( [], maxval=max_offset_height, dtype=tf.int32) offset_width = tf.random_uniform( [], maxval=max_offset_width, dtype=tf.int32) return [_crop(image, offset_height, offset_width, crop_height, crop_width) for image in image_list]
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    """Performs various post-processing on a word embedding tensor.

    Optionally adds token-type and position embeddings to the input, then
    applies layer normalization and dropout.

    Args:
        input_tensor: float Tensor of shape [batch_size, seq_length,
            embedding_size].
        use_token_type: bool. Whether to add embeddings for `token_type_ids`.
        token_type_ids: (optional) int32 Tensor of shape [batch_size,
            seq_length]. Must be specified if `use_token_type` is True.
        token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
        token_type_embedding_name: string. The name of the embedding table
            variable for token type ids.
        use_position_embeddings: bool. Whether to add position embeddings for
            the position of each token in the sequence.
        position_embedding_name: string. The name of the embedding table
            variable for positional embeddings.
        initializer_range: float. Range of the weight initialization.
        max_position_embeddings: int. Maximum sequence length that might ever
            be used with this model. This can be longer than the sequence
            length of input_tensor, but cannot be shorter.
        dropout_prob: float. Dropout probability applied to the final output
            tensor.

    Returns:
        float tensor with same shape as `input_tensor`.

    Raises:
        ValueError: One of the tensor shapes or input values is invalid.
    """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            raise ValueError("`token_type_ids` must be specified if "
                             "`use_token_type` is True.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # This vocab will be small so we always do one-hot here, since it is
        # always faster for a small vocabulary.
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        output += token_type_embeddings

    if use_position_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range))
            # Since the position embedding table is a learned variable, we
            # create it using a (long) sequence length
            # `max_position_embeddings`. The actual sequence length might be
            # shorter than this, for faster training of tasks that do not have
            # long sequences.
            #
            # So `full_position_embeddings` is effectively an embedding table
            # for position [0, 1, 2, ..., max_position_embeddings-1], and the
            # current sequence has positions [0, 1, 2, ... seq_length-1], so we
            # can just perform a slice.
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
            num_dims = len(output.shape.as_list())

            # Only the last two dimensions are relevant (`seq_length` and
            # `width`), so we broadcast among the first dimensions, which is
            # typically just the batch size.
            position_broadcast_shape = [1] * (num_dims - 2) + [seq_length, width]
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)
            output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
def train(model, data,
          batch_size=128,
          learning_rate=FLAGS.learning_rate,
          log_dir='./log',
          checkpoint_dir='./checkpoint',
          num_epochs=-1):
    """Build the training graph for `model` and train it epoch by epoch.

    After every epoch a checkpoint is saved, `evaluate` is run and the
    training/test summaries are written to `log_dir`.

    Args:
        model: callable invoked as `model(x, is_training=True)`; its output is
            used as the logits.
        data: input source. `data.size[0]` is used as the number of samples
            per epoch; for non-imagenet datasets `data.generate_batches` is
            called to produce the input tensors.
        batch_size: number of samples per training step.
        learning_rate: learning rate passed to
            `tf.contrib.layers.optimize_loss`.
        log_dir: directory for the summary writer.
        checkpoint_dir: directory checkpoints are saved to / restored from.
        num_epochs: training stops when the epoch counter equals this value;
            the default of -1 is never reached, so training runs until
            interrupted.

    Returns:
        None. Returns early (before training) when `FLAGS.resume` is set but
        no checkpoint is found in `checkpoint_dir`.
    """
    # tf Graph input
    with tf.device('/cpu:0'):
        with tf.name_scope('data'):
            if FLAGS.dataset == "imagenet":
                x, yt = image_processing.distorted_inputs(
                    data, batch_size=batch_size,
                    num_preprocess_threads=FLAGS.num_threads)
            else:
                x, yt = data.generate_batches(batch_size,
                                              num_threads=FLAGS.num_threads)
        global_step = tf.get_variable('global_step', shape=[], dtype=tf.int64,
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

    if FLAGS.gpu:
        device_str = '/gpu:' + str(FLAGS.device)
    else:
        device_str = '/cpu:0'

    with tf.device(device_str):
        y = model(x, is_training=True)
        # Define loss and optimizer
        with tf.name_scope('objective'):
            loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels=yt, logits=y))
        tf.contrib.quantize.create_training_graph()
        with tf.name_scope('objective'):
            accuracy = tf.reduce_mean(
                tf.cast(tf.nn.in_top_k(y, yt, 1), tf.float32))
        decay_fn = (learning_rate_decay_fn
                    if FLAGS.using_learning_rate_decay_fn else None)
        opt = tf.contrib.layers.optimize_loss(
            loss, global_step, learning_rate, 'Adam',
            gradient_noise_scale=None,
            gradient_multipliers=None,
            clip_gradients=None,
            learning_rate_decay_fn=decay_fn,
            update_ops=None,
            variables=None,
            name=None)

    # Moving averages of loss/accuracy (used for the training summaries) and
    # of the trainable variables.
    ema = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step,
                                            name='average')
    ema_op = ema.apply([loss, accuracy] + tf.trainable_variables())
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, ema_op)

    loss_avg = ema.average(loss)
    tf.summary.scalar('loss/training', loss_avg)
    accuracy_avg = ema.average(accuracy)
    tf.summary.scalar('accuracy/training', accuracy_avg)

    # Fail fast when the loss becomes NaN/Inf.
    check_loss = tf.check_numerics(loss, 'model diverged: loss->nan')
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, check_loss)
    updates_collection = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies([opt]):
        train_op = tf.group(*updates_collection)

    if FLAGS.summary:
        add_summaries(scalar_list=[accuracy, accuracy_avg, loss, loss_avg],
                      activation_list=tf.get_collection(
                          tf.GraphKeys.ACTIVATIONS),
                      var_list=tf.trainable_variables())
    summary_op = tf.summary.merge_all()

    # Configure options for session
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        log_device_placement=False,
        allow_soft_placement=True,
        gpu_options=gpu_options,
    ))

    if FLAGS.resume:
        logging.info('resuming from ' + checkpoint_dir)
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir + '/')
        if ckpt and ckpt.model_checkpoint_path:
            # Restores from checkpoint
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('No checkpoint file found')
            return
    else:
        saver = tf.train.Saver(max_to_keep=5)
        # Fresh run: initialise instead of restoring.
        sess.run(tf.global_variables_initializer())

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    # Integer division keeps the batch and epoch counters integral; with true
    # division a resumed `epoch` is a float and `epoch != num_epochs` may
    # never become false.
    num_batches = data.size[0] // batch_size
    summary_writer = tf.summary.FileWriter(log_dir, graph=sess.graph)
    epoch = global_step.eval() // num_batches if FLAGS.resume else 0

    logging.info('num of trainable paramaters: %d' %
                 count_params(tf.trainable_variables()))

    while epoch != num_epochs:
        curr_step = 0
        logging.info('Started epoch %d' % epoch)
        bar = Bar('Training', max=num_batches,
                  suffix='%(percent)d%% eta: %(eta)ds')
        # NOTE: periodic in-epoch logging/testing is disabled; summaries and
        # evaluation happen once per epoch below.
        while curr_step < data.size[0]:
            sess.run([train_op, loss, global_step])
            # Advance by the batch size this graph was actually built with.
            curr_step += batch_size
            bar.next()
        bar.finish()

        step, acc_value, loss_value, summary = sess.run(
            [global_step, accuracy_avg, loss_avg, summary_op])
        saver.save(sess, save_path=checkpoint_dir + '/model.ckpt',
                   global_step=global_step)
        test_top1, test_top5, test_loss = evaluate(
            model, FLAGS.dataset,
            batch_size=batch_size,
            checkpoint_dir=checkpoint_dir)
        logging.info('Test loss %.3f Test top1 %.3f Test top5 %.3f' %
                     (test_loss, test_top1, test_top5))

        # Append the test metrics to the serialized training summary.
        summary_out = tf.Summary()
        summary_out.ParseFromString(summary)
        summary_out.value.add(tag='accuracy/test_top1', simple_value=test_top1)
        summary_out.value.add(tag='accuracy/test_top5', simple_value=test_top5)
        summary_out.value.add(tag='loss/test', simple_value=test_loss)
        summary_writer.add_summary(summary_out, step)
        summary_writer.flush()
        logging.info("Finished epoch %d " % epoch)
        epoch += 1

    # When done, ask the threads to stop.
    coord.request_stop()
    coord.join(threads)
    coord.clear_stop()
    summary_writer.close()
def run(self, *in_arrays,
        return_as_list=False,  # True = return a list of NumPy arrays, False = return a single NumPy array, or a tuple if there are multiple outputs.
        print_progress=True,  # Print progress to the console? Useful for very large input arrays.
        minibatch_size=None,  # Maximum minibatch size to use, None = disable batching.
        num_gpus=1,  # Number of GPUs to use.
        out_mul=1.0,  # Multiplicative constant to apply to the output(s).
        out_add=0.0,  # Additive constant to apply to the output(s).
        out_shrink=1,  # Shrink the spatial dimensions of the output(s) by the given factor.
        out_dtype=None,  # Convert the output to the specified data type.
        **dynamic_kwargs):  # Additional keyword arguments to pass into the network construction function.
    """Evaluate the network on NumPy inputs, minibatch by minibatch.

    The output expressions are built once per distinct combination of
    `dynamic_kwargs`, `num_gpus` and the `out_*` options and cached in
    `self._run_cache`. The inputs are then fed through the default session in
    slices of at most `minibatch_size` items and the results are collected
    into preallocated arrays.

    NOTE: the number of `in_arrays` is not checked against the network's
    inputs; the item count is taken from the first array.
    """
    total = in_arrays[0].shape[0]
    batch = total if minibatch_size is None else minibatch_size
    cache_key = str([sorted(dynamic_kwargs.items()), num_gpus, out_mul,
                     out_add, out_shrink, out_dtype])

    # Build the graph for this configuration on first use.
    if cache_key not in self._run_cache:
        with absolute_name_scope(self.scope + '/Run'), tf.control_dependencies(None):
            per_input_splits = [tf.split(x, num_gpus) for x in self.input_templates]
            per_gpu_inputs = list(zip(*per_input_splits))
            per_gpu_outputs = []
            for gpu in range(num_gpus):
                with tf.device('/gpu:%d' % gpu):
                    outputs = self.get_output_for(*per_gpu_inputs[gpu],
                                                  return_as_list=True,
                                                  **dynamic_kwargs)
                    # Optional post-processing: scale, shift, spatial
                    # average-pool, dtype conversion.
                    if out_mul != 1.0:
                        outputs = [t * out_mul for t in outputs]
                    if out_add != 0.0:
                        outputs = [t + out_add for t in outputs]
                    if out_shrink > 1:
                        window = [1, 1, out_shrink, out_shrink]
                        outputs = [
                            tf.nn.avg_pool(t, ksize=window, strides=window,
                                           padding='VALID', data_format='NCHW')
                            for t in outputs
                        ]
                    if out_dtype is not None:
                        # Round before casting to an integer type.
                        if tf.as_dtype(out_dtype).is_integer:
                            outputs = [tf.round(t) for t in outputs]
                        outputs = [tf.saturate_cast(t, out_dtype) for t in outputs]
                    per_gpu_outputs.append(outputs)
            # Re-join the per-GPU pieces of every output along the batch axis.
            self._run_cache[cache_key] = [
                tf.concat(pieces, axis=0) for pieces in zip(*per_gpu_outputs)
            ]

    # Run minibatches.
    fetches = self._run_cache[cache_key]
    results = [
        np.empty([total] + shape_to_list(t.shape)[1:], t.dtype.name)
        for t in fetches
    ]
    for start in range(0, total, batch):
        if print_progress:
            print('\r%d / %d' % (start, total), end='')
        stop = min(start + batch, total)
        feed = dict(zip(self.input_templates,
                        [arr[start:stop] for arr in in_arrays]))
        fetched = tf.get_default_session().run(fetches, feed)
        for dst, src in zip(results, fetched):
            dst[start:stop] = src

    # Done.
    if print_progress:
        print('\r%d / %d' % (total, total))
    if return_as_list:
        return results
    return results[0] if len(results) == 1 else tuple(results)
def train(train_record_file,
          train_log_step,
          train_param,
          val_record_file,
          val_log_step,
          labels_nums,
          data_shape,
          snapshot,
          snapshot_prefix):
    '''
    Build the Inception-v3 training graph and start the training loop.

    :param train_record_file: tfrecord file with the training data
    :param train_log_step: interval at which training log info is shown
    :param train_param: training parameters, [base_lr, max_steps]
    :param val_record_file: tfrecord file with the validation data
    :param val_log_step: interval at which validation log info is shown
    :param labels_nums: number of labels
    :param data_shape: input data shape,
                       [batch_size, resize_height, resize_width, depths]
    :param snapshot: interval at which the model is saved
    :param snapshot_prefix: file-name prefix of the saved model
    :return:
    '''
    # NOTE(review): `input_images`, `input_labels`, `keep_prob` and
    # `is_training` are not defined in this function; presumably they are
    # module-level placeholders - confirm.
    base_lr, _max_steps = train_param
    batch_size, resize_height, resize_width, _depths = data_shape

    # Number of training and validation samples.
    train_nums = get_example_nums(train_record_file)
    val_nums = get_example_nums(val_record_file)
    print('train nums:%d,val nums:%d'%(train_nums,val_nums))

    # Read images and labels from the records.
    # Training data: shuffled (shuffle=True).
    train_images, train_labels = read_records(
        train_record_file, resize_height, resize_width, type='normalization')
    train_images_batch, train_labels_batch = get_batch_images(
        train_images, train_labels,
        batch_size=batch_size, labels_nums=labels_nums,
        one_hot=True, shuffle=True)
    # Validation data: not shuffled.
    val_images, val_labels = read_records(
        val_record_file, resize_height, resize_width, type='normalization')
    val_images_batch, val_labels_batch = get_batch_images(
        val_images, val_labels,
        batch_size=batch_size, labels_nums=labels_nums,
        one_hot=True, shuffle=False)

    # Define the model:
    with slim.arg_scope(inception_v3.inception_v3_arg_scope()):
        out, end_points = inception_v3.inception_v3(
            inputs=input_images, num_classes=labels_nums,
            dropout_keep_prob=keep_prob, is_training=is_training)

    # Specify the loss function: losses created through tf.losses are added to
    # the loss collection automatically, so no add_loss() call is needed.
    tf.losses.softmax_cross_entropy(onehot_labels=input_labels, logits=out)
    # Cross-entropy plus the regularization losses.
    loss = tf.losses.get_total_loss(add_regularization_losses=True)

    # Specify the optimization scheme:
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=base_lr)

    # The ops collected under UPDATE_OPS (the original comment names the
    # `batch_norm` average/variance updates) are not part of the normal
    # training step, so they are fetched here and made a control dependency
    # of the train op: updates run first, then the training step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        # create_train_op ensures that when we evaluate it to get the loss,
        # the update_ops are done and the gradient updates are computed.
        train_op = slim.learning.create_train_op(total_loss=loss,
                                                 optimizer=optimizer)

    # Fraction of samples whose arg-max prediction matches the one-hot label.
    predicted_class = tf.argmax(out, 1)
    true_class = tf.argmax(input_labels, 1)
    hits = tf.cast(tf.equal(predicted_class, true_class), tf.float32)
    accuracy = tf.reduce_mean(hits)

    # Training iterations.
    step_train(train_op, loss, accuracy,
               train_images_batch, train_labels_batch, train_nums, train_log_step,
               val_images_batch, val_labels_batch, val_nums, val_log_step,
               snapshot_prefix, snapshot)
def _build_ops(self, lm_graph):
    """Assemble the per-layer LM embeddings with BOS/EOS positions removed.

    Returns a dict with:
        'lm_embeddings': all layers stacked along a new axis 1,
        'lengths': sequence lengths minus 2 (BOS and EOS removed),
        'token_embeddings': `lm_graph.embedding`,
        'mask': boolean mask with the BOS/EOS positions removed.
    """
    with tf.control_dependencies([lm_graph.update_state_op]):
        # Layer 0 is the token embedding concatenated with itself along axis
        # 2 (presumably so its width matches the forward+backward LSTM
        # layers - TODO confirm); the remaining layers join forward and
        # backward LSTM outputs.
        embeddings = lm_graph.embedding
        stacked = [tf.concat([embeddings, embeddings], axis=2)]
        for idx in range(len(lm_graph.lstm_outputs['forward'])):
            stacked.append(
                tf.concat([
                    lm_graph.lstm_outputs['forward'][idx],
                    lm_graph.lstm_outputs['backward'][idx]
                ], axis=-1))

        # The layers include the BOS/EOS tokens.  Remove them: drop the first
        # step (BOS), reverse each sequence so the EOS step comes first, drop
        # that step, then reverse back.
        lengths_no_bos_eos = lm_graph.sequence_lengths - 2
        trimmed_layers = []
        for layer in stacked:
            without_bos = layer[:, 1:, :]
            flipped = tf.reverse_sequence(
                without_bos,
                lm_graph.sequence_lengths - 1,
                seq_axis=1,
                batch_axis=0,
            )
            flipped_without_eos = flipped[:, 1:, :]
            trimmed_layers.append(
                tf.reverse_sequence(
                    flipped_without_eos,
                    lengths_no_bos_eos,
                    seq_axis=1,
                    batch_axis=0,
                ))

        # Stack the layers along a new axis 1.
        expanded = [tf.expand_dims(t, axis=1) for t in trimmed_layers]
        lm_embeddings = tf.concat(expanded, axis=1)

        # Same trimming for the mask.  tf doesn't support reversing boolean
        # tensors, so cast to int then back.
        int_mask = tf.cast(lm_graph.mask[:, 1:], 'int32')
        int_mask = tf.reverse_sequence(
            int_mask,
            lm_graph.sequence_lengths - 1,
            seq_axis=1,
            batch_axis=0,
        )
        int_mask = int_mask[:, 1:]
        int_mask = tf.reverse_sequence(
            int_mask,
            lengths_no_bos_eos,
            seq_axis=1,
            batch_axis=0,
        )
        bool_mask = tf.cast(int_mask, 'bool')

    return {
        'lm_embeddings': lm_embeddings,
        'lengths': lengths_no_bos_eos,
        'token_embeddings': lm_graph.embedding,
        'mask': bool_mask,
    }
def train_cnn():
    """Train the TextCNN model, evaluating and checkpointing as it goes.

    All configuration is read from ``FLAGS``. The function loads and pads
    the training and validation data, builds the ``TextCNN`` graph with an
    Adam optimizer (exponentially decayed learning rate, gradients clipped
    by global norm) and iterates over the training batches. Every
    ``FLAGS.evaluate_every`` steps the validation set is evaluated and the
    AUPRC is handed to the best-checkpoint saver; every
    ``FLAGS.checkpoint_every`` steps a regular checkpoint is written.

    When ``FLAGS.train_or_restore == 'R'`` the run directory name is read
    interactively and the latest checkpoint in it is restored; otherwise a
    new timestamped run directory is used and variables are initialized.

    The session and both summary writers are closed when the function
    finishes, including when training raises, so buffered summary events
    are flushed to disk.
    """
    # Load sentences, labels, and training parameters
    logger.info("✔︎ Loading data...")

    logger.info("✔︎ Training data processing...")
    train_data = dh.load_data_and_labels(FLAGS.training_data_file, FLAGS.num_classes,
                                         FLAGS.embedding_dim, data_aug_flag=False)

    logger.info("✔︎ Validation data processing...")
    val_data = dh.load_data_and_labels(FLAGS.validation_data_file, FLAGS.num_classes,
                                       FLAGS.embedding_dim, data_aug_flag=False)

    logger.info("Recommended padding Sequence length is: {0}".format(FLAGS.pad_seq_len))

    logger.info("✔︎ Training data padding...")
    x_train, y_train = dh.pad_data(train_data, FLAGS.pad_seq_len)

    logger.info("✔︎ Validation data padding...")
    x_val, y_val = dh.pad_data(val_data, FLAGS.pad_seq_len)

    # Build vocabulary
    VOCAB_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(FLAGS.embedding_dim)

    # Build a graph and cnn object
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        # Entering the session installs it as the default session and
        # guarantees it is closed on exit.
        with tf.Session(config=session_conf) as sess:
            cnn = TextCNN(
                sequence_length=FLAGS.pad_seq_len,
                num_classes=FLAGS.num_classes,
                vocab_size=VOCAB_SIZE,
                fc_hidden_size=FLAGS.fc_hidden_size,
                embedding_size=FLAGS.embedding_dim,
                embedding_type=FLAGS.embedding_type,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(','))),
                num_filters=FLAGS.num_filters,
                l2_reg_lambda=FLAGS.l2_reg_lambda,
                pretrained_embedding=pretrained_word2vec_matrix)

            # Define training procedure; the training ops are built under a
            # control dependency on the UPDATE_OPS collection.
            with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                learning_rate = tf.train.exponential_decay(learning_rate=FLAGS.learning_rate,
                                                           global_step=cnn.global_step,
                                                           decay_steps=FLAGS.decay_steps,
                                                           decay_rate=FLAGS.decay_rate,
                                                           staircase=True)
                optimizer = tf.train.AdamOptimizer(learning_rate)
                grads, variables = zip(*optimizer.compute_gradients(cnn.loss))
                grads, _ = tf.clip_by_global_norm(grads, clip_norm=FLAGS.norm_ratio)
                train_op = optimizer.apply_gradients(zip(grads, variables),
                                                     global_step=cnn.global_step,
                                                     name="train_op")

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in zip(grads, variables):
                if g is not None:
                    grad_hist_summary = tf.summary.histogram("{0}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar("{0}/grad/sparsity".format(v.name),
                                                         tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            if FLAGS.train_or_restore == 'R':
                MODEL = input("☛ Please input the checkpoints model you want to restore, "
                              "it should be like(1490175368): ")  # The model you want to restore

                while not (MODEL.isdigit() and len(MODEL) == 10):
                    MODEL = input("✘ The format of your input is illegal, please re-input: ")
                logger.info("✔︎ The format of your input is legal, now loading to next step...")
                out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", MODEL))
                logger.info("✔︎ Writing to {0}\n".format(out_dir))
            else:
                timestamp = str(int(time.time()))
                out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
                logger.info("✔︎ Writing to {0}\n".format(out_dir))

            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            best_checkpoint_dir = os.path.abspath(os.path.join(out_dir, "bestcheckpoints"))

            # Summaries for loss
            loss_summary = tf.summary.scalar("loss", cnn.loss)

            # Train summaries
            train_summary_op = tf.summary.merge([loss_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Validation summaries
            validation_summary_op = tf.summary.merge([loss_summary])
            validation_summary_dir = os.path.join(out_dir, "summaries", "validation")
            validation_summary_writer = tf.summary.FileWriter(validation_summary_dir, sess.graph)

            # From here on both writers exist; make sure they are flushed
            # and closed however the training run ends.
            try:
                saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)
                best_saver = cm.BestCheckpointSaver(save_dir=best_checkpoint_dir,
                                                    num_to_keep=3, maximize=True)

                if FLAGS.train_or_restore == 'R':
                    # Load cnn model
                    logger.info("✔︎ Loading model...")
                    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
                    logger.info(checkpoint_file)

                    # Load the saved meta graph and restore variables
                    saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file))
                    saver.restore(sess, checkpoint_file)
                else:
                    if not os.path.exists(checkpoint_dir):
                        os.makedirs(checkpoint_dir)
                    sess.run(tf.global_variables_initializer())
                    sess.run(tf.local_variables_initializer())

                    # Embedding visualization config
                    config = projector.ProjectorConfig()
                    embedding_conf = config.embeddings.add()
                    embedding_conf.tensor_name = "embedding"
                    embedding_conf.metadata_path = FLAGS.metadata_file

                    projector.visualize_embeddings(train_summary_writer, config)
                    projector.visualize_embeddings(validation_summary_writer, config)

                    # Save the embedding visualization
                    saver.save(sess, os.path.join(out_dir, "embedding", "embedding.ckpt"))

                current_step = sess.run(cnn.global_step)

                def train_step(x_batch, y_batch):
                    """A single training step"""
                    feed_dict = {
                        cnn.input_x: x_batch,
                        cnn.input_y: y_batch,
                        cnn.dropout_keep_prob: FLAGS.dropout_keep_prob,
                        cnn.is_training: True
                    }
                    _, step, summaries, loss = sess.run(
                        [train_op, cnn.global_step, train_summary_op, cnn.loss], feed_dict)
                    logger.info("step {0}: loss {1:g}".format(step, loss))
                    train_summary_writer.add_summary(summaries, step)

                def validation_step(x_val, y_val, writer=None):
                    """Evaluates model on a validation set"""
                    batches_validation = dh.batch_iter(list(zip(x_val, y_val)), FLAGS.batch_size, 1)

                    # Predict classes by threshold or topk ('ts': threshold; 'tk': topk)
                    eval_counter, eval_loss = 0, 0.0

                    eval_pre_tk = [0.0] * FLAGS.top_num
                    eval_rec_tk = [0.0] * FLAGS.top_num
                    eval_F_tk = [0.0] * FLAGS.top_num

                    true_onehot_labels = []
                    predicted_onehot_scores = []
                    predicted_onehot_labels_ts = []
                    predicted_onehot_labels_tk = [[] for _ in range(FLAGS.top_num)]

                    for batch_validation in batches_validation:
                        x_batch_val, y_batch_val = zip(*batch_validation)
                        feed_dict = {
                            cnn.input_x: x_batch_val,
                            cnn.input_y: y_batch_val,
                            cnn.dropout_keep_prob: 1.0,
                            cnn.is_training: False
                        }
                        step, summaries, scores, cur_loss = sess.run(
                            [cnn.global_step, validation_summary_op, cnn.scores, cnn.loss],
                            feed_dict)

                        # Prepare for calculating metrics
                        for i in y_batch_val:
                            true_onehot_labels.append(i)
                        for j in scores:
                            predicted_onehot_scores.append(j)

                        # Predict by threshold
                        batch_predicted_onehot_labels_ts = \
                            dh.get_onehot_label_threshold(scores=scores, threshold=FLAGS.threshold)

                        for k in batch_predicted_onehot_labels_ts:
                            predicted_onehot_labels_ts.append(k)

                        # Predict by topK
                        for top_num in range(FLAGS.top_num):
                            batch_predicted_onehot_labels_tk = dh.get_onehot_label_topk(
                                scores=scores, top_num=top_num+1)

                            for i in batch_predicted_onehot_labels_tk:
                                predicted_onehot_labels_tk[top_num].append(i)

                        eval_loss = eval_loss + cur_loss
                        eval_counter = eval_counter + 1

                        if writer:
                            writer.add_summary(summaries, step)

                    eval_loss = float(eval_loss / eval_counter)

                    # Calculate Precision & Recall & F1 (threshold & topK)
                    eval_pre_ts = precision_score(y_true=np.array(true_onehot_labels),
                                                  y_pred=np.array(predicted_onehot_labels_ts),
                                                  average='micro')
                    eval_rec_ts = recall_score(y_true=np.array(true_onehot_labels),
                                               y_pred=np.array(predicted_onehot_labels_ts),
                                               average='micro')
                    eval_F_ts = f1_score(y_true=np.array(true_onehot_labels),
                                         y_pred=np.array(predicted_onehot_labels_ts),
                                         average='micro')

                    for top_num in range(FLAGS.top_num):
                        eval_pre_tk[top_num] = precision_score(
                            y_true=np.array(true_onehot_labels),
                            y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                            average='micro')
                        eval_rec_tk[top_num] = recall_score(
                            y_true=np.array(true_onehot_labels),
                            y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                            average='micro')
                        eval_F_tk[top_num] = f1_score(
                            y_true=np.array(true_onehot_labels),
                            y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                            average='micro')

                    # Calculate the average AUC
                    eval_auc = roc_auc_score(y_true=np.array(true_onehot_labels),
                                             y_score=np.array(predicted_onehot_scores),
                                             average='micro')
                    # Calculate the average PR
                    eval_prc = average_precision_score(y_true=np.array(true_onehot_labels),
                                                       y_score=np.array(predicted_onehot_scores),
                                                       average='micro')

                    return eval_loss, eval_auc, eval_prc, eval_rec_ts, eval_pre_ts, eval_F_ts, \
                        eval_rec_tk, eval_pre_tk, eval_F_tk

                # Generate batches
                batches_train = dh.batch_iter(
                    list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)

                num_batches_per_epoch = int((len(x_train) - 1) / FLAGS.batch_size) + 1

                # Training loop. For each batch...
                for batch_train in batches_train:
                    x_batch_train, y_batch_train = zip(*batch_train)
                    train_step(x_batch_train, y_batch_train)
                    current_step = tf.train.global_step(sess, cnn.global_step)

                    if current_step % FLAGS.evaluate_every == 0:
                        logger.info("\nEvaluation:")
                        eval_loss, eval_auc, eval_prc, \
                            eval_rec_ts, eval_pre_ts, eval_F_ts, eval_rec_tk, eval_pre_tk, eval_F_tk = \
                            validation_step(x_val, y_val, writer=validation_summary_writer)

                        logger.info("All Validation set: Loss {0:g} | AUC {1:g} | AUPRC {2:g}"
                                    .format(eval_loss, eval_auc, eval_prc))

                        # Predict by threshold
                        logger.info("☛ Predict by threshold: Precision {0:g}, Recall {1:g}, F {2:g}"
                                    .format(eval_pre_ts, eval_rec_ts, eval_F_ts))

                        # Predict by topK
                        logger.info("☛ Predict by topK:")
                        for top_num in range(FLAGS.top_num):
                            logger.info("Top{0}: Precision {1:g}, Recall {2:g}, F {3:g}"
                                        .format(top_num+1, eval_pre_tk[top_num],
                                                eval_rec_tk[top_num], eval_F_tk[top_num]))
                        best_saver.handle(eval_prc, sess, current_step)
                    if current_step % FLAGS.checkpoint_every == 0:
                        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        logger.info("✔︎ Saved model checkpoint to {0}\n".format(path))
                    if current_step % num_batches_per_epoch == 0:
                        current_epoch = current_step // num_batches_per_epoch
                        logger.info("✔︎ Epoch {0} has finished!".format(current_epoch))
            finally:
                # close() flushes pending events before releasing the files.
                train_summary_writer.close()
                validation_summary_writer.close()

    logger.info("✔︎ Done.")
def train_optimizer(logdir,
                    optimizer_spec,
                    problems_and_data,
                    num_problems,
                    num_meta_iterations,
                    num_unroll_func,
                    num_partial_unroll_itrs_func,
                    learning_rate=1e-4,
                    gradient_clip=5.,
                    is_chief=False,
                    select_random_problems=True,
                    callbacks=None,
                    obj_train_max_multiplier=-1,
                    out=sys.stdout):
    """Trains the meta-parameters of this optimizer.

    Args:
      logdir: a directory filepath for storing model checkpoints (must exist)
      optimizer_spec: specification for an Optimizer (see utils.Spec)
      problems_and_data: a list of tuples containing three elements: a problem
        specification (see utils.Spec), a dataset (see datasets.Dataset), and
        a batch_size (int) for generating a problem and corresponding dataset.
        If the problem doesn't have data, set dataset to None. Note that the
        list is shuffled in place when select_random_problems is False.
      num_problems: the number of problems to sample during meta-training
      num_meta_iterations: the number of iterations (steps) to run the
        meta-optimizer for on each subproblem.
      num_unroll_func: called once per meta iteration and returns the number
        of unrolls to do for that meta iteration.
      num_partial_unroll_itrs_func: called once per unroll and returns the
        number of iterations to do for that unroll.
      learning_rate: learning rate of the RMSProp meta-optimizer
        (Default: 1e-4)
      gradient_clip: value to clip gradients at (Default: 5.0)
      is_chief: whether this is the chief task (Default: False)
      select_random_problems: whether to select training problems randomly
        (Default: True)
      callbacks: a list of callback functions that is run after every random
        problem draw
      obj_train_max_multiplier: the maximum increase in the objective value
        over a single training run. Ignored if < 0.
      out: where to write output to, e.g. a file handle (Default: sys.stdout)

    Returns:
      None.

    Raises:
      ValueError: If one of the subproblems has a negative objective value.
    """
    if select_random_problems:
        # iterate over random draws of problem / dataset pairs
        sampler = (random.choice(problems_and_data) for _ in range(num_problems))
    else:
        # iterate over a random shuffle of problems, looping if necessary.
        # Floor division keeps num_repeats an int on Python 3 as well; a
        # float here would make the list repetition below raise TypeError.
        num_repeats = (num_problems // len(problems_and_data)) + 1
        random.shuffle(problems_and_data)
        sampler = (problems_and_data * num_repeats)[:num_problems]

    for problem_itr, (problem_spec, dataset, batch_size) in enumerate(sampler):

        # timer used to time how long it takes to initialize a problem
        problem_start_time = time.time()

        # if dataset is None, use the EMPTY_DATASET
        if dataset is None:
            dataset = datasets.EMPTY_DATASET
            batch_size = dataset.size

        # build a new graph for this problem
        graph = tf.Graph()
        real_device_setter = tf.train.replica_device_setter(FLAGS.ps_tasks)

        def custom_device_setter(op):
            # Places the local variables onto the workers.
            if trainable_optimizer.is_local_state_variable(op):
                return "/job:worker"
            else:
                return real_device_setter(op)

        if real_device_setter:
            device_setter = custom_device_setter
        else:
            device_setter = None

        with graph.as_default(), graph.device(device_setter):

            # initialize a problem
            problem = problem_spec.build()

            # build the optimizer
            opt = optimizer_spec.build()

            # get the meta-objective for training the optimizer
            train_output = opt.train(problem, dataset)

            state_keys = opt.state_keys
            for key, val in zip(state_keys, train_output.output_state[0]):
                finite_val = utils.make_finite(val, replacement=tf.zeros_like(val))
                tf.summary.histogram("State/{}".format(key), finite_val,
                                     collections=[OPT_SUM_COLLECTION])

            tf.summary.scalar("MetaObjective", train_output.metaobj,
                              collections=[OPT_SUM_COLLECTION])

            # Per-problem meta-objective
            tf.summary.scalar(problem_spec.callable.__name__ + "_MetaObjective",
                              train_output.metaobj,
                              collections=[OPT_SUM_COLLECTION])

            # create the meta-train_op
            global_step = tf.Variable(0, name="global_step", trainable=False)
            meta_parameters = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope=OPTIMIZER_SCOPE)
            # parameter regularization
            reg_l2 = FLAGS.l2_reg * sum([tf.reduce_sum(param ** 2)
                                         for param in meta_parameters])

            # compute the meta-gradients
            meta_opt = tf.train.RMSPropOptimizer(learning_rate,
                                                 decay=FLAGS.rms_decay,
                                                 use_locking=True,
                                                 epsilon=FLAGS.rms_epsilon)
            grads_and_vars = meta_opt.compute_gradients(
                train_output.metaobj + reg_l2, meta_parameters)

            # clip the gradients
            clipped_grads_and_vars = []
            for grad, var in grads_and_vars:
                clipped_grad = tf.clip_by_value(
                    utils.make_finite(grad, replacement=tf.zeros_like(var)),
                    -gradient_clip, gradient_clip)
                clipped_grads_and_vars.append((clipped_grad, var))

            # histogram summary of grads and vars
            for grad, var in grads_and_vars:
                tf.summary.histogram(
                    var.name + "_rawgrad",
                    utils.make_finite(
                        grad, replacement=tf.zeros_like(grad)),
                    collections=[OPT_SUM_COLLECTION])
            for grad, var in clipped_grads_and_vars:
                tf.summary.histogram(var.name + "_var", var,
                                     collections=[OPT_SUM_COLLECTION])
                tf.summary.histogram(var.name + "_grad", grad,
                                     collections=[OPT_SUM_COLLECTION])

            # builds the train and summary operations
            train_op = meta_opt.apply_gradients(clipped_grads_and_vars,
                                                global_step=global_step)

            # only grab summaries defined for LOL, not inside the problem
            summary_op = tf.summary.merge_all(key=OPT_SUM_COLLECTION)

            # make sure the state gets propagated after the gradients and
            # summaries were computed.
            with tf.control_dependencies([train_op, summary_op]):
                propagate_loop_state_ops = []
                for dest, src in zip(
                        train_output.init_loop_vars, train_output.output_loop_vars):
                    propagate_loop_state_ops.append(dest.assign(src))
                propagate_loop_state_op = tf.group(*propagate_loop_state_ops)

            # create the supervisor
            sv = tf.train.Supervisor(
                graph=graph,
                is_chief=is_chief,
                logdir=logdir,
                summary_op=None,
                save_model_secs=0,  # we save checkpoints manually
                global_step=global_step,
            )

            with sv.managed_session() as sess:

                init_time = time.time() - problem_start_time
                out.write("--------- Problem #{} ---------\n".format(problem_itr))
                out.write("{callable.__name__}{args}{kwargs}\n".format(
                    **problem_spec.__dict__))
                out.write("Took {} seconds to initialize.\n".format(init_time))
                out.flush()

                # For profiling summaries
                if FLAGS.set_profiling:
                    summary_writer = tf.summary.FileWriter(logdir, graph=sess.graph)

                # used to store information during training
                metadata = defaultdict(list)

                for k in range(num_meta_iterations):

                    if sv.should_stop():
                        break

                    problem.init_fn(sess)

                    # set run options (for profiling)
                    full_trace_opt = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_options = full_trace_opt if FLAGS.set_profiling else None
                    run_metadata = tf.RunMetadata() if FLAGS.set_profiling else None

                    num_unrolls = num_unroll_func()
                    # `range` instead of `xrange` so this also runs on Python 3.
                    partial_unroll_iters = [
                        num_partial_unroll_itrs_func() for _ in range(num_unrolls)
                    ]
                    total_num_iter = sum(partial_unroll_iters)

                    objective_weights = [np.ones(num) / float(num)
                                         for num in partial_unroll_iters]
                    db = dataset.batch_indices(total_num_iter, batch_size)
                    # split the batch indices into one slice per unroll
                    dataset_batches = []
                    last_index = 0
                    for num in partial_unroll_iters:
                        dataset_batches.append(db[last_index:last_index + num])
                        last_index += num

                    train_start_time = time.time()

                    unroll_itr = 0
                    additional_log_info = ""

                    for unroll_itr in range(num_unrolls):
                        first_unroll = unroll_itr == 0
                        if FLAGS.reset_rnn_params:
                            reset_state = first_unroll and k == 0
                        else:
                            reset_state = first_unroll

                        feed = {
                            train_output.obj_weights: objective_weights[unroll_itr],
                            train_output.batches: dataset_batches[unroll_itr],
                            train_output.first_unroll: first_unroll,
                            train_output.reset_state: reset_state,
                        }

                        # run the train and summary ops
                        # when a "save_diagnostics" flag is turned on
                        fetches_list = [
                            train_output.metaobj,
                            train_output.problem_objectives,
                            train_output.initial_obj,
                            summary_op,
                            clipped_grads_and_vars,
                            train_op
                        ]
                        # the loop state is propagated on every unroll but the last
                        if unroll_itr + 1 < num_unrolls:
                            fetches_list += [propagate_loop_state_op]

                        fetched = sess.run(fetches_list, feed_dict=feed,
                                           options=run_options,
                                           run_metadata=run_metadata)
                        meta_obj = fetched[0]
                        sub_obj = fetched[1]
                        init_obj = fetched[2]
                        summ = fetched[3]
                        meta_grads_and_params = fetched[4]

                        # assert that the subproblem objectives are non-negative
                        # (this is so that we can rescale the objective by the
                        # initial value and not worry about rescaling by a
                        # negative value)
                        if np.any(sub_obj < 0):
                            raise ValueError(
                                "Training problem objectives must be nonnegative.")
                        # If the objective has increased more than we want, exit
                        # this training run and start over on another meta
                        # iteration.
                        if obj_train_max_multiplier > 0 and (
                                sub_obj[-1] > (init_obj +
                                               abs(init_obj) * (obj_train_max_multiplier - 1))):
                            msg = " Broke early at {} out of {} unrolls. ".format(
                                unroll_itr + 1, num_unrolls)
                            additional_log_info += msg
                            break

                        # only the chief task is allowed to write the summary
                        if is_chief:
                            sv.summary_computed(sess, summ)

                        metadata["subproblem_objs"].append(sub_obj)
                        # store training metadata to pass to the callback
                        metadata["meta_objs"].append(meta_obj)
                        metadata["meta_grads_and_params"].append(meta_grads_and_params)

                    optimization_time = time.time() - train_start_time

                    if FLAGS.set_profiling:
                        summary_name = "%02d_iter%04d_%02d" % (FLAGS.task, problem_itr, k)
                        summary_writer.add_run_metadata(run_metadata, summary_name)

                    metadata["global_step"].append(sess.run(global_step))
                    metadata["runtimes"].append(optimization_time)

                    # write a diagnostic message to the output
                    args = (k, meta_obj, optimization_time,
                            sum(partial_unroll_iters[:unroll_itr + 1]))
                    out.write(" [{:02}] {}, {} seconds, {} iters ".format(*args))
                    out.write("(unrolled {} steps)".format(
                        ", ".join([str(s) for s in partial_unroll_iters[:unroll_itr + 1]])))
                    out.write("{}\n".format(additional_log_info))
                    out.flush()

                if FLAGS.set_profiling:
                    summary_writer.close()

                # force a checkpoint save before we load a new problem
                # only the chief task has the save_path and can write the checkpoint
                if is_chief:
                    sv.saver.save(sess, sv.save_path, global_step=global_step)

        # run the callbacks on the chief
        if is_chief and callbacks is not None:
            for callback in callbacks:
                if hasattr(callback, "__call__"):
                    problem_name = problem_spec.callable.__name__
                    callback(problem_name, problem_itr, logdir, metadata)
def _build_lstms(self):
    """Create the stacked forward and backward LSTMs and their state ops.

    Reads the 'lstm' section of ``self.options`` plus ``self.embedding``,
    ``self.ids_placeholder``, ``self.use_character_inputs`` and
    ``self._max_batch_size``. Sets ``self.lstm_outputs``,
    ``self.lstm_state_sizes``, ``self.lstm_init_states`` and
    ``self.lstm_final_states`` (dicts keyed by 'forward'/'backward', one
    entry per layer), as well as ``self.mask``, ``self.sequence_lengths``
    and ``self.update_state_op``.
    """
    # parse the options
    lstm_options = self.options['lstm']
    lstm_dim = lstm_options['dim']
    projection_dim = lstm_options['projection_dim']
    n_lstm_layers = lstm_options.get('n_layers', 1)
    cell_clip = lstm_options.get('cell_clip')
    proj_clip = lstm_options.get('proj_clip')
    use_skip_connections = lstm_options['use_skip_connections']
    print("USING SKIP CONNECTIONS" if use_skip_connections
          else "NOT USING SKIP CONNECTIONS")

    # the sequence lengths from the input mask (ids > 0 count as present)
    if self.use_character_inputs:
        mask = tf.reduce_any(self.ids_placeholder > 0, axis=2)
    else:
        mask = self.ids_placeholder > 0
    sequence_lengths = tf.reduce_sum(tf.cast(mask, tf.int32), axis=1)
    batch_size = tf.shape(sequence_lengths)[0]

    # per direction, one entry per layer
    self.lstm_outputs = {'forward': [], 'backward': []}
    self.lstm_state_sizes = {'forward': [], 'backward': []}
    self.lstm_init_states = {'forward': [], 'backward': []}
    self.lstm_final_states = {'forward': [], 'backward': []}

    state_update_ops = []
    for direction_index, direction in enumerate(['forward', 'backward']):
        is_forward = direction == 'forward'
        if is_forward:
            layer_input = self.embedding
        else:
            # the backward LSTM consumes the sequence in reversed order
            layer_input = tf.reverse_sequence(
                self.embedding, sequence_lengths, seq_axis=1, batch_axis=0)

        for layer_index in range(n_lstm_layers):
            cell_kwargs = {'cell_clip': cell_clip, 'proj_clip': proj_clip}
            if projection_dim < lstm_dim:
                # are projecting down output
                cell_kwargs['num_proj'] = projection_dim
            lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim, **cell_kwargs)

            # ResidualWrapper adds inputs to outputs; no skip connection
            # from the token embedding to the 1st layer output.
            if use_skip_connections and layer_index != 0:
                lstm_cell = tf.nn.rnn_cell.ResidualWrapper(lstm_cell)

            # the LSTMs are stateful. To support multiple batch sizes,
            # allocate states for up to max_batch_size rows and use the
            # first batch_size rows for each batch.
            state_size = lstm_cell.state_size
            init_states = [
                tf.Variable(tf.zeros([self._max_batch_size, dim]),
                            trainable=False)
                for dim in state_size
            ]
            batch_init_states = [
                stored[:batch_size, :] for stored in init_states
            ]

            scope_name = 'RNN_{0}/RNN/MultiRNNCell/Cell{1}'.format(
                direction_index, layer_index)
            with tf.variable_scope(scope_name):
                layer_output, final_state = tf.nn.dynamic_rnn(
                    lstm_cell,
                    layer_input,
                    sequence_length=sequence_lengths,
                    initial_state=tf.nn.rnn_cell.LSTMStateTuple(
                        *batch_init_states),
                )

            self.lstm_state_sizes[direction].append(state_size)
            self.lstm_init_states[direction].append(init_states)
            self.lstm_final_states[direction].append(final_state)
            if is_forward:
                self.lstm_outputs[direction].append(layer_output)
            else:
                # store backward outputs flipped back to input order
                self.lstm_outputs[direction].append(
                    tf.reverse_sequence(layer_output, sequence_lengths,
                                        seq_axis=1, batch_axis=0))

            with tf.control_dependencies([layer_output]):
                # update the initial states: rows used by this batch take
                # the final state, the remaining rows keep the old value
                for part in range(2):
                    refreshed = tf.concat([
                        final_state[part][:batch_size, :],
                        init_states[part][batch_size:, :]
                    ], axis=0)
                    state_update_ops.append(
                        tf.assign(init_states[part], refreshed))

            layer_input = layer_output

    self.mask = mask
    self.sequence_lengths = sequence_lengths
    self.update_state_op = tf.group(*state_update_ops)
def mean_var_with_update():
    """Return identity copies of the batch mean and variance.

    Both identity ops are created under a control dependency on
    ``ema_apply_op``, so fetching either result runs that op first.
    """
    with tf.control_dependencies([ema_apply_op]):
        mean_after_update = tf.identity(batch_mean)
        var_after_update = tf.identity(batch_var)
    return mean_after_update, var_after_update
def _model_fn(features, labels, mode, params, variable_filter_fn=None):
    """Model definition for the Mask-RCNN model based on ResNet.

    Args:
      features: the input image tensor and auxiliary information, such as
        `image_info` and `source_ids`. The image tensor has a shape of
        [batch_size, height, width, 3]. The height and width are fixed and
        equal.
      labels: the input labels in a dictionary. The labels include score
        targets and box targets which are dense label maps. The labels are
        generated from get_input_fn function in data/dataloader.py
      mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
      params: the dictionary defines hyperparameters of model. The default
        settings are in default_hparams function in this file.
      variable_filter_fn: the filter function that takes trainable_variables
        and returns the variable list after applying the filter rule.

    Returns:
      tpu_spec: the TPUEstimatorSpec to run training, evaluation, or
        prediction. A plain `tf.estimator.EstimatorSpec` is returned instead
        when `params['use_tpu']` is false.
    """
    if mode == tf.estimator.ModeKeys.PREDICT:
        if params['include_groundtruth_in_features'] and ('labels' in features):
            # Include groundtruth for eval.
            labels = features['labels']
        else:
            labels = None
        if 'features' in features:
            features = features['features']
            # Otherwise, it is in export mode, the features is passed in
            # directly.

    if params['use_bfloat16']:
        with tf.contrib.tpu.bfloat16_scope():
            model_outputs = build_model_graph(
                features, labels, mode == tf.estimator.ModeKeys.TRAIN, params)
            model_outputs.update({
                'source_id': features['source_ids'],
                'image_info': features['image_info'],
            })

            def cast_outputs_to_float(d):
                # Recursively cast every leaf of the (nested) dict to float32.
                for k, v in sorted(six.iteritems(d)):
                    if isinstance(v, dict):
                        cast_outputs_to_float(v)
                    else:
                        d[k] = tf.cast(v, tf.float32)

            cast_outputs_to_float(model_outputs)
    else:
        model_outputs = build_model_graph(
            features, labels, mode == tf.estimator.ModeKeys.TRAIN, params)
        model_outputs.update({
            'source_id': features['source_ids'],
            'image_info': features['image_info'],
        })

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {}
        if labels and params['include_groundtruth_in_features']:
            # Labels can only be embedded in predictions. The prediction
            # cannot output dictionary as a value.
            predictions.update(labels)
        model_outputs.pop('fpn_features', None)
        predictions.update(model_outputs)

        if params['use_tpu']:
            return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                                   predictions=predictions)
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Set up training loss and learning rate.
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rates.step_learning_rate_with_linear_warmup(
        global_step,
        params['init_learning_rate'],
        params['warmup_learning_rate'],
        params['warmup_steps'],
        params['learning_rate_levels'],
        params['learning_rate_steps'])
    # score_loss and box_loss are for logging. only total_loss is optimized.
    total_rpn_loss, rpn_score_loss, rpn_box_loss = losses.rpn_loss(
        model_outputs['rpn_score_outputs'], model_outputs['rpn_box_outputs'],
        labels, params)

    (total_fast_rcnn_loss, fast_rcnn_class_loss,
     fast_rcnn_box_loss) = losses.fast_rcnn_loss(
         model_outputs['class_outputs'], model_outputs['box_outputs'],
         model_outputs['class_targets'], model_outputs['box_targets'], params)
    # Only training has the mask loss. Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/model_builder.py  # pylint: disable=line-too-long
    if mode == tf.estimator.ModeKeys.TRAIN and params['include_mask']:
        mask_loss = losses.mask_rcnn_loss(
            model_outputs['mask_outputs'], model_outputs['mask_targets'],
            model_outputs['selected_class_targets'], params)
    else:
        mask_loss = 0.
    if variable_filter_fn and ('resnet' in params['backbone']):
        var_list = variable_filter_fn(tf.trainable_variables(),
                                      params['backbone'] + '/')
    else:
        var_list = tf.trainable_variables()
    # L2 weight decay over the selected variables, skipping batch-norm
    # parameters and biases (matched by variable name).
    l2_regularization_loss = params['l2_weight_decay'] * tf.add_n([
        tf.nn.l2_loss(v)
        for v in var_list
        if 'batch_normalization' not in v.name and 'bias' not in v.name
    ])
    total_loss = (total_rpn_loss + total_fast_rcnn_loss + mask_loss +
                  l2_regularization_loss)

    host_call = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = create_optimizer(learning_rate, params)
        if params['use_tpu']:
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

        scaffold_fn = None
        if params['warm_start_path']:

            def warm_start_scaffold_fn():
                # Initializes variables from the warm-start checkpoint.
                tf.logging.info('model_fn warm start from: %s',
                                params['warm_start_path'])
                assignment_map = _build_assigment_map(
                    optimizer,
                    prefix=None,
                    skip_variables_regex=params['skip_checkpoint_variables'])
                tf.train.init_from_checkpoint(params['warm_start_path'],
                                              assignment_map)
                return tf.train.Scaffold()

            scaffold_fn = warm_start_scaffold_fn

        elif params['checkpoint']:

            def backbone_scaffold_fn():
                """Loads pretrained model through scaffold function."""
                # Exclude all variable of optimizer.
                vars_to_load = _build_assigment_map(
                    optimizer,
                    prefix=params['backbone'] + '/',
                    skip_variables_regex=params['skip_checkpoint_variables'])
                tf.train.init_from_checkpoint(params['checkpoint'], vars_to_load)
                if not vars_to_load:
                    raise ValueError('Variables to load is empty.')
                return tf.train.Scaffold()

            scaffold_fn = backbone_scaffold_fn

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
        if params['global_gradient_clip_ratio'] > 0:
            # Clips the gradients for training stability.
            # Refer: https://arxiv.org/abs/1211.5063
            with tf.name_scope('clipping'):
                old_grads, variables = zip(*grads_and_vars)
                num_weights = sum(
                    g.shape.num_elements() for g in old_grads if g is not None)
                clip_norm = params['global_gradient_clip_ratio'] * math.sqrt(
                    num_weights)
                tf.logging.info(
                    'Global clip norm set to %g for %d variables with %d elements.' %
                    (clip_norm, sum(1 for g in old_grads if g is not None),
                     num_weights))
                gradients, _ = tf.clip_by_global_norm(old_grads, clip_norm)
        else:
            gradients, variables = zip(*grads_and_vars)
        grads_and_vars = []
        # Special treatment for biases (beta is named as bias in reference model)
        # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/optimizer.py#L113  # pylint: disable=line-too-long
        for grad, var in zip(gradients, variables):
            if grad is not None and ('beta' in var.name or 'bias' in var.name):
                grad = 2.0 * grad
            grads_and_vars.append((grad, var))

        # tf.control_dependencies only affects ops constructed inside the
        # context, so the apply-gradients op itself must be built here for
        # the update ops to become dependencies of train_op.
        with tf.control_dependencies(update_ops):
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

        if params['use_host_call']:

            def host_call_fn(global_step, total_loss, total_rpn_loss,
                             rpn_score_loss, rpn_box_loss, total_fast_rcnn_loss,
                             fast_rcnn_class_loss, fast_rcnn_box_loss, mask_loss,
                             learning_rate):
                """Training host call. Creates scalar summaries for training metrics.

                This function is executed on the CPU and should not directly
                reference any Tensors in the rest of the `model_fn`. To pass
                Tensors from the model to the `metric_fn`, provide as part of
                the `host_call`. See
                https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
                for more information.

                Arguments should match the list of `Tensor` objects passed as
                the second element in the tuple passed to `host_call`.

                Args:
                  global_step: `Tensor` with shape `[batch, ]` for the
                    global_step.
                  total_loss: `Tensor` with shape `[batch, ]` for the training
                    loss.
                  total_rpn_loss: `Tensor` with shape `[batch, ]` for the
                    training RPN loss.
                  rpn_score_loss: `Tensor` with shape `[batch, ]` for the
                    training RPN score loss.
                  rpn_box_loss: `Tensor` with shape `[batch, ]` for the
                    training RPN box loss.
                  total_fast_rcnn_loss: `Tensor` with shape `[batch, ]` for
                    the training Mask-RCNN loss.
                  fast_rcnn_class_loss: `Tensor` with shape `[batch, ]` for
                    the training Mask-RCNN class loss.
                  fast_rcnn_box_loss: `Tensor` with shape `[batch, ]` for the
                    training Mask-RCNN box loss.
                  mask_loss: `Tensor` with shape `[batch, ]` for the training
                    Mask-RCNN mask loss.
                  learning_rate: `Tensor` with shape `[batch, ]` for the
                    learning_rate.

                Returns:
                  List of summary ops to run on the CPU host.
                """
                # Outfeed supports int32 but global_step is expected to be int64.
                global_step = tf.reduce_mean(global_step)
                # Host call fns are executed FLAGS.iterations_per_loop times
                # after one TPU loop is finished, setting max_queue value to
                # the same as number of iterations will make the summary
                # writer only flush the data to storage once per loop.
                with (tf.contrib.summary.create_file_writer(
                        params['model_dir'],
                        max_queue=params['iterations_per_loop']).as_default()):
                    with tf.contrib.summary.always_record_summaries():
                        tf.contrib.summary.scalar(
                            'total_loss', tf.reduce_mean(total_loss),
                            step=global_step)
                        tf.contrib.summary.scalar(
                            'total_rpn_loss', tf.reduce_mean(total_rpn_loss),
                            step=global_step)
                        tf.contrib.summary.scalar(
                            'rpn_score_loss', tf.reduce_mean(rpn_score_loss),
                            step=global_step)
                        tf.contrib.summary.scalar(
                            'rpn_box_loss', tf.reduce_mean(rpn_box_loss),
                            step=global_step)
                        tf.contrib.summary.scalar(
                            'total_fast_rcnn_loss',
                            tf.reduce_mean(total_fast_rcnn_loss),
                            step=global_step)
                        tf.contrib.summary.scalar(
                            'fast_rcnn_class_loss',
                            tf.reduce_mean(fast_rcnn_class_loss),
                            step=global_step)
                        tf.contrib.summary.scalar(
                            'fast_rcnn_box_loss',
                            tf.reduce_mean(fast_rcnn_box_loss),
                            step=global_step)
                        if params['include_mask']:
                            tf.contrib.summary.scalar(
                                'mask_loss', tf.reduce_mean(mask_loss),
                                step=global_step)
                        tf.contrib.summary.scalar(
                            'learning_rate', tf.reduce_mean(learning_rate),
                            step=global_step)

                        return tf.contrib.summary.all_summary_ops()

            # To log the loss, current learning rate, and epoch for
            # Tensorboard, the summary op needs to be run on the host CPU via
            # host_call. host_call expects [batch_size, ...] Tensors, thus
            # reshape to introduce a batch dimension. These Tensors are
            # implicitly concatenated to [params['batch_size']].
            global_step_t = tf.reshape(global_step, [1])
            total_loss_t = tf.reshape(total_loss, [1])
            total_rpn_loss_t = tf.reshape(total_rpn_loss, [1])
            rpn_score_loss_t = tf.reshape(rpn_score_loss, [1])
            rpn_box_loss_t = tf.reshape(rpn_box_loss, [1])
            total_fast_rcnn_loss_t = tf.reshape(total_fast_rcnn_loss, [1])
            fast_rcnn_class_loss_t = tf.reshape(fast_rcnn_class_loss, [1])
            fast_rcnn_box_loss_t = tf.reshape(fast_rcnn_box_loss, [1])
            mask_loss_t = tf.reshape(mask_loss, [1])
            learning_rate_t = tf.reshape(learning_rate, [1])
            host_call = (host_call_fn, [
                global_step_t, total_loss_t, total_rpn_loss_t, rpn_score_loss_t,
                rpn_box_loss_t, total_fast_rcnn_loss_t, fast_rcnn_class_loss_t,
                fast_rcnn_box_loss_t, mask_loss_t, learning_rate_t
            ])
    else:
        train_op = None
        scaffold_fn = None

    if params['use_tpu']:
        return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                               loss=total_loss,
                                               train_op=train_op,
                                               host_call=host_call,
                                               scaffold_fn=scaffold_fn)
    return tf.estimator.EstimatorSpec(mode=mode, loss=total_loss,
                                      train_op=train_op)
def train():
    """Train the Inception model on the images listed in train_set_list.pickle.

    Loads the pickled training list, builds the training graph (RMSProp with
    an exponentially decayed learning rate, moving averages of the losses and
    of the variables, summaries), restores weights from a checkpoint and then
    runs FLAGS.max_steps training steps, feeding one minibatch per step
    through placeholders.  Progress is printed at step 1 and every 10 steps,
    summaries are written at step 1 and every 100 steps, and a checkpoint is
    saved every 1000 steps.

    Raises:
        EnvironmentError: if the training list cannot be opened or unpickled.
    """
    # Load the training set list and turn it into a queue.
    # NOTE(review): the file is opened in text mode; this must match the mode
    # generate_data_list.py used when writing it -- confirm before changing.
    try:
        with open('train_set_list.pickle', 'r') as f:
            train_set_list = pickle.load(f)
    except Exception:
        # Deliberately not a bare `except:` so that KeyboardInterrupt and
        # SystemExit are not misreported as a missing data list.
        raise EnvironmentError(
            'Data list not existed. Please run generate_data_list.py first.')
    random.shuffle(train_set_list)
    train_set_queue = deque(train_set_list)
    train_set_size = len(train_set_list)
    del train_set_list
    print('Training set built. Size: ' + str(train_set_size))

    # Build the TensorFlow graph.
    with tf.Graph().as_default():
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        num_batches_per_epoch = train_set_size / BATCH_SIZE
        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)
        tf.summary.scalar('learning_rate', lr)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.RMSPropOptimizer(lr,
                                        RMSPROP_DECAY,
                                        momentum=RMSPROP_MOMENTUM,
                                        epsilon=RMSPROP_EPSILON)

        images = tf.placeholder(
            tf.float32, shape=[BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, 3])
        labels = tf.placeholder(tf.int32, shape=[BATCH_SIZE])

        logits = inception.inference(images,
                                     NUM_CLASSES,
                                     for_training=True,
                                     restore_logits=FLAGS.fine_tune,
                                     scope=None)
        inception.loss(logits, labels, batch_size=BATCH_SIZE)

        # Assemble all of the losses for the current tower only.
        losses = tf.get_collection(slim.losses.LOSSES_COLLECTION, scope=None)

        # Calculate the total loss for the current tower.
        regularization_losses = tf.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        total_loss = tf.add_n(losses + regularization_losses,
                              name='total_loss')

        # Compute the moving average of all individual losses and the total
        # loss.
        loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
        loss_averages_op = loss_averages.apply(losses + [total_loss])

        # Attach a scalar summary to each loss and to its averaged version.
        for l in losses + [total_loss]:
            # Name each loss as '(raw)' and name the moving average version
            # of the loss as the original loss name.
            tf.summary.scalar(l.op.name + ' (raw)', l)
            tf.summary.scalar(l.op.name, loss_averages.average(l))

        # Make sure the loss averages are updated whenever the loss is used.
        with tf.control_dependencies([loss_averages_op]):
            total_loss = tf.identity(total_loss)

        batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION,
                                              scope=None)

        # Calculate the gradients for the batch of data on this ImageNet
        # tower.
        grads = opt.compute_gradients(total_loss)

        # Apply gradients.
        apply_gradient_op = opt.apply_gradients(grads,
                                                global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                tf.summary.histogram(var.op.name + '/gradients', grad)

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            inception.MOVING_AVERAGE_DECAY, global_step)
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # Group all updates into a single train op.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(apply_gradient_op, variables_averages_op,
                            batchnorm_updates_op)

        # Create a saver.  tf.global_variables() is the non-deprecated
        # spelling of tf.all_variables().
        saver = tf.train.Saver(tf.global_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge_all()

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Open the session and initialize all variables.
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
        sess.run(init)

        # Restore weights: either the latest checkpoint found in
        # FLAGS.ckpt_restore_dir, or the pre-trained model checkpoint.
        if FLAGS.fine_tune:
            checkpoint = tf.train.get_checkpoint_state(FLAGS.ckpt_restore_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                saver.restore(sess, checkpoint.model_checkpoint_path)
                print("Successfully loaded:",
                      checkpoint.model_checkpoint_path)
            else:
                print("Could not find old network weights")
        else:
            variables_to_restore = tf.get_collection(
                slim.variables.VARIABLES_TO_RESTORE)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

        summary_writer = tf.summary.FileWriter(
            FLAGS.ckpt_save_dir,
            graph_def=sess.graph.as_graph_def(add_shapes=True))

        step = 1
        while step <= FLAGS.max_steps:
            start_time = time.time()

            # Construct the image batch and label batch for one training
            # step.  Elements are popped from the right and pushed back on
            # the left, so the queue rotates and is never exhausted.
            minibatch = []
            for _ in xrange(0, BATCH_SIZE):
                element = train_set_queue.pop()
                minibatch.append(element)
                train_set_queue.appendleft(element)

            image_list = [load_image(d[0]) for d in minibatch]
            label_list = [d[1] for d in minibatch]

            image_batch = np.array(image_list)
            label_batch = np.array(label_list)

            image_batch = np.reshape(
                image_batch, [BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, 3])
            label_batch = np.reshape(label_batch, [BATCH_SIZE])

            _, loss_value = sess.run([train_op, total_loss],
                                     feed_dict={
                                         images: image_batch,
                                         labels: label_batch
                                     })

            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step == 1 or step % 10 == 0:
                num_examples_per_step = BATCH_SIZE
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            # Shuffle the image list once per epoch.
            if step % num_batches_per_epoch == 0:
                random.shuffle(train_set_queue)

            # Write summaries periodically.
            if step == 1 or step % 100 == 0:
                summary_str = sess.run(summary_op,
                                       feed_dict={
                                           images: image_batch,
                                           labels: label_batch
                                       })
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0:
                checkpoint_path = os.path.join(FLAGS.ckpt_save_dir,
                                               'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)

            step += 1
def main(_):
    """Train, validate and test the speech-command model configured by FLAGS.

    Builds the model and its training/evaluation ops in an interactive
    session, resumes from a checkpoint when one is available, runs the
    training loop with a per-phase learning rate, periodically evaluates on
    the validation set and saves checkpoints, and finally reports accuracy
    and a confusion matrix on the test set.

    Raises:
        Exception: if --how_many_training_steps and --learning_rate do not
            contain the same number of comma-separated entries.
    """
    # We want to see all the logging messages for this tutorial.
    tf.logging.set_verbosity(tf.logging.INFO)

    # Start a new TensorFlow session.
    sess = tf.InteractiveSession()

    # --optimizer is either a plain optimizer name, or (when it contains a
    # ':' past the first character) a spec handed to _maybe_load_yaml.
    optimizer_name = FLAGS.optimizer
    optimizer_params = {}
    clip_gradients = 0.0
    if FLAGS.optimizer.find(":") > 0:
        optimizer_spec = _maybe_load_yaml(FLAGS.optimizer)
        optimizer_name = optimizer_spec["name"]
        optimizer_params = optimizer_spec.get("params", optimizer_params)
        clip_gradients = optimizer_spec.get("clip_gradients", clip_gradients)
    tf.logging.info("optimizer_name = {} optimizer_params = {}".format(
        optimizer_name, optimizer_params))

    # Begin by making sure we have the training data we need. If you already
    # have training data of your own, use `--data_url= ` on the command line
    # to avoid downloading.
    wanted_words = input_data.prepare_words_list(
        FLAGS.wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(wanted_words), FLAGS.sample_rate, FLAGS.clip_duration_ms,
        FLAGS.window_size_ms, FLAGS.window_stride_ms,
        FLAGS.dct_coefficient_count, FLAGS.feature_type)
    fingerprint_size = model_settings['fingerprint_size']
    label_count = model_settings['label_count']
    time_shift_samples = int(
        (FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)

    # Figure out the learning rates for each training phase. The number of
    # steps and the learning rates are given as comma-separated lists, one
    # entry per phase. For example
    # --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
    # runs 13,000 training loops in total, with a rate of 0.001 for the first
    # 10,000 and 0.0001 for the final 3,000.
    training_steps_list = [
        int(steps) for steps in FLAGS.how_many_training_steps.split(',')]
    learning_rates_list = [
        float(rate) for rate in FLAGS.learning_rate.split(',')]
    if len(training_steps_list) != len(learning_rates_list):
        raise Exception(
            '--how_many_training_steps and --learning_rate must be equal length '
            'lists, but are %d and %d long instead' % (len(training_steps_list),
                                                       len(learning_rates_list)))

    fingerprint_input = tf.placeholder(
        tf.float32, [None, fingerprint_size], name='fingerprint_input')

    logits, dropout_prob = models.create_model(
        fingerprint_input,
        model_settings,
        FLAGS.model_architecture,
        hparam_string=FLAGS.hparams,
        is_training=True)

    # Define loss and optimizer.
    ground_truth_input = tf.placeholder(
        tf.float32, [None, label_count], name='groundtruth_input')

    # Optionally add runtime checks to spot when NaNs or other symptoms of
    # numerical errors start occurring during training.
    control_dependencies = []
    if FLAGS.check_nans:
        control_dependencies = [tf.add_check_numerics_ops()]

    # Create the back propagation and training evaluation machinery.
    with tf.name_scope('cross_entropy'):
        cross_entropy_mean = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                labels=ground_truth_input, logits=logits))
    tf.summary.scalar('cross_entropy', cross_entropy_mean)

    with tf.name_scope('train'), tf.control_dependencies(
            control_dependencies):
        learning_rate_input = tf.placeholder(
            tf.float32, [], name='learning_rate_input')

        # Pick the optimizer class by name, then instantiate it once.
        if optimizer_name == 'Momentum':
            optimizer_cls = tf.train.MomentumOptimizer
        elif optimizer_name.lower() == 'nadam':
            optimizer_cls = tf.contrib.opt.NadamOptimizer
        else:
            optimizer_cls = tf.contrib.layers.OPTIMIZER_CLS_NAMES[
                optimizer_name]
        optimizer = optimizer_cls(
            learning_rate=learning_rate_input, **optimizer_params)

        def _clip_gradients(grads_and_vars, value):
            """Clips gradients by global norm."""
            gradients, variables = zip(*grads_and_vars)
            gradients, _ = tf.clip_by_global_norm(gradients, value)
            return list(zip(gradients, variables))

        reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        if reg_loss:
            tf.logging.info("add regularization loss")
            grad_vars = optimizer.compute_gradients(
                cross_entropy_mean + tf.reduce_mean(reg_loss))
        else:
            grad_vars = optimizer.compute_gradients(cross_entropy_mean)

        if clip_gradients > 0.0:
            grad_vars = _clip_gradients(grad_vars, clip_gradients)

        # Add dependency on UPDATE_OPS; otherwise batchnorm won't work
        # correctly. See:
        # https://github.com/tensorflow/tensorflow/issues/1122
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_step = optimizer.apply_gradients(grad_vars)

    predicted_indices = tf.argmax(logits, 1)
    expected_indices = tf.argmax(ground_truth_input, 1)
    correct_prediction = tf.equal(predicted_indices, expected_indices)
    confusion_matrix = tf.confusion_matrix(
        expected_indices, predicted_indices, num_classes=label_count)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', evaluation_step)

    global_step = tf.contrib.framework.get_or_create_global_step()
    increment_global_step = tf.assign(global_step, global_step + 1)

    saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

    # Merge all the summaries and write them out to /tmp/retrain_logs (by
    # default).
    merged_summaries = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                         sess.graph)
    validation_writer = tf.summary.FileWriter(
        FLAGS.summaries_dir + '/validation')

    tf.global_variables_initializer().run()

    # Resume from the explicit start checkpoint if given, otherwise from the
    # latest checkpoint in the training directory (when there is one).
    start_step = 1
    resume_from = FLAGS.start_checkpoint
    if not resume_from:
        resume_from = tf.train.latest_checkpoint(FLAGS.train_dir)
    if resume_from:
        models.load_variables_from_checkpoint(sess, resume_from)
        start_step = global_step.eval(session=sess)

    tf.logging.info('Training from step: %d ', start_step)

    # Save graph.pbtxt.
    tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                         FLAGS.model_architecture + '.pbtxt')

    audio_processor = input_data.AudioProcessor(
        FLAGS.data_url,
        FLAGS.data_dir,
        FLAGS.silence_percentage,
        FLAGS.unknown_percentage,
        FLAGS.wanted_words.split(','),
        FLAGS.validation_percentage,
        FLAGS.testing_percentage,
        model_settings,
        feature_scaling=FLAGS.feature_scaling)

    # Save list of words.
    labels_path = os.path.join(FLAGS.train_dir,
                               FLAGS.model_architecture + '_labels.txt')
    with gfile.GFile(labels_path, 'w') as f:
        f.write('\n'.join(audio_processor.words_list))

    # Training loop.
    training_steps_max = np.sum(training_steps_list)
    data_offset = 0
    audio_processor.shuffle_data('training')
    for training_step in xrange(start_step, training_steps_max + 1):
        # Figure out what the current learning rate is: the rate of the first
        # phase whose cumulative step count covers this step.
        steps_so_far = 0
        for phase_steps, phase_rate in zip(training_steps_list,
                                           learning_rates_list):
            steps_so_far += phase_steps
            if training_step <= steps_so_far:
                learning_rate_value = phase_rate
                break

        # Start over (and reshuffle) once a full batch no longer fits.
        last_full_offset = (
            audio_processor.set_size('training') - FLAGS.batch_size)
        if data_offset > last_full_offset:
            data_offset = 0
            audio_processor.shuffle_data('training')

        # Pull the audio samples we'll use for training.
        data_start = time.time()
        train_fingerprints, train_ground_truth = audio_processor.get_data(
            FLAGS.batch_size, data_offset, model_settings,
            FLAGS.background_frequency, FLAGS.background_volume,
            time_shift_samples, 'training', sess)
        tf.logging.info("---- get_data %s seconds ----" %
                        str(time.time() - data_start)[:5])
        data_offset += FLAGS.batch_size

        # Run the graph with this batch of training data.
        train_fetches = [
            merged_summaries, evaluation_step, cross_entropy_mean,
            train_step, increment_global_step
        ]
        train_feed = {
            fingerprint_input: train_fingerprints,
            ground_truth_input: train_ground_truth,
            learning_rate_input: learning_rate_value,
            dropout_prob: FLAGS.dropout_prob
        }
        train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
            train_fetches, feed_dict=train_feed)
        train_writer.add_summary(train_summary, training_step)

        if training_step % 10 == 1:
            epoch = int((training_step * FLAGS.batch_size) /
                        audio_processor.set_size('training'))
            tf.logging.info(
                'Time: %s, Epoch #%d: step #%d: rate %f, accuracy %.1f%%, cross entropy %f'
                % (time.asctime(), epoch, training_step,
                   learning_rate_value, train_accuracy * 100,
                   cross_entropy_value))

        is_last_step = (training_step == training_steps_max)
        if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
            set_size = audio_processor.set_size('validation')
            total_accuracy = 0
            total_conf_matrix = None
            for offset in xrange(0, set_size, FLAGS.batch_size):
                validation_fingerprints, validation_ground_truth = (
                    audio_processor.get_data(FLAGS.batch_size, offset,
                                             model_settings, 0.0, 0.0, 0,
                                             'validation', sess))
                # Run a validation step and capture training summaries for
                # TensorBoard with the `merged` op.
                validation_summary, validation_accuracy, conf_matrix = (
                    sess.run(
                        [merged_summaries, evaluation_step,
                         confusion_matrix],
                        feed_dict={
                            fingerprint_input: validation_fingerprints,
                            ground_truth_input: validation_ground_truth,
                            dropout_prob: 1.0
                        }))
                validation_writer.add_summary(validation_summary,
                                              training_step)
                # The last batch may be short; weight it by its real size.
                batch_size = min(FLAGS.batch_size, set_size - offset)
                total_accuracy += (
                    validation_accuracy * batch_size) / set_size
                if total_conf_matrix is None:
                    total_conf_matrix = conf_matrix
                else:
                    total_conf_matrix += conf_matrix
            tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
            tf.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' %
                            (training_step, total_accuracy * 100, set_size))

        # Save the model checkpoint periodically.
        if (training_step % FLAGS.save_step_interval == 0 or
                training_step == training_steps_max):
            checkpoint_path = os.path.join(
                FLAGS.train_dir, FLAGS.model_architecture + '.ckpt')
            tf.logging.info('Saving to "%s-%d"', checkpoint_path,
                            training_step)
            saver.save(sess, checkpoint_path, global_step=training_step)

    # Final evaluation on the held-out test set.
    set_size = audio_processor.set_size('testing')
    tf.logging.info('set_size=%d', set_size)
    total_accuracy = 0
    total_conf_matrix = None
    for offset in xrange(0, set_size, FLAGS.batch_size):
        test_fingerprints, test_ground_truth = audio_processor.get_data(
            FLAGS.batch_size, offset, model_settings, 0.0, 0.0, 0, 'testing',
            sess)
        test_accuracy, conf_matrix = sess.run(
            [evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: test_fingerprints,
                ground_truth_input: test_ground_truth,
                dropout_prob: 1.0
            })
        batch_size = min(FLAGS.batch_size, set_size - offset)
        total_accuracy += (test_accuracy * batch_size) / set_size
        if total_conf_matrix is None:
            total_conf_matrix = conf_matrix
        else:
            total_conf_matrix += conf_matrix
    tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
    tf.logging.info('Final test accuracy = %.1f%% (N=%d)' %
                    (total_accuracy * 100, set_size))
def main(unused_args):
    """Train an IBP classifier on MNIST or CIFAR-10 and evaluate it regularly.

    Builds the training pipeline and losses, an in-graph `tf.while_loop` that
    averages the test metrics over the whole test set, then runs FLAGS.steps
    training steps, evaluating every FLAGS.test_every_n iterations and saving
    the predictor variables once training is done.
    """
    logging.info('Training IBP on %s...', FLAGS.dataset.upper())
    step = tf.train.get_or_create_global_step()

    # Learning rate.
    learning_rate = ibp.parse_learning_rate(step, FLAGS.learning_rate)

    # Dataset.
    input_bounds = (0., 1.)
    num_classes = 10
    is_cifar10 = FLAGS.dataset == 'cifar10'
    if FLAGS.dataset == 'mnist':
        data_train, data_test = tf.keras.datasets.mnist.load_data()
    else:
        assert FLAGS.dataset == 'cifar10', (
            'Unknown dataset "{}"'.format(FLAGS.dataset))
        data_train, data_test = tf.keras.datasets.cifar10.load_data()
        # Flatten the label arrays to one dimension.
        train_images, train_labels = data_train[0], data_train[1]
        test_images, test_labels = data_test[0], data_test[1]
        data_train = (train_images, train_labels.flatten())
        data_test = (test_images, test_labels.flatten())
    data = ibp.build_dataset(
        data_train, batch_size=FLAGS.batch_size, sequential=False)
    if is_cifar10:
        augmented = ibp.randomize(
            data.image, (32, 32, 3),
            expand_shape=(40, 40, 3),
            crop_shape=(32, 32, 3),
            vertical_flip=True)
        data = data._replace(image=augmented)

    # Base predictor network.
    original_predictor = ibp.DNN(num_classes, layers(FLAGS.model))
    predictor = original_predictor
    if is_cifar10:
        mean = (0.4914, 0.4822, 0.4465)
        std = (0.2023, 0.1994, 0.2010)
        predictor = ibp.add_image_normalization(original_predictor, mean, std)
    predictor = ibp.VerifiableModelWrapper(predictor)

    # Training.
    loss_weights = {
        'nominal': {'init': FLAGS.nominal_xent_init,
                    'final': FLAGS.nominal_xent_final},
        'attack': {'init': FLAGS.attack_xent_init,
                   'final': FLAGS.attack_xent_final},
        'verified': {'init': FLAGS.verified_xent_init,
                     'final': FLAGS.verified_xent_final},
    }
    train_losses, train_loss, _ = ibp.create_classification_losses(
        step,
        data.image,
        data.label,
        predictor,
        FLAGS.epsilon_train,
        loss_weights=loss_weights,
        warmup_steps=FLAGS.warmup_steps,
        rampup_steps=FLAGS.rampup_steps,
        input_bounds=input_bounds)
    saver = tf.train.Saver(original_predictor.get_variables())
    optimizer = tf.train.AdamOptimizer(learning_rate)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(train_loss, step)

    # Test using while loop.
    def get_test_metrics(batch_size, attack_builder=ibp.UntargetedPGDAttack):
        """Returns the test metrics."""
        num_test_batches = len(data_test[0]) // batch_size
        assert len(data_test[0]) % batch_size == 0, (
            'Test data is not a multiple of batch size.')

        def cond(i, *unused_args):
            return i < num_test_batches

        def body(i, metrics):
            """Compute the sum of all metrics."""
            test_data = ibp.build_dataset(
                data_test, batch_size=batch_size, sequential=True)
            predictor(test_data.image, is_training=False)
            lower = tf.maximum(test_data.image - FLAGS.epsilon,
                               input_bounds[0])
            upper = tf.minimum(test_data.image + FLAGS.epsilon,
                               input_bounds[1])
            predictor.propagate_bounds(ibp.IntervalBounds(lower, upper))
            test_specification = ibp.ClassificationSpecification(
                test_data.label, num_classes)
            test_attack = attack_builder(
                predictor,
                test_specification,
                FLAGS.epsilon,
                input_bounds=input_bounds,
                optimizer_builder=ibp.UnrolledAdam)
            test_losses = ibp.Losses(
                predictor, test_specification, test_attack)
            test_losses(test_data.label)
            # Accumulate this batch's metrics onto the running sums.
            new_metrics = [
                running + current
                for running, current in zip(metrics,
                                            test_losses.scalar_metrics)
            ]
            return i + 1, new_metrics

        total_count = tf.constant(0, dtype=tf.int32)
        total_metrics = [
            tf.constant(0, dtype=tf.float32)
            for _ in range(len(ibp.ScalarMetrics._fields))
        ]
        total_count, total_metrics = tf.while_loop(
            cond,
            body,
            loop_vars=[total_count, total_metrics],
            back_prop=False,
            parallel_iterations=1)
        total_count = tf.cast(total_count, tf.float32)
        # Turn the per-metric sums into averages over the batches.
        test_metrics = [m / total_count for m in total_metrics]
        return ibp.ScalarMetrics(*test_metrics)

    test_metrics = get_test_metrics(
        FLAGS.batch_size, ibp.UntargetedPGDAttack)
    summaries = [
        tf.summary.scalar(f, getattr(test_metrics, f))
        for f in test_metrics._fields
    ]
    test_summaries = tf.summary.merge(summaries)
    test_writer = tf.summary.FileWriter(
        os.path.join(FLAGS.output_dir, 'test'))

    # Run everything.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.train.SingularMonitoredSession(config=tf_config) as sess:
        train_fetches = [
            step, train_losses.scalar_losses.nominal_cross_entropy, train_op]
        for _ in range(FLAGS.steps):
            iteration, loss_value, _ = sess.run(train_fetches)
            if iteration % FLAGS.test_every_n == 0:
                metric_values, summary = sess.run(
                    [test_metrics, test_summaries])
                test_writer.add_summary(summary, iteration)
                show_metrics(iteration, metric_values, loss_value=loss_value)
        # Save while the monitored session (and its raw session) is open.
        saver.save(sess._tf_sess(),  # pylint: disable=protected-access
                   os.path.join(FLAGS.output_dir, 'model'),
                   global_step=FLAGS.steps - 1)
def build_model(self):
    """Build the graph: placeholders, networks, losses, optimizers, summaries.

    Creates the generator and autoencoder outputs, the autoencoder /
    coverage / preimage losses, one optimizer op per variable group, the
    k_d / k_g balance updates and the merged summary op, then runs the
    global variable initializer (which requires a default session).
    """
    float_type = tf.float64

    # Data, placeholders, and tracking variables.
    self.real_points = load_2d_data(
        self.dataset, self.real_n, self.real_dim)
    self.real_sample = tf.placeholder(
        float_type, [None, self.real_dim], name='real_sample')
    self.real_sample_opt_trans = tf.placeholder(
        float_type, [None, self.real_dim], name='real_sample_opt_trans')
    self.z = tf.placeholder(float_type, [None, self.z_dim], name='z')
    self.z_opt_trans = tf.placeholder(
        float_type, [None, self.z_dim], name='z_opt_trans')
    self.z_preimage = tf.Variable(
        tf.random_normal(
            [self.real_n, self.z_dim], stddev=0.1, dtype=float_type),
        name='z_preimage')
    self.k_d = tf.Variable(
        0., dtype=float_type, trainable=False, name='k_d')
    self.k_g = tf.Variable(
        0., dtype=float_type, trainable=False, name='k_g')

    def run_generator(latent, reuse):
        # Apply the generator with this model's generator hyperparameters.
        return generator(
            latent, self.g_layers_width, self.g_layers_depth,
            self.g_activations, self.g_out_dim, reuse=reuse)

    def run_autoencoder(points, reuse):
        # Encode then decode with this model's autoencoder hyperparameters.
        encoded = encoder(
            points, self.d_layers_width, self.d_layers_depth,
            self.d_activations, self.d_encoded_dim, reuse=reuse)
        return decoder(
            encoded, self.d_layers_width, self.d_layers_depth,
            self.d_activations, self.d_out_dim, reuse=reuse)

    # Generator and autoencoder outputs. Only the first call of each network
    # is made with reuse=False; every later call passes reuse=True.
    self.gen_z = run_generator(self.z, False)
    self.gen_z_opt_trans = run_generator(self.z_opt_trans, True)
    self.gen_z_preimage = run_generator(self.z_preimage, True)
    self.ae_real_sample = run_autoencoder(self.real_sample, False)
    self.ae_gen_z = run_autoencoder(self.gen_z, True)
    self.ae_grid = run_autoencoder(tf.convert_to_tensor(self.grid), True)

    # Autoencoder losses: mean absolute error, plus per-row L1 sums.
    self.ae_loss_real = tf.reduce_mean(
        tf.abs(self.ae_real_sample - self.real_sample))
    self.ae_loss_gen = tf.reduce_mean(tf.abs(self.ae_gen_z - self.gen_z))
    self.ae_loss_real_vals = tf.reduce_sum(
        tf.abs(self.ae_real_sample - self.real_sample), 1)
    self.ae_loss_gen_vals = tf.reduce_sum(
        tf.abs(self.ae_gen_z - self.gen_z), 1)
    self.ae_loss_grid_vals = tf.reduce_sum(
        tf.abs(self.ae_grid - self.grid), 1)

    # Discriminator, normality and preimage losses.
    self.d_loss = self.ae_loss_real - self.k_d * self.ae_loss_gen
    self.normality_loss = tf.py_func(
        self.normality_dist, [self.z_preimage], float_type)
    preimage_error = tf.reduce_mean(
        tf.abs(self.real_points - self.gen_z_preimage))
    self.gen_z_preimage_loss = (
        preimage_error + self.lambda_normality_loss * self.normality_loss)

    # Coverage loss formulas: munkres, moments.
    self.coverage_loss = tf.reduce_mean(
        tf.abs(self.gen_z_opt_trans - self.real_sample_opt_trans))
    self.gen_z_m1 = tf.reduce_mean(tf.pow(self.gen_z, 1), axis=0)
    self.gen_z_m2 = tf.reduce_mean(tf.pow(self.gen_z, 2), axis=0)
    self.gen_z_var = self.gen_z_m2 - tf.square(self.gen_z_m1)
    self.real_m1 = tf.reduce_mean(tf.pow(self.real_sample, 1), axis=0)
    self.real_m2 = tf.reduce_mean(tf.pow(self.real_sample, 2), axis=0)
    self.real_var = self.real_m2 - tf.square(self.real_m1)
    self.cvg_loss_m1 = tf.norm(self.gen_z_m1 - self.real_m1)
    self.cvg_loss_m2 = tf.norm(self.gen_z_m2 - self.real_m2)
    self.cvg_loss_var = tf.norm(self.gen_z_var - self.real_var)
    self.coverage_loss_moments = (
        self.cvg_loss_m1 + self.cvg_loss_m2 + self.cvg_loss_var)

    # Generator loss: preimage-style training uses the moment and preimage
    # terms; otherwise the optimal-transport coverage loss is used.
    if self.training_z in ['preimage', 'mix']:
        self.g_loss = self.ae_loss_gen + self.k_g * (
            self.coverage_loss_moments + self.gen_z_preimage_loss)
    else:
        self.g_loss = self.ae_loss_gen + self.k_g * self.coverage_loss

    # Variable groups, selected by substring of the variable name.
    self.g_vars = [
        v for v in tf.global_variables() if 'generator' in v.name]
    self.d_vars = [
        v for v in tf.global_variables() if 'autoencoder' in v.name]
    self.preimage_vars = [
        v for v in tf.global_variables() if 'preimage' in v.name]

    optimizer = (tf.train.AdagradOptimizer if self.optimizer == 'adagrad'
                 else tf.train.AdamOptimizer)

    # Optimization nodes.
    self.d_optim = optimizer(self.d_lr).minimize(
        self.d_loss, var_list=self.d_vars)
    self.g_optim = optimizer(self.g_lr).minimize(
        self.g_loss, var_list=self.g_vars)
    self.z_optim = optimizer(self.c_lr).minimize(
        self.gen_z_preimage_loss, var_list=self.preimage_vars)

    # Balance diagnostics.
    self.emp_gamma_d = self.ae_loss_gen / self.ae_loss_real
    self.emp_gamma_g = self.ae_loss_gen / self.coverage_loss
    self.balance_d = self.gamma_d * self.ae_loss_real - self.ae_loss_gen
    self.balance_g = self.gamma_g * self.coverage_loss - self.ae_loss_gen
    self.measure = self.ae_loss_real + tf.abs(self.balance_d)

    # Each k update is created under a control dependency on the matching
    # optimizer op; the new value is clipped to [0, 1].
    with tf.control_dependencies([self.d_optim]):
        next_k_d = tf.clip_by_value(
            self.k_d + self.lambda_k_d * self.balance_d, 0, 1)
        self.k_d_update = tf.assign(self.k_d, next_k_d)
    with tf.control_dependencies([self.g_optim]):
        next_k_g = tf.clip_by_value(
            self.k_g + self.lambda_k_g * self.balance_g, 0, 1)
        self.k_g_update = tf.assign(self.k_g, next_k_g)

    # Summary items.
    scalar_summaries = [
        tf.summary.scalar('loss/d_loss', self.d_loss),
        tf.summary.scalar('loss/ae_loss_real', self.ae_loss_real),
        tf.summary.scalar('loss/ae_loss_gen', self.ae_loss_gen),
        tf.summary.scalar('loss/g_loss', self.g_loss),
        tf.summary.scalar('loss/coverage_loss', self.coverage_loss),
        tf.summary.scalar('loss/normality_loss', self.normality_loss),
        tf.summary.scalar('balance/emp_gamma_d', self.emp_gamma_d),
        tf.summary.scalar('balance/emp_gamma_g', self.emp_gamma_g),
        tf.summary.scalar('balance/measure', self.measure),
        tf.summary.scalar('balance/k_d', self.k_d),
        tf.summary.scalar('balance/k_g', self.k_g),
        tf.summary.scalar('training/d_lr', self.d_lr),
        tf.summary.scalar('training/g_lr', self.g_lr),
    ]
    self.summary_op = tf.summary.merge(scalar_summaries)

    tf.global_variables_initializer().run()
return output_gate * tf.tanh(state), state # Input data. train_data = list() for _ in range(num_unrollings + 1): train_data.append( tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size])) train_inputs = train_data[:num_unrollings] train_labels = train_data[1:] # labels are inputs shifted by one time step. # Unrolled LSTM loop. outputs = list() output = saved_output state = saved_state for i in train_inputs: output, state = lstm_cell(i, output, state) outputs.append(output) # State saving across unrollings. with tf.control_dependencies([saved_output.assign(output),saved_state.assign(state)]): # Classifier. logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b) loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits( logits, tf.concat(0, train_labels))) # Optimizer. global_step = tf.Variable(0) learning_rate = tf.train.exponential_decay(10.0, global_step, 5000, 0.1, staircase=True) optimizer = tf.train.GradientDescentOptimizer(learning_rate) gradients, v = zip(*optimizer.compute_gradients(loss)) gradients, _ = tf.clip_by_global_norm(gradients, 1.25) optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step) # Predictions. train_prediction = tf.nn.softmax(logits) # Sampling and validation eval: batch 1, no unrolling. sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size]) saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
# restore_vars = tf.contrib.framework.get_variables_to_restore(include=RESTORE_PART) # update_vars = tf.contrib.framework.get_variables_to_restore(include=UPDATE_PART) # saver_restore = tf.train.Saver(var_list=restore_vars) # Restore all vars (if continue training) # saver_restore = tf.train.Saver() # average model ema = tf.train.ExponentialMovingAverage(decay=0.99, num_updates=global_step) ema_op = ema.apply(tf.trainable_variables()) # Update the BN vars update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) # train_step = optimizer.minimize(loss, global_step=global_step, var_list=update_vars+restore_vars) train_step = optimizer.minimize(loss, global_step=global_step) with tf.control_dependencies(update_ops): with tf.control_dependencies([train_step, ema_op]): train_op = tf.no_op(name='train') # Set session config config = tf.ConfigProto(allow_soft_placement=True) config.gpu_options.per_process_gpu_memory_fraction = 0.80 with tf.Session(config=config) as sess: sess.run((tf.global_variables_initializer(), tf.local_variables_initializer())) # saver_restore.restore(sess, RESTORE_PATH) saver = tf.train.Saver() write_op = tf.summary.merge_all() write_train = tf.summary.FileWriter('./log/train', sess.graph) write_val = tf.summary.FileWriter('./log/val')
def mean_var_with_update():
    """Return identity copies of `mean` and `variance` that force the updates.

    Both returned tensors are created under a control dependency on the two
    moving-statistics update ops, so evaluating either one runs those updates
    first.
    """
    moving_stat_updates = [update_moving_mean, update_moving_variance]
    with tf.control_dependencies(moving_stat_updates):
        batch_mean = tf.identity(mean)
        batch_variance = tf.identity(variance)
    return batch_mean, batch_variance
def _build_optimizer(optimizer, learning_rate, scale_by_workers):
    """Create the tf.train optimizer named by `optimizer`.

    Args:
        optimizer: one of 'ADAGRAD', 'ADADELTA', 'ADAM', 'RMSPROP', 'MOM'.
        learning_rate: base learning rate.
        scale_by_workers: when true, the learning rate is multiplied by
            hvd.size() before the optimizer is created.

    Returns:
        The configured (not yet Horovod-wrapped) optimizer.

    Raises:
        ValueError: if `optimizer` is not one of the supported names.
    """
    # Validate first so an unknown name fails before hvd.size() is touched.
    if optimizer not in ('ADAGRAD', 'ADADELTA', 'ADAM', 'RMSPROP', 'MOM'):
        raise ValueError('Invalid optimization algorithm')

    if scale_by_workers:
        learning_rate = learning_rate * hvd.size()

    if optimizer == 'ADAGRAD':
        return tf.train.AdagradOptimizer(learning_rate)
    if optimizer == 'ADADELTA':
        return tf.train.AdadeltaOptimizer(learning_rate, rho=0.9,
                                          epsilon=1e-6)
    if optimizer == 'ADAM':
        return tf.train.AdamOptimizer(learning_rate, beta1=0.9, beta2=0.999,
                                      epsilon=0.1)
    if optimizer == 'RMSPROP':
        return tf.train.RMSPropOptimizer(learning_rate, decay=0.9,
                                         momentum=0.9, epsilon=1.0)
    return tf.train.MomentumOptimizer(learning_rate, 0.9, use_nesterov=True)


def train(total_loss, global_step, optimizer, learning_rate,
          moving_average_decay, update_gradient_vars, warm_flag,
          log_histograms=True):
    """Build the Horovod-distributed training op.

    Args:
        total_loss: loss tensor to minimize.
        global_step: variable incremented by each gradient application; also
            passed to the variable moving average.
        optimizer: optimizer name, one of 'ADAGRAD', 'ADADELTA', 'ADAM',
            'RMSPROP', 'MOM'.
        learning_rate: base learning rate.
        moving_average_decay: decay of the moving average kept over all
            trainable variables.
        update_gradient_vars: variables for which gradients are computed.
        warm_flag: when true, the learning rate is multiplied by hvd.size().
        log_histograms: when true, add histogram summaries for the trainable
            variables and for their gradients.

    Returns:
        A no-op named 'train' that depends on the gradient application and on
        the moving-average update.

    Raises:
        ValueError: if `optimizer` is not a supported name.
    """
    # Generate moving averages of all losses and associated summaries.
    loss_averages_op = _add_loss_summaries(total_loss)

    # Compute gradients. The optimizer (including any learning-rate scaling)
    # is created under the dependency on the loss averages. Both settings of
    # warm_flag share this path; they differ only in the learning-rate scale.
    with tf.control_dependencies([loss_averages_op]):
        opt = _build_optimizer(optimizer, learning_rate, warm_flag)
        # Add Horovod Distributed Optimizer.
        opt = hvd.DistributedOptimizer(opt)
        grads = opt.compute_gradients(total_loss, update_gradient_vars)

    # Apply gradients.
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    if log_histograms:
        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                tf.summary.histogram(var.op.name + '/gradients', grad)

    # Track the moving averages of all trainable variables.
    variable_averages = tf.train.ExponentialMovingAverage(
        moving_average_decay, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())

    with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
        train_op = tf.no_op(name='train')

    return train_op