def testPiecewiseConstantEdgeCases(self):
  with self.test_session():
    with self.assertRaises(ValueError):
      x_int = variables.Variable(0, dtype=variables.dtypes.int32)
      boundaries, values = [-1.0, 1.0], [1, 2, 3]
      learning_rate_decay.piecewise_constant(x_int, boundaries, values)
    with self.assertRaises(ValueError):
      x = variables.Variable(0.0)
      boundaries, values = [-1.0, 1.0], [1.0, 2, 3]
      learning_rate_decay.piecewise_constant(x, boundaries, values)
def testPiecewiseConstantEdgeCases(self):
  with self.test_session():
    x_int = variables.Variable(0, dtype=variables.dtypes.int32)
    boundaries, values = [-1.0, 1.0], [1, 2, 3]
    with self.assertRaises(ValueError):
      learning_rate_decay.piecewise_constant(x_int, boundaries, values)
    x = variables.Variable(0.0)
    boundaries, values = [-1.0, 1.0], [1.0, 2, 3]
    with self.assertRaises(ValueError):
      learning_rate_decay.piecewise_constant(x, boundaries, values)
def decay_fn(learning_rate, global_step):
  """The computed learning rate decay function."""
  global_step = tf.to_int32(global_step)
  decayed_learning_rate = eval(decay_type)(
      learning_rate=learning_rate,
      global_step=tf.minimum(global_step, stop_decay_at) - start_decay_at,
      decay_steps=decay_steps,
      decay_rate=decay_rate,
      staircase=staircase,
      name="decayed_learning_rate",
      **kwargs)
  other_tensor_dict = {}
  if isinstance(decayed_learning_rate, tuple):
    decayed_learning_rate, other_tensor_dict = decayed_learning_rate
  final_lr = learning_rate_decay.piecewise_constant(
      x=global_step,
      boundaries=[start_decay_at],
      values=[learning_rate, decayed_learning_rate])
  if min_learning_rate:
    final_lr = tf.maximum(final_lr, min_learning_rate)
  return final_lr, other_tensor_dict
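# Hedged aside, not part of the snippet above: eval(decay_type) executes an
# arbitrary string in the enclosing scope. Assuming decay_type names a public
# tf.train schedule such as "exponential_decay", a getattr lookup is a safer
# equivalent sketch:
import tensorflow as tf


def _resolve_decay_fn(decay_type):
    # Hypothetical helper (not in the original): resolves e.g.
    # "exponential_decay" to tf.train.exponential_decay without eval().
    return getattr(tf.train, decay_type)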
def testPiecewiseConstantEdgeCases(self):
  with self.test_session():
    x_int = variables.Variable(0, dtype=variables.dtypes.int32)
    boundaries, values = [-1.0, 1.0], [1, 2, 3]
    with self.assertRaises(ValueError):
      learning_rate_decay.piecewise_constant(x_int, boundaries, values)
    x = variables.Variable(0.0)
    boundaries, values = [-1.0, 1.0], [1.0, 2, 3]
    with self.assertRaises(ValueError):
      learning_rate_decay.piecewise_constant(x, boundaries, values)
    # Test that ref types are valid.
    x_ref = x.op.outputs[0]  # float32_ref tensor should be accepted
    boundaries, values = [1.0, 2.0], [1, 2, 3]
    learning_rate_decay.piecewise_constant(x_ref, boundaries, values)
    # Test casting boundaries from int32 to int64.
    x_int64 = variables.Variable(0, dtype=variables.dtypes.int64)
    assign_1 = x_int64.assign(1)
    assign_2 = x_int64.assign(2)
    assign_3 = x_int64.assign(3)
    assign_4 = x_int64.assign(4)
    boundaries, values = [1, 2, 3], [0.4, 0.5, 0.6, 0.7]
    pc = learning_rate_decay.piecewise_constant(x_int64, boundaries, values)

    variables.global_variables_initializer().run()
    self.assertAllClose(pc.eval(), 0.4, 1e-6)
    assign_1.op.run()
    self.assertAllClose(pc.eval(), 0.4, 1e-6)
    assign_2.op.run()
    self.assertAllClose(pc.eval(), 0.5, 1e-6)
    assign_3.op.run()
    self.assertAllClose(pc.eval(), 0.6, 1e-6)
    assign_4.op.run()
    self.assertAllClose(pc.eval(), 0.7, 1e-6)
def testPiecewiseConstantEdgeCases(self):
  x_int = resource_variable_ops.ResourceVariable(
      0, dtype=variables.dtypes.int32)
  boundaries, values = [-1.0, 1.0], [1, 2, 3]
  with self.assertRaises(ValueError):
    decayed_lr = learning_rate_decay.piecewise_constant(
        x_int, boundaries, values)
    if context.executing_eagerly():
      decayed_lr()

  x = resource_variable_ops.ResourceVariable(0.0)
  boundaries, values = [-1.0, 1.0], [1.0, 2, 3]
  with self.assertRaises(ValueError):
    decayed_lr = learning_rate_decay.piecewise_constant(
        x, boundaries, values)
    if context.executing_eagerly():
      decayed_lr()

  # Test that ref types are valid.
  if not context.executing_eagerly():
    x = variables.Variable(0.0)
    x_ref = x.op.outputs[0]  # float32_ref tensor should be accepted
    boundaries, values = [1.0, 2.0], [1, 2, 3]
    learning_rate_decay.piecewise_constant(x_ref, boundaries, values)

  # Test casting boundaries from int32 to int64.
  x_int64 = resource_variable_ops.ResourceVariable(
      0, dtype=variables.dtypes.int64)
  boundaries, values = [1, 2, 3], [0.4, 0.5, 0.6, 0.7]
  decayed_lr = learning_rate_decay.piecewise_constant(
      x_int64, boundaries, values)

  self.evaluate(variables.global_variables_initializer())
  self.assertAllClose(self.evaluate(decayed_lr), 0.4, 1e-6)
  self.evaluate(x_int64.assign(1))
  self.assertAllClose(self.evaluate(decayed_lr), 0.4, 1e-6)
  self.evaluate(x_int64.assign(2))
  self.assertAllClose(self.evaluate(decayed_lr), 0.5, 1e-6)
  self.evaluate(x_int64.assign(3))
  self.assertAllClose(self.evaluate(decayed_lr), 0.6, 1e-6)
  self.evaluate(x_int64.assign(4))
  self.assertAllClose(self.evaluate(decayed_lr), 0.7, 1e-6)
def testPiecewiseConstantEdgeCases(self):
  x_int = resource_variable_ops.ResourceVariable(
      0, dtype=variables.dtypes.int32)
  boundaries, values = [-1.0, 1.0], [1, 2, 3]
  with self.assertRaises(ValueError):
    decayed_lr = learning_rate_decay.piecewise_constant(
        x_int, boundaries, values)
    if context.executing_eagerly():
      decayed_lr()

  x = resource_variable_ops.ResourceVariable(0.0)
  boundaries, values = [-1.0, 1.0], [1.0, 2, 3]
  with self.assertRaises(ValueError):
    decayed_lr = learning_rate_decay.piecewise_constant(
        x, boundaries, values)
    if context.executing_eagerly():
      decayed_lr()

  # Test that ref types are valid.
  if not context.executing_eagerly():
    x = variables.VariableV1(0.0)
    x_ref = x.op.outputs[0]  # float32_ref tensor should be accepted
    boundaries, values = [1.0, 2.0], [1, 2, 3]
    learning_rate_decay.piecewise_constant(x_ref, boundaries, values)

  # Test casting boundaries from int32 to int64.
  x_int64 = resource_variable_ops.ResourceVariable(
      0, dtype=variables.dtypes.int64)
  boundaries, values = [1, 2, 3], [0.4, 0.5, 0.6, 0.7]
  decayed_lr = learning_rate_decay.piecewise_constant(
      x_int64, boundaries, values)

  self.evaluate(variables.global_variables_initializer())
  self.assertAllClose(self.evaluate(decayed_lr), 0.4, 1e-6)
  self.evaluate(x_int64.assign(1))
  self.assertAllClose(self.evaluate(decayed_lr), 0.4, 1e-6)
  self.evaluate(x_int64.assign(2))
  self.assertAllClose(self.evaluate(decayed_lr), 0.5, 1e-6)
  self.evaluate(x_int64.assign(3))
  self.assertAllClose(self.evaluate(decayed_lr), 0.6, 1e-6)
  self.evaluate(x_int64.assign(4))
  self.assertAllClose(self.evaluate(decayed_lr), 0.7, 1e-6)
def testPiecewiseConstant(self):
  x = resource_variable_ops.ResourceVariable(-999)
  decayed_lr = learning_rate_decay.piecewise_constant(
      x, [100, 110, 120], [1.0, 0.1, 0.01, 0.001])

  self.evaluate(variables.global_variables_initializer())
  self.assertAllClose(self.evaluate(decayed_lr), 1.0, 1e-6)
  self.evaluate(x.assign(100))
  self.assertAllClose(self.evaluate(decayed_lr), 1.0, 1e-6)
  self.evaluate(x.assign(105))
  self.assertAllClose(self.evaluate(decayed_lr), 0.1, 1e-6)
  self.evaluate(x.assign(110))
  self.assertAllClose(self.evaluate(decayed_lr), 0.1, 1e-6)
  self.evaluate(x.assign(120))
  self.assertAllClose(self.evaluate(decayed_lr), 0.01, 1e-6)
  self.evaluate(x.assign(999))
  self.assertAllClose(self.evaluate(decayed_lr), 0.001, 1e-6)
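# For reference, a minimal standalone sketch (not the test above) of the same
# schedule through the public tf.train.piecewise_constant alias, assuming
# TF 1.x graph mode; the step values mirror the assertions in the test.
import tensorflow as tf

global_step = tf.Variable(-999, trainable=False)
# 1.0 for step <= 100, 0.1 for 100 < step <= 110, 0.01 for 110 < step <= 120,
# and 0.001 for step > 120.
lr = tf.train.piecewise_constant(
    global_step, boundaries=[100, 110, 120], values=[1.0, 0.1, 0.01, 0.001])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(lr))  # 1.0
    sess.run(global_step.assign(105))
    print(sess.run(lr))  # 0.1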
def testPiecewiseConstantEdgeCases(self):
  with self.test_session():
    x_int = variables.Variable(0, dtype=variables.dtypes.int32)
    boundaries, values = [-1.0, 1.0], [1, 2, 3]
    with self.assertRaises(ValueError):
      learning_rate_decay.piecewise_constant(x_int, boundaries, values)
    x = variables.Variable(0.0)
    boundaries, values = [-1.0, 1.0], [1.0, 2, 3]
    with self.assertRaises(ValueError):
      learning_rate_decay.piecewise_constant(x, boundaries, values)
    # Test that ref types are valid.
    x_ref = x.op.outputs[0]  # float32_ref tensor should be accepted
    boundaries, values = [1.0, 2.0], [1, 2, 3]
    learning_rate_decay.piecewise_constant(x_ref, boundaries, values)
def apply_lr_decay(cfg, global_step):
    # Learning rate schedule
    if cfg.lr_decay is None:
        lr = cfg.lr
    elif cfg.lr_decay == 'exp':
        lr = exponential_decay(cfg.lr, global_step, cfg.decay_steps,
                               cfg.decay_rate, staircase=cfg.staircase)
    elif cfg.lr_decay == 'piecewise':
        lr = piecewise_constant(global_step, cfg.lr_boundaries, cfg.lr_values)
    elif cfg.lr_decay == 'polynomial':
        # Note: cfg.staircase doubles as the polynomial `cycle` flag here.
        lr = polynomial_decay(cfg.lr, global_step, cfg.decay_steps,
                              end_learning_rate=cfg.end_lr, power=cfg.power,
                              cycle=cfg.staircase)
    elif cfg.lr_decay == 'natural_exp':
        lr = natural_exp_decay(cfg.lr, global_step, cfg.decay_steps,
                               cfg.decay_rate, staircase=cfg.staircase)
    elif cfg.lr_decay == 'inverse_time':
        lr = inverse_time_decay(cfg.lr, global_step, cfg.decay_steps,
                                cfg.decay_rate, staircase=cfg.staircase)
    elif cfg.lr_decay == 'STN':
        epoch = tf.cast(global_step / cfg.decay_steps, tf.int32)
        lr = cfg.lr * tf.pow(0.5, tf.cast(epoch / 50, cfg._FLOATX))
    else:
        raise NotImplementedError()
    return lr
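# Hedged usage sketch for apply_lr_decay: the _Cfg class below is a
# hypothetical stand-in (the real cfg type is not shown in the snippet), with
# only the fields read on the 'piecewise' branch filled in.
import tensorflow as tf


class _Cfg(object):
    lr = 0.1
    lr_decay = 'piecewise'
    lr_boundaries = [32000, 48000]  # illustrative step boundaries
    lr_values = [0.1, 0.01, 0.001]  # len(values) == len(boundaries) + 1


global_step = tf.train.get_or_create_global_step()
lr = apply_lr_decay(_Cfg(), global_step)  # scheduled learning-rate tensor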
def testPiecewiseConstant(self):
  with self.test_session():
    x = variables.Variable(-999)
    assign_100 = x.assign(100)
    assign_105 = x.assign(105)
    assign_110 = x.assign(110)
    assign_120 = x.assign(120)
    assign_999 = x.assign(999)
    pc = learning_rate_decay.piecewise_constant(
        x, [100, 110, 120], [1.0, 0.1, 0.01, 0.001])

    variables.initialize_all_variables().run()
    self.assertAllClose(pc.eval(), 1.0, 1e-6)
    assign_100.op.run()
    self.assertAllClose(pc.eval(), 1.0, 1e-6)
    assign_105.op.run()
    self.assertAllClose(pc.eval(), 0.1, 1e-6)
    assign_110.op.run()
    self.assertAllClose(pc.eval(), 0.1, 1e-6)
    assign_120.op.run()
    self.assertAllClose(pc.eval(), 0.01, 1e-6)
    assign_999.op.run()
    self.assertAllClose(pc.eval(), 0.001, 1e-6)
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(), tf.device('/cpu:0'): # Create a variable to count the number of train() calls. This equals the # number of batches processed * FLAGS.num_gpus. global_step = tf.get_variable( 'global_step', [], dtype=tf.int32, initializer=tf.constant_initializer(0), trainable=False) # global_step = tf.Variable(0, trainable=False) # Calculate the learning rate schedule. num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size) print(cifar10.NUM_EPOCHS_PER_DECAY) decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY) # # Decay the learning rate exponentially based on the number of steps. # lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE, # global_step, # decay_steps, # cifar10.LEARNING_RATE_DECAY_FACTOR, # staircase=True) # lr manual control lr_boundaries = list() lr_values = list() for drop_no in range(1, 21): cifar10.LR_DROP_EVERY_NO_STEPS = cifar10.LR_DROP_EVERY_NO_EPOCHS * num_batches_per_epoch lr_boundary = int(drop_no * cifar10.LR_DROP_EVERY_NO_STEPS) lr_boundaries.append(lr_boundary) lr_value = cifar10.INITIAL_LEARNING_RATE / 2 ** (drop_no - 1) lr_values.append(lr_value) print(lr_boundaries) print(lr_values) # boundaries = [100000, 110000] # values = [1.0, 0.5, 0.1] # int_global_step = int(global_step) lr = learning_rate_decay.piecewise_constant(global_step, lr_boundaries, lr_values) # Create an optimizer that performs gradient descent. # opt = tf.train.GradientDescentOptimizer(lr) opt = tf.train.MomentumOptimizer(lr, cifar10.MOMENTUM) # Calculate the gradients for each model tower. tower_grads = [] for i in xrange(FLAGS.num_gpus): with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope: # Calculate the loss for one tower of the CIFAR model. This function # constructs the entire CIFAR model but shares the variables across # all towers. loss = tower_loss(scope) # Reuse variables for the next tower. tf.get_variable_scope().reuse_variables() # Retain the summaries from the final tower. summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) # Calculate the gradients for the batch of data on this CIFAR tower. grads = opt.compute_gradients(loss) # Keep track of the gradients across all towers. tower_grads.append(grads) # We must calculate the mean of each gradient. Note that this is the # synchronization point across all towers. grads = average_gradients(tower_grads) # Add a summary to track the learning rate. summaries.append(tf.scalar_summary('learning_rate', lr)) # Add histograms for gradients. for grad, var in grads: if grad is not None: summaries.append( tf.histogram_summary(var.op.name + '/gradients', grad)) # Apply the gradients to adjust the shared variables. apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) # Add histograms for trainable variables. for var in tf.trainable_variables(): summaries.append(tf.histogram_summary(var.op.name, var)) # Track the moving averages of all trainable variables. variable_averages = tf.train.ExponentialMovingAverage( cifar10.MOVING_AVERAGE_DECAY, global_step) variables_averages_op = variable_averages.apply(tf.trainable_variables()) # Group all updates to into a single train op. train_op = tf.group(apply_gradient_op, variables_averages_op) # Create a saver. saver = tf.train.Saver(tf.all_variables()) # Build the summary operation from the last tower summaries. summary_op = tf.merge_summary(summaries) # Build an initialization operation to run below. 
init = tf.initialize_all_variables() # Start running operations on the Graph. allow_soft_placement must be set to # True to build towers on GPU, as some of the ops do not have GPU # implementations. sess = tf.Session(config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=FLAGS.log_device_placement)) sess.run(init) # Start the queue runners. tf.train.start_queue_runners(sess=sess) summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph) for step in xrange(FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus examples_per_sec = num_examples_per_step / duration sec_per_batch = duration / FLAGS.num_gpus format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) # Save the model checkpoint periodically. if step % 1000 == 0 or (step + 1) == FLAGS.max_steps: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step)
def pc():
  return learning_rate_decay.piecewise_constant(x_int64, boundaries, values)
def pc():
  return learning_rate_decay.piecewise_constant(
      x, [100, 110, 120], [1.0, 0.1, 0.01, 0.001])
def _nested_func(global_step):
  return learning_rate_decay.piecewise_constant(
      global_step, boundaries=step_list, values=values)
def train(total_loss, global_step):
  """Train CIFAR-10 model.

  Create an optimizer and apply to all trainable variables. Add moving
  average for all trainable variables.

  Args:
    total_loss: Total loss from loss().
    global_step: Integer Variable counting the number of training steps
      processed.

  Returns:
    train_op: op for training.
  """
  # Variables that affect learning rate.
  num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
  decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

  # Drop learning rate every "epoch_step" epochs
  # if epoch % opt.epoch_step == 0 then
  #   optimState.learningRate = optimState.learningRate / 2
  # end

  # Decay the learning rate exponentially based on the number of steps.

  # debug
  print("global_step: " + str(global_step))
  print("NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN: " +
        str(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN))
  print("batch size: " + str(FLAGS.batch_size))
  # epochs_done = math.floor(float(global_step) * FLAGS.batch_size /
  #                          NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN)
  # print("epochs done: " + str(epochs_done))

  # # lr decay
  # lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
  #                                 global_step,
  #                                 decay_steps,
  #                                 LEARNING_RATE_DECAY_FACTOR,
  #                                 staircase=True)

  # lr manual control
  lr_boundaries = list()
  lr_values = list()
  for drop_no in range(1, 21):
    LR_DROP_EVERY_NO_STEPS = LR_DROP_EVERY_NO_EPOCHS * num_batches_per_epoch
    lr_boundary = int(drop_no * LR_DROP_EVERY_NO_STEPS)
    lr_boundaries.append(lr_boundary)
    lr_value = INITIAL_LEARNING_RATE / 2 ** (drop_no - 1)
    lr_values.append(lr_value)
  # piecewise_constant expects len(values) == len(boundaries) + 1; the loop
  # builds 20 of each, so append one more value for steps past the last
  # boundary.
  lr_values.append(lr_values[-1] / 2)
  print(lr_boundaries)
  print(lr_values)
  # boundaries = [100000, 110000]
  # values = [1.0, 0.5, 0.1]
  lr = learning_rate_decay.piecewise_constant(global_step, lr_boundaries,
                                              lr_values)
  # sess = tf.Session()
  # lr_val = sess.run(lr)
  # print("lr_val: " + lr_val)

  # if epochs_done % LR_DROP_EVER_NO_EPOCHS == 0:
  #   lr = lr / LR_DROP_SCALE
  tf.scalar_summary('learning_rate', lr)

  # Generate moving averages of all losses and associated summaries.
  loss_averages_op = _add_loss_summaries(total_loss)

  # Compute gradients.
  with tf.control_dependencies([loss_averages_op]):
    # opt = tf.train.GradientDescentOptimizer(lr)
    opt = tf.train.MomentumOptimizer(lr, MOMENTUM)
    grads = opt.compute_gradients(total_loss)

  # debug
  # print("grads")
  # print(grads)

  # # Add histograms for gradients.
  # for grad, var in grads:
  #   if grad is not None:
  #     g_norm = tf.global_norm([grad])
  #     g_norm = tf.Print(g_norm, [g_norm], "grad norm for " + var.op.name)
  #     tf.histogram_summary("g_norm_for_" + var.op.name, g_norm)

  # debug
  # grads_only = [g_pair[0] for g_pair in grads]
  # g_norm = tf.global_norm(grads_only)
  # g_norm = tf.Print(g_norm, [g_norm], "whole gradient norm")
  # # just to print it mostly
  # tf.histogram_summary("g_norm", g_norm)

  # Apply gradients.
  apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

  # Add histograms for trainable variables.
  for var in tf.trainable_variables():
    tf.histogram_summary(var.op.name, var)

  # Add histograms for gradients.
  for grad, var in grads:
    if grad is not None:
      tf.histogram_summary(var.op.name + '/gradients', grad)

  # Track the moving averages of all trainable variables.
  variable_averages = tf.train.ExponentialMovingAverage(
      MOVING_AVERAGE_DECAY, global_step)
  variables_averages_op = variable_averages.apply(tf.trainable_variables())

  with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
    train_op = tf.no_op(name='train')

  return train_op
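# Worked example (illustrative constants, not the real flags) of the halving
# schedule the loop above builds: with 25 epochs per drop and 390 batches per
# epoch, the learning rate halves every 9750 steps. range(1, 22) yields the 21
# values that piecewise_constant's 20 boundaries require.
INITIAL_LEARNING_RATE = 0.1    # assumption for illustration
LR_DROP_EVERY_NO_EPOCHS = 25   # assumption for illustration
num_batches_per_epoch = 390    # e.g. 50000 train examples / batch size 128

lr_boundaries = [int(drop_no * LR_DROP_EVERY_NO_EPOCHS * num_batches_per_epoch)
                 for drop_no in range(1, 21)]   # [9750, 19500, ..., 195000]
lr_values = [INITIAL_LEARNING_RATE / 2 ** (drop_no - 1)
             for drop_no in range(1, 22)]       # 21 values, one per interval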