def append_apply_gradients_ops(self, gradient_state, opt, grads, training_ops,
                               loss_scale_params):
  """Adds training ops for grads to 'training_ops'.

  Args:
    gradient_state: from previous call to apply_gradients_devices.
    opt: the underlying optimizer.
    grads: [(grad, var)] to apply.
    training_ops: list to which to add ops.
    loss_scale_params: parameters for loss scaling.
  """
  del gradient_state  # unused by this implementation

  def get_apply_gradients_ops_func():
    """Returns the apply_gradients op."""
    return [opt.apply_gradients(grads)]

  variable_mgr_util.append_gradients_with_loss_scale(
      training_ops, get_apply_gradients_ops_func, loss_scale_params,
      self.grad_has_inf_nan)
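
# Hedged call-site sketch: how a caller might drive the method above. The
# names here (`mgr`, `opt`, `loss`, `loss_scale_params`) are hypothetical
# placeholders, not from this repo; only the call shape mirrors the method.
def _example_build_train_op(mgr, opt, loss, loss_scale_params):
  training_ops = []
  grads = opt.compute_gradients(loss)  # [(gradient, variable)] pairs
  mgr.append_apply_gradients_ops(
      gradient_state=None,  # unused by this variable manager
      opt=opt,
      grads=grads,
      training_ops=training_ops,
      loss_scale_params=loss_scale_params)
  return tf.group(*training_ops)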
def testAppendGradientsWithLossScaleForNonChiefWorker(self):
  v = tf.Variable(0)
  training_ops = []
  get_apply_gradients_ops_func = lambda: [tf.assign(v, v + 1)]
  loss_scale_params = variable_mgr_util.AutoLossScaleParams(
      enable_auto_loss_scale=True,
      loss_scale=tf.Variable(4),
      loss_scale_normal_steps=tf.Variable(10),
      inc_loss_scale_every_n=10,
      is_chief=False)  # Non-chief
  variable_mgr_util.append_gradients_with_loss_scale(
      training_ops,
      get_apply_gradients_ops_func,
      loss_scale_params,
      grad_has_inf_nan=False)
  with self.test_session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(training_ops)
    self.assertEqual(sess.run(v), 1)
    self.assertEqual(sess.run(loss_scale_params.loss_scale), 4)
    self.assertEqual(sess.run(loss_scale_params.loss_scale_normal_steps), 10)
def testAppendGradientsWithLossScaleWithoutNan(self):
  v = tf.Variable(0)
  training_ops = []
  get_apply_gradients_ops_func = lambda: [tf.assign(v, v + 1)]
  loss_scale_params = variable_mgr_util.AutoLossScaleParams(
      enable_auto_loss_scale=True,
      loss_scale=tf.Variable(4, dtype=tf.float32),
      loss_scale_normal_steps=tf.Variable(10),
      inc_loss_scale_every_n=10,
      is_chief=True)
  variable_mgr_util.append_gradients_with_loss_scale(
      training_ops,
      get_apply_gradients_ops_func,
      loss_scale_params,
      grad_has_inf_nan=tf.constant(False))
  with self.test_session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(training_ops)
    self.assertEqual(sess.run(v), 1)
    self.assertEqual(sess.run(loss_scale_params.loss_scale), 8)
    self.assertEqual(sess.run(loss_scale_params.loss_scale_normal_steps), 0)
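
# The two tests above pin down the behavior this sketch re-creates: a
# non-chief worker only applies gradients and leaves the scaling state
# untouched, while the chief doubles the loss scale and resets the counter
# once inc_loss_scale_every_n clean steps accumulate. This is a minimal
# re-creation inferred from those tests, NOT the real
# variable_mgr_util.append_gradients_with_loss_scale; the overflow branch
# (halve the scale, skip the update) is an assumption the tests above do
# not exercise.
def _sketch_append_gradients_with_loss_scale(
    training_ops, get_apply_gradients_ops_func, loss_scale_params,
    grad_has_inf_nan):
  """Hypothetical sketch of the loss-scale bookkeeping, for illustration."""
  if not (loss_scale_params.enable_auto_loss_scale and
          loss_scale_params.is_chief):
    # Non-chief workers (and runs without auto loss scaling) just apply
    # the gradients.
    training_ops.extend(get_apply_gradients_ops_func())
    return

  loss_scale = loss_scale_params.loss_scale
  normal_steps = loss_scale_params.loss_scale_normal_steps
  inc_every_n = loss_scale_params.inc_loss_scale_every_n

  def on_clean_step():
    # Apply the gradients, then credit one more inf/NaN-free step; after
    # inc_every_n such steps, double the scale and reset the counter.
    with tf.control_dependencies(get_apply_gradients_ops_func()):
      return tf.cond(
          tf.greater_equal(normal_steps + 1, inc_every_n),
          lambda: tf.group(tf.assign(loss_scale, loss_scale * 2),
                           tf.assign(normal_steps, 0)),
          lambda: tf.group(tf.assign_add(normal_steps, 1)))

  def on_overflow():
    # Assumed recovery path: skip the weight update, halve the scale, and
    # restart the counter.
    return tf.group(tf.assign(loss_scale, loss_scale / 2),
                    tf.assign(normal_steps, 0))

  training_ops.append(tf.cond(grad_has_inf_nan, on_overflow, on_clean_step))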
def append_apply_gradients_ops(self, gradient_state, opt, grads, training_ops,
                               loss_scale_params):
  device_grads = gradient_state  # From 2nd result of preprocess_device_grads.

  def get_apply_gradients_ops_func():
    """Returns a list of ops for updating gradients."""
    apply_gradients_ops = []
    # For each variable, apply the combined gradients for this server on
    # the parameter server, and then wait for all other servers to do this.
    for i, (g, v) in enumerate(grads):
      apply_gradient_op = opt.apply_gradients([(g, v)])
      barrier = self.benchmark_cnn.add_sync_queues_and_barrier(
          'replicate_variable_%s' % i, [apply_gradient_op])
      with tf.control_dependencies([barrier]):
        with tf.device(self.benchmark_cnn.cpu_device):
          updated_value = v.read_value()
          for my_d in range(len(self.benchmark_cnn.devices)):
            apply_gradients_ops.append(
                device_grads[my_d][i][1].assign(updated_value))
    return apply_gradients_ops

  variable_mgr_util.append_gradients_with_loss_scale(
      training_ops, get_apply_gradients_ops_func, loss_scale_params,
      self.grad_has_inf_nan)
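
# Layout note, inferred from the indexing in the method above (not a
# documented contract): device_grads holds one inner list per device and one
# (gradient, variable) pair per model variable, in the same order on every
# device, e.g.:
#
#   device_grads = [
#       [(g0_dev0, v0_dev0), (g1_dev0, v1_dev0)],  # tower on device 0
#       [(g0_dev1, v0_dev1), (g1_dev1, v1_dev1)],  # tower on device 1
#   ]
#
# so device_grads[my_d][i][1] is device my_d's replica of variable i, which
# the assign above overwrites with the value just updated on the parameter
# server.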