def testTrain(self, distribution):
  with distribution.scope():
    mock_model = MiniModel()
    mock_model.call = function.defun(mock_model.call)

    def loss_fn(ctx):
      del ctx
      return mock_model(array_ops.ones([1, 10]))

    gradients_fn = backprop.implicit_grad(loss_fn)
    gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)
    grads_and_vars = distribution.extended.call_for_each_replica(
        gradients_fn, args=(None,))

    optimizer = gradient_descent.GradientDescentOptimizer(0.25)
    update_ops = optimizer._distributed_apply(  # pylint: disable=protected-access
        distribution, grads_and_vars)

    if not context.executing_eagerly():
      self.evaluate(variables.global_variables_initializer())
      self.evaluate(update_ops)

    updated_var_values = self.evaluate(mock_model.variables)
    # All variables start at 1.0 and get two updates of 0.25.
    self.assertAllEqual(0.5 * np.ones([10, 1]), updated_var_values[0])
    self.assertAllEqual([0.5], updated_var_values[1])
def _test_minimize_loss_eager(self, d):
  with d.scope():
    l = core.Dense(1, use_bias=False)

    def loss(x):
      # TODO(josh11b): What if this constant was instead a captured
      # value? Would it need to be a value that has been passed
      # through d.broadcast()?
      y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
      return y * y

    # TODO(isaprykin): Extract implicit_grad+get_filtered_grad_fn into a
    # common `implicit_grad` function and put it in DistributionStrategy.
    grad_fn = backprop.implicit_grad(loss)
    grad_fn = optimizer.get_filtered_grad_fn(grad_fn)

    def update(v, g):
      return v.assign_sub(0.2 * g)

    one = d.broadcast(constant_op.constant([[1.]]))

    def step():
      """Perform one optimization step."""
      # Run forward & backward to get gradients, variables list.
      g_v = d.extended.call_for_each_replica(grad_fn, args=(one,))
      # Update the variables using the gradients and the update() function.
      before_list = []
      after_list = []
      for g, v in g_v:
        fetched = d.extended.read_var(v)
        before_list.append(fetched)
        # control_dependencies irrelevant but harmless in eager execution
        with ops.control_dependencies([fetched]):
          g = d.extended.reduce_to(
              reduce_util.ReduceOp.SUM, g, destinations=v)
          with ops.control_dependencies(
              d.extended.update(v, update, args=(g,), group=False)):
            after_list.append(d.extended.read_var(v))
      return before_list, after_list

    for i in range(10):
      b, a = step()
      if i == 0:
        before, = b  # pylint: disable=unbalanced-tuple-unpacking
      after, = a  # pylint: disable=unbalanced-tuple-unpacking

    error_before = abs(before.numpy() - 1)
    error_after = abs(after.numpy() - 1)
    # Error should go down
    self.assertLess(error_after, error_before)
def _test_minimize_loss_eager(self, d):
  with d.scope():
    kernel = create_variable_like_keras_layer(
        name="kernel", shape=(1, 1), dtype=dtypes.float32)

    def loss(x):
      y = array_ops.reshape(
          gen_math_ops.mat_mul(x, kernel), []) - array_ops.identity(1.)
      return y * y

    # TODO(isaprykin): Extract implicit_grad+get_filtered_grad_fn into a
    # common `implicit_grad` function and put it in DistributionStrategy.
    grad_fn = backprop.implicit_grad(loss)
    grad_fn = optimizer.get_filtered_grad_fn(grad_fn)

    def update(v, g):
      return v.assign_sub(0.2 * g)

    one = array_ops.identity([[1.]])

    def step():
      """Perform one optimization step."""
      # Run forward & backward to get gradients, variables list.
      g_v = d.extended.call_for_each_replica(grad_fn, args=(one,))
      # Update the variables using the gradients and the update() function.
      before_list = []
      after_list = []
      for g, v in g_v:
        fetched = d.extended.read_var(v)
        before_list.append(fetched)
        # control_dependencies irrelevant but harmless in eager execution
        with ops.control_dependencies([fetched]):
          g = d.extended.reduce_to(
              reduce_util.ReduceOp.SUM, g, destinations=v)
          with ops.control_dependencies(
              d.extended.update(v, update, args=(g,), group=False)):
            after_list.append(d.extended.read_var(v))
      return before_list, after_list

    for i in range(10):
      b, a = step()
      if i == 0:
        before, = b  # pylint: disable=unbalanced-tuple-unpacking
      after, = a  # pylint: disable=unbalanced-tuple-unpacking

    error_before = abs(before.numpy() - 1)
    error_after = abs(after.numpy() - 1)
    # Error should go down
    self.assertLess(error_after, error_before)
def step_fn(ctx, *inputs):
  """Function to run one iteration with one input."""
  gradients_fn = backprop.implicit_grad(self._loss_fn)
  gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)

  grads_and_vars = self.distribution.call_for_each_replica(
      gradients_fn, args=(ctx,) + inputs)
  # If threads use layers, then we need to run the first step
  # sequentially, so that layers.build() is not executed in parallel.
  # Otherwise, multiple sets of mirrored variables are going to be
  # created.
  return self._optimizer._distributed_apply(  # pylint: disable=protected-access
      self.distribution, grads_and_vars)
def step_fn(ctx, inputs):
  """Function to run one iteration with one input."""
  gradients_fn = backprop.implicit_grad(self._loss_fn)
  gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)

  grads_and_vars = self.distribution.extended.call_for_each_replica(
      gradients_fn, args=(ctx, inputs))
  # If threads use layers, then we need to run the first step
  # sequentially, so that layers.build() is not executed in parallel.
  # Otherwise, multiple sets of mirrored variables are going to be
  # created.
  return self._optimizer._distributed_apply(  # pylint: disable=protected-access
      self.distribution, grads_and_vars)
def step(self, inputs):
  with self._distribution.scope():
    gradients_fn = backprop.implicit_grad(self._loss_fn)
    gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)

    grads_and_vars = self.distribution.call_for_each_tower(
        gradients_fn, inputs, run_concurrently=self._is_run_concurrently)
    # If threads use layers, then we need to run the first step sequentially,
    # so that layers.build() is not executed in parallel. Otherwise, multiple
    # sets of mirrored variables are going to be created.
    self._is_run_concurrently = True
    return self._optimizer._distributed_apply(  # pylint: disable=protected-access
        self.distribution, grads_and_vars)
def _test_minimize_loss_eager(self, d):
  with d.scope():
    l = core.Dense(1, use_bias=False)

    def loss(x):
      # TODO(josh11b): What if this constant was instead a captured
      # value? Would it need to be a value that has been passed
      # through d.broadcast()?
      y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
      return y * y

    # TODO(isaprykin): Extract implicit_grad+get_filtered_grad_fn into a
    # common `implicit_grad` function and put it in DistributionStrategy.
    grad_fn = backprop.implicit_grad(loss)
    grad_fn = optimizer.get_filtered_grad_fn(grad_fn)

    def update(v, g):
      return v.assign_sub(0.2 * g)

    one = d.broadcast(constant_op.constant([[1.]]))

    def step():
      """Perform one optimization step."""
      # Run forward & backward to get gradients, variables list.
      g_v = d.call_for_each_tower(grad_fn, one, run_concurrently=l.built)
      # Update the variables using the gradients and the update() function.
      before_list = []
      after_list = []
      for g, v in g_v:
        fetched = d.read_var(v)
        before_list.append(fetched)
        # control_dependencies irrelevant but harmless in eager execution
        with ops.control_dependencies([fetched]):
          g = d.reduce(
              variable_scope.VariableAggregation.SUM, g, destinations=v)
          with ops.control_dependencies(
              d.update(v, update, g, grouped=False)):
            after_list.append(d.read_var(v))
      return before_list, after_list

    for i in range(10):
      b, a = step()
      if i == 0:
        before, = b  # pylint: disable=unbalanced-tuple-unpacking
      after, = a  # pylint: disable=unbalanced-tuple-unpacking

    error_before = abs(before.numpy() - 1)
    error_after = abs(after.numpy() - 1)
    # Error should go down
    self.assertLess(error_after, error_before)
def _test_minimize_loss_eager(self, d):
  with d.scope():
    l = core.Dense(1, use_bias=False)

    def loss(x):
      y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
      return y * y

    # TODO(isaprykin): Extract implicit_grad+get_filtered_grad_fn into a
    # common `implicit_grad` function and put it in DistributionStrategy.
    grad_fn = backprop.implicit_grad(loss)
    grad_fn = optimizer.get_filtered_grad_fn(grad_fn)

    def update(v, g):
      return v.assign_sub(0.2 * g)

    one = constant_op.constant([[1.]])

    def step():
      """Perform one optimization step."""
      # Run forward & backward to get gradients, variables list.
      g_v = d.extended.call_for_each_replica(grad_fn, args=(one,))
      # Update the variables using the gradients and the update() function.
      before_list = []
      after_list = []
      for g, v in g_v:
        fetched = d.extended.read_var(v)
        before_list.append(fetched)
        # control_dependencies irrelevant but harmless in eager execution
        with ops.control_dependencies([fetched]):
          g = d.extended.reduce_to(
              reduce_util.ReduceOp.SUM, g, destinations=v)
          with ops.control_dependencies(
              d.extended.update(v, update, args=(g,), group=False)):
            after_list.append(d.extended.read_var(v))
      return before_list, after_list

    for i in range(10):
      b, a = step()
      if i == 0:
        before, = b  # pylint: disable=unbalanced-tuple-unpacking
      after, = a  # pylint: disable=unbalanced-tuple-unpacking

    error_before = abs(before.numpy() - 1)
    error_after = abs(after.numpy() - 1)
    # Error should go down
    self.assertLess(error_after, error_before)
def apply_gradients(self, loss, grads_and_vars, global_step=None, name=None):
  """Apply gradients to variables.

  This is the second part of `minimize()`. It returns an `Operation` that
  applies gradients.

  Args:
    grads_and_vars: List of (gradient, variable) pairs as returned by
      `compute_gradients()`.
    global_step: Optional `Variable` to increment by one after the
      variables have been updated.
    name: Optional name for the returned operation. Default to the
      name passed to the `Optimizer` constructor.

  Returns:
    An `Operation` that applies the specified gradients. If `global_step`
    was not None, that operation also increments `global_step`.

  Raises:
    TypeError: If `grads_and_vars` is malformed.
    ValueError: If none of the variables have gradients.
    RuntimeError: If you should use `_distributed_apply()` instead.
  """
  # This is a default implementation of apply_gradients() that can be shared
  # by most optimizers. It relies on the subclass implementing the following
  # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse().

  # Handle DistributionStrategy case.
  if distribution_strategy_context.get_cross_tower_context():
    raise RuntimeError("Use `_distributed_apply()` instead of "
                       "`apply_gradients()` in a cross-tower context.")
  # TODO(isaprykin): Get rid of `has_distribution_strategy()` check by
  # always calling _distributed_apply(), using the default distribution
  # as needed.
  if distribution_strategy_context.has_distribution_strategy():
    grads_and_vars = optimizer.get_filtered_grad_fn(lambda: grads_and_vars)()
    return distribution_strategy_context.get_tower_context().merge_call(
        self._distributed_apply, grads_and_vars, global_step, name)

  # No DistributionStrategy case.
  grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works.
  if not grads_and_vars:
    raise ValueError("No variables provided.")
  converted_grads_and_vars = []
  for grad, var in grads_and_vars:
    if grad is not None:
      try:
        # Convert the grad to Tensor or IndexedSlices if necessary.
        grad = ops.convert_to_tensor_or_indexed_slices(grad)
      except TypeError:
        raise TypeError("Gradient must be convertible to a Tensor "
                        "or IndexedSlices, or None: %s" % grad)
      if not isinstance(grad, (ops.Tensor, ops.IndexedSlices)):
        raise TypeError(
            "Gradient must be a Tensor, IndexedSlices, or None: %s" % grad)
    processor = _get_processor(var)
    converted_grads_and_vars.append((grad, var, processor))

  converted_grads_and_vars = tuple(converted_grads_and_vars)
  var_list = [var for grad, var, _ in converted_grads_and_vars
              if grad is not None]
  if not var_list:
    raise ValueError("No gradients provided for any variable: %s." %
                     ([str(var) for _, var, _ in converted_grads_and_vars],))
  with ops.init_scope():
    self._create_slots(var_list)
  update_ops = []
  with ops.name_scope(name, self._name) as name:
    self._prepare()
    for grad, var, processor in converted_grads_and_vars:
      if grad is None:
        continue
      # We colocate all ops created in _apply_dense or _apply_sparse
      # on the same device as the variable.
      # TODO(apassos): figure out how to get the variable name here.
      if (context.executing_eagerly() or
          isinstance(var, resource_variable_ops.ResourceVariable)
          and not var._in_graph_mode):
        scope_name = ""
      else:
        scope_name = var.op.name
      with ops.name_scope("update_" + scope_name), ops.colocate_with(var):
        update_ops.append(processor.update_op(self, loss, grad, global_step))
    if global_step is None:
      apply_updates = self._finish(update_ops, loss, name)
    else:
      with ops.control_dependencies(
          [self._finish(update_ops, loss, "update")]):
        with ops.colocate_with(global_step):
          if isinstance(global_step, resource_variable_ops.ResourceVariable):
            # TODO(apassos): the implicit read in assign_add is slow; consider
            # making it less so.
            apply_updates = resource_variable_ops.assign_add_variable_op(
                resource=global_step.handle,
                value=ops.convert_to_tensor(value=1, dtype=global_step.dtype),
                name=name)
          else:
            apply_updates = state_ops.assign_add(
                ref=global_step, value=1, name=name)

    if not context.executing_eagerly():
      if isinstance(apply_updates, ops.Tensor):
        apply_updates = apply_updates.op
      train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
      if apply_updates not in train_op:
        train_op.append(apply_updates)

    return apply_updates
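# A minimal usage sketch (not part of the original source) for the variant of
# apply_gradients() above, which, unlike the stock Optimizer API, takes the
# scalar `loss` tensor as its first argument and threads it through to the
# per-variable update ops and _finish(). The optimizer class and the
# `global_step` variable below are illustrative assumptions:
#
#   opt = gradient_descent.GradientDescentOptimizer(0.1)
#   grads_and_vars = opt.compute_gradients(loss)
#   train_op = opt.apply_gradients(loss, grads_and_vars,
#                                  global_step=global_step)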