def _grad_fn(ys, xs, args, func_graph): """Computes the gradient of `func_graph` in the current graph. This function builds the gradient graph of the corresponding forward-pass `func_graph` by differentiating `func_graph`'s outputs w.r.t. its inputs. Args: ys: A `Tensor` or list of tensors to be differentiated. xs: A `Tensor` or list of tensors to be used for differentiation. args: The input arguments. args[0] - Loop counter args[1] - Total number of iterations. args[2:] - Incoming gradients for `ys`. func_graph: function.FuncGraph. The corresponding forward-pass function. Returns: The output gradient Tensors. """ grad_ys = args[2:] # Build the gradient graph. Note that this builds the gradient computation of # func_graph in the current graph, which requires capturing tensors from # func_graph. The captured func_graph tensors are resolved to external tensors # after the forward While op has been rewritten in _resolve_grad_captures. # TODO(srbs): Mark GradientsHelper as public? grad_outs = gradients_util._GradientsHelper( ys, xs, grad_ys=grad_ys, src_graph=func_graph, unconnected_gradients="zero") # TODO(b/118712257): Handle the case when grad_outs has None's e.g. when there # is a tf.StopGradient in the loop body. assert all(g is not None for g in grad_outs) counter = args[0] total_iters = args[1] return [counter + 1, total_iters] + grad_outs
def _grad_fn(func_graph, grads): """The gradient function for each conditional branch. This function builds the gradient graph of the corresponding forward-pass conditional branch in `func_graph`. This is done by differentiating func_graph's outputs w.r.t. its inputs. Args: func_graph: FuncGraph. The corresponding forward-pass function. grads: The list of input gradient Tensors. Returns: The output gradient Tensors. """ # Filter out untrainable function outputs. # NOTE(skyewm): If we don't do this, the untrainable tensors can sometimes # cause _GradientsHelper to raise an exception (e.g. the implementation # doesn't expect 'ys' to contain boolean tensors). assert len(func_graph.outputs) == len(grads) ys = [] grad_ys = [] for y, grad_y in zip(func_graph.outputs, grads): if not backprop_util.IsTrainable(y): continue ys.append(y) grad_ys.append(grad_y) # Build the gradient graph. Note that this builds the gradient computation of # func_graph in the current graph, which requires capturing tensors from # func_graph. The captured func_graph tensors are resolved to external tensors # in _resolve_grad_inputs. result = gradients_util._GradientsHelper(ys, func_graph.inputs, grad_ys=grad_ys, src_graph=func_graph) # Functions can't return None; replace Nones with zero tensors. # TODO(b/80444525): don't return anything here and make _IfGrad return None if # both branches have zero gradient. for i in range(len(result)): if result[i] is None: if func_graph.inputs[i].dtype == dtypes.resource: result[i] = array_ops.zeros( gen_resource_variable_ops.variable_shape( func_graph.inputs[i]), dtype=default_gradient.get_zeros_dtype( func_graph.inputs[i])) else: result[i] = array_ops.zeros_like(func_graph.inputs[i]) return result
def _grad_fn(func_graph, grads): """The gradient function for each conditional branch. This function builds the gradient graph of the corresponding forward-pass conditional branch in `func_graph`. This is done by differentiating func_graph's outputs w.r.t. its inputs. Args: func_graph: FuncGraph. The corresponding forward-pass function. grads: The list of input gradient Tensors. Returns: The output gradient Tensors. """ # Filter out untrainable function outputs. # NOTE(skyewm): If we don't do this, the untrainable tensors can sometimes # cause _GradientsHelper to raise an exception (e.g. the implementation # doesn't expect 'ys' to contain boolean tensors). assert len(func_graph.outputs) == len(grads) ys = [] grad_ys = [] for y, grad_y in zip(func_graph.outputs, grads): if not gradients_util.IsTrainable(y): continue ys.append(y) grad_ys.append(grad_y) # Build the gradient graph. Note that this builds the gradient computation of # func_graph in the current graph, which requires capturing tensors from # func_graph. The captured func_graph tensors are resolved to external tensors # in _resolve_grad_inputs. result = gradients_util._GradientsHelper( ys, func_graph.inputs, grad_ys=grad_ys, src_graph=func_graph) # Functions can't return None; replace Nones with zero tensors. # TODO(b/80444525): don't return anything here and make _IfGrad return None if # both branches have zero gradient. for i in range(len(result)): if result[i] is None: if func_graph.inputs[i].dtype == dtypes.resource: result[i] = array_ops.zeros( gen_resource_variable_ops.variable_shape(func_graph.inputs[i])) else: result[i] = array_ops.zeros_like(func_graph.inputs[i]) return result
def _grad_fn(ys, xs, args, func_graph): """Computes the gradient of `func_graph` in the current graph. This function builds the gradient graph of the corresponding forward-pass `func_graph` by differentiating `func_graph`'s outputs w.r.t. its inputs. Args: ys: A `Tensor` or list of tensors to be differentiated. xs: A `Tensor` or list of tensors to be used for differentiation. args: The input arguments. args[0] - Loop counter args[1] - Total number of iterations. args[2] - maximum_iterations. args[3:] - Incoming gradients for `ys`. func_graph: function.FuncGraph. The corresponding forward-pass function. Returns: The output gradient Tensors. """ grad_ys = args[3:] # Build the gradient graph. Note that this builds the gradient computation of # func_graph in the current graph, which requires capturing tensors from # func_graph. The captured func_graph tensors are resolved to external tensors # after the forward While op has been rewritten in _resolve_grad_captures. # TODO(srbs): Mark GradientsHelper as public? grad_outs = gradients_util._GradientsHelper(ys, xs, grad_ys=grad_ys, src_graph=func_graph, unconnected_gradients="zero") # TODO(b/118712257): Handle the case when grad_outs has None's e.g. when there # is a tf.StopGradient in the loop body. assert all(g is not None for g in grad_outs) counter = args[0] maximum_iterations = args[1] total_iters = args[2] return [counter + 1, maximum_iterations, total_iters] + grad_outs
def _grad_fn(func_graph, grads): """The gradient function for each conditional branch. This function builds the gradient graph of the corresponding forward-pass conditional branch in `func_graph`. This is done by differentiating func_graph's outputs w.r.t. its inputs. Args: func_graph: FuncGraph. The corresponding forward-pass function. grads: The list of input gradient Tensors. Returns: The output gradient Tensors. """ # Filter out untrainable function outputs. # NOTE(skyewm): If we don't do this, the untrainable tensors can sometimes # cause _GradientsHelper to raise an exception (e.g. the implementation # doesn't expect 'ys' to contain boolean tensors). assert len(func_graph.outputs) == len(grads) ys = [] grad_ys = [] for y, grad_y in zip(func_graph.outputs, grads): if not backprop_util.IsTrainable(y): continue ys.append(y) grad_ys.append(grad_y) # Build the gradient graph. Note that this builds the gradient computation of # func_graph in the current graph, which requires capturing tensors from # func_graph. The captured func_graph tensors are resolved to external tensors # in _resolve_grad_inputs. result = gradients_util._GradientsHelper(ys, func_graph.inputs, grad_ys=grad_ys, src_graph=func_graph) return result
def gradients(ys, xs, grad_ys=None, name="gradients", colocate_gradients_with_ops=False, gate_gradients=False, aggregation_method=None, stop_gradients=None, unconnected_gradients=UnconnectedGradients.NONE): """Constructs symbolic derivatives of sum of `ys` w.r.t. x in `xs`. `ys` and `xs` are each a `Tensor` or a list of tensors. `grad_ys` is a list of `Tensor`, holding the gradients received by the `ys`. The list must be the same length as `ys`. `gradients()` adds ops to the graph to output the derivatives of `ys` with respect to `xs`. It returns a list of `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)` for y in `ys`. `grad_ys` is a list of tensors of the same length as `ys` that holds the initial gradients for each y in `ys`. When `grad_ys` is None, we fill in a tensor of '1's of the shape of y for each y in `ys`. A user can provide their own initial `grad_ys` to compute the derivatives using a different initial gradient for each y (e.g., if one wanted to weight the gradient differently for each value in each y). `stop_gradients` is a `Tensor` or a list of tensors to be considered constant with respect to all `xs`. These tensors will not be backpropagated through, as though they had been explicitly disconnected using `stop_gradient`. Among other things, this allows computation of partial derivatives as opposed to total derivatives. For example: ```python a = tf.constant(0.) b = 2 * a g = tf.gradients(a + b, [a, b], stop_gradients=[a, b]) ``` Here the partial derivatives `g` evaluate to `[1.0, 1.0]`, compared to the total derivatives `tf.gradients(a + b, [a, b])`, which take into account the influence of `a` on `b` and evaluate to `[3.0, 1.0]`. Note that the above is equivalent to: ```python a = tf.stop_gradient(tf.constant(0.)) b = tf.stop_gradient(2 * a) g = tf.gradients(a + b, [a, b]) ``` `stop_gradients` provides a way of stopping gradient after the graph has already been constructed, as compared to `tf.stop_gradient` which is used during graph construction. When the two approaches are combined, backpropagation stops at both `tf.stop_gradient` nodes and nodes in `stop_gradients`, whichever is encountered first. All integer tensors are considered constant with respect to all `xs`, as if they were included in `stop_gradients`. `unconnected_gradients` determines the value returned for each x in xs if it is unconnected in the graph to ys. By default this is None to safeguard against errors. MAthematically these gradients are zero which can be requested using the `'zero'` option. `tf.UnconnectedGradients` provides the following options and behaviors: ```python a = tf.ones([1, 2]) b = tf.ones([3, 1]) g1 = tf.gradients([b], [a], unnconnected_gradients='none') sess.run(g1) # [None] g2 = tf.gradients([b], [a], unconnected_gradients='zero') sess.run(g2) # [array([[0., 0.]], dtype=float32)] ``` Args: ys: A `Tensor` or list of tensors to be differentiated. xs: A `Tensor` or list of tensors to be used for differentiation. grad_ys: Optional. A `Tensor` or list of tensors the same size as `ys` and holding the gradients computed for each y in `ys`. name: Optional name to use for grouping all the gradient ops together. defaults to 'gradients'. colocate_gradients_with_ops: If True, try colocating gradients with the corresponding op. gate_gradients: If True, add a tuple around the gradients returned for an operations. This avoids some race conditions. aggregation_method: Specifies the method used to combine gradient terms. Accepted values are constants defined in the class `AggregationMethod`. stop_gradients: Optional. A `Tensor` or list of tensors not to differentiate through. unconnected_gradients: Optional. Specifies the gradient value returned when the given input tensors are unconnected. Accepted values are constants defined in the class `tf.UnconnectedGradients` and the default value is `none`. Returns: A list of `sum(dy/dx)` for each x in `xs`. Raises: LookupError: if one of the operations between `x` and `y` does not have a registered gradient function. ValueError: if the arguments are invalid. RuntimeError: if called in Eager mode. """ # Creating the gradient graph for control flow mutates Operations. # _mutation_lock ensures a Session.run call cannot occur between creating and # mutating new ops. # pylint: disable=protected-access with ops.get_default_graph()._mutation_lock(): return gradients_util._GradientsHelper( ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients)
def gradients_v2( ys, # pylint: disable=invalid-name xs, grad_ys=None, name="gradients", gate_gradients=False, aggregation_method=None, stop_gradients=None, unconnected_gradients=UnconnectedGradients.NONE): """Constructs symbolic derivatives of sum of `ys` w.r.t. x in `xs`. `tf.gradients` is only valid in a graph context. In particular, it is valid in the context of a `tf.function` wrapper, where code is executing as a graph. `ys` and `xs` are each a `Tensor` or a list of tensors. `grad_ys` is a list of `Tensor`, holding the gradients received by the `ys`. The list must be the same length as `ys`. `gradients()` adds ops to the graph to output the derivatives of `ys` with respect to `xs`. It returns a list of `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)` for y in `ys` and for x in `xs`. `grad_ys` is a list of tensors of the same length as `ys` that holds the initial gradients for each y in `ys`. When `grad_ys` is None, we fill in a tensor of '1's of the shape of y for each y in `ys`. A user can provide their own initial `grad_ys` to compute the derivatives using a different initial gradient for each y (e.g., if one wanted to weight the gradient differently for each value in each y). `stop_gradients` is a `Tensor` or a list of tensors to be considered constant with respect to all `xs`. These tensors will not be backpropagated through, as though they had been explicitly disconnected using `stop_gradient`. Among other things, this allows computation of partial derivatives as opposed to total derivatives. For example: >>> @tf.function ... def example(): ... a = tf.constant(0.) ... b = 2 * a ... return tf.gradients(a + b, [a, b], stop_gradients=[a, b]) >>> example() [<tf.Tensor: shape=(), dtype=float32, numpy=1.0>, <tf.Tensor: shape=(), dtype=float32, numpy=1.0>] Here the partial derivatives `g` evaluate to `[1.0, 1.0]`, compared to the total derivatives `tf.gradients(a + b, [a, b])`, which take into account the influence of `a` on `b` and evaluate to `[3.0, 1.0]`. Note that the above is equivalent to: >>> @tf.function ... def example(): ... a = tf.stop_gradient(tf.constant(0.)) ... b = tf.stop_gradient(2 * a) ... return tf.gradients(a + b, [a, b]) >>> example() [<tf.Tensor: shape=(), dtype=float32, numpy=1.0>, <tf.Tensor: shape=(), dtype=float32, numpy=1.0>] `stop_gradients` provides a way of stopping gradient after the graph has already been constructed, as compared to `tf.stop_gradient` which is used during graph construction. When the two approaches are combined, backpropagation stops at both `tf.stop_gradient` nodes and nodes in `stop_gradients`, whichever is encountered first. All integer tensors are considered constant with respect to all `xs`, as if they were included in `stop_gradients`. `unconnected_gradients` determines the value returned for each x in xs if it is unconnected in the graph to ys. By default this is None to safeguard against errors. Mathematically these gradients are zero which can be requested using the `'zero'` option. `tf.UnconnectedGradients` provides the following options and behaviors: >>> @tf.function ... def example(use_zero): ... a = tf.ones([1, 2]) ... b = tf.ones([3, 1]) ... if use_zero: ... return tf.gradients([b], [a], unconnected_gradients='zero') ... else: ... return tf.gradients([b], [a], unconnected_gradients='none') >>> example(False) [None] >>> example(True) [<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0., 0.]], ...)>] Let us take one practical example which comes during the back propogation phase. This function is used to evaluate the derivatives of the cost function with respect to Weights `Ws` and Biases `bs`. Below sample implementation provides the exaplantion of what it is actually used for : >>> @tf.function ... def example(): ... Ws = tf.constant(0.) ... bs = 2 * Ws ... cost = Ws + bs # This is just an example. Please ignore the formulas. ... g = tf.gradients(cost, [Ws, bs]) ... dCost_dW, dCost_db = g ... return dCost_dW, dCost_db >>> example() (<tf.Tensor: shape=(), dtype=float32, numpy=3.0>, <tf.Tensor: shape=(), dtype=float32, numpy=1.0>) Args: ys: A `Tensor` or list of tensors to be differentiated. xs: A `Tensor` or list of tensors to be used for differentiation. grad_ys: Optional. A `Tensor` or list of tensors the same size as `ys` and holding the gradients computed for each y in `ys`. name: Optional name to use for grouping all the gradient ops together. defaults to 'gradients'. gate_gradients: If True, add a tuple around the gradients returned for an operations. This avoids some race conditions. aggregation_method: Specifies the method used to combine gradient terms. Accepted values are constants defined in the class `AggregationMethod`. stop_gradients: Optional. A `Tensor` or list of tensors not to differentiate through. unconnected_gradients: Optional. Specifies the gradient value returned when the given input tensors are unconnected. Accepted values are constants defined in the class `tf.UnconnectedGradients` and the default value is `none`. Returns: A list of `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)` for y in `ys` and for x in `xs`. Raises: LookupError: if one of the operations between `x` and `y` does not have a registered gradient function. ValueError: if the arguments are invalid. RuntimeError: if called in Eager mode. """ # Creating the gradient graph for control flow mutates Operations. # _mutation_lock ensures a Session.run call cannot occur between creating and # mutating new ops. # pylint: disable=protected-access with ops.get_default_graph()._mutation_lock(): return gradients_util._GradientsHelper(ys, xs, grad_ys, name, True, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients)
def gradients(ys, xs, grad_ys=None, name="gradients", colocate_gradients_with_ops=False, gate_gradients=False, aggregation_method=None, stop_gradients=None, unconnected_gradients=UnconnectedGradients.NONE): """Constructs symbolic derivatives of sum of `ys` w.r.t. x in `xs`. `ys` and `xs` are each a `Tensor` or a list of tensors. `grad_ys` is a list of `Tensor`, holding the gradients received by the `ys`. The list must be the same length as `ys`. `gradients()` adds ops to the graph to output the derivatives of `ys` with respect to `xs`. It returns a list of `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)` for y in `ys`. `grad_ys` is a list of tensors of the same length as `ys` that holds the initial gradients for each y in `ys`. When `grad_ys` is None, we fill in a tensor of '1's of the shape of y for each y in `ys`. A user can provide their own initial `grad_ys` to compute the derivatives using a different initial gradient for each y (e.g., if one wanted to weight the gradient differently for each value in each y). `stop_gradients` is a `Tensor` or a list of tensors to be considered constant with respect to all `xs`. These tensors will not be backpropagated through, as though they had been explicitly disconnected using `stop_gradient`. Among other things, this allows computation of partial derivatives as opposed to total derivatives. For example: ```python a = tf.constant(0.) b = 2 * a g = tf.gradients(a + b, [a, b], stop_gradients=[a, b]) ``` Here the partial derivatives `g` evaluate to `[1.0, 1.0]`, compared to the total derivatives `tf.gradients(a + b, [a, b])`, which take into account the influence of `a` on `b` and evaluate to `[3.0, 1.0]`. Note that the above is equivalent to: ```python a = tf.stop_gradient(tf.constant(0.)) b = tf.stop_gradient(2 * a) g = tf.gradients(a + b, [a, b]) ``` `stop_gradients` provides a way of stopping gradient after the graph has already been constructed, as compared to `tf.stop_gradient` which is used during graph construction. When the two approaches are combined, backpropagation stops at both `tf.stop_gradient` nodes and nodes in `stop_gradients`, whichever is encountered first. All integer tensors are considered constant with respect to all `xs`, as if they were included in `stop_gradients`. `unconnected_gradients` determines the value returned for each x in xs if it is unconnected in the graph to ys. By default this is None to safeguard against errors. MAthematically these gradients are zero which can be requested using the `'zero'` option. `tf.UnconnectedGradients` provides the following options and behaviors: ```python a = tf.ones([1, 2]) b = tf.ones([3, 1]) g1 = tf.gradients([b], [a], unnconnected_gradients='none') sess.run(g1) # [None] g2 = tf.gradients([b], [a], unconnected_gradients='zero') sess.run(g2) # [array([[0., 0.]], dtype=float32)] ``` Args: ys: A `Tensor` or list of tensors to be differentiated. xs: A `Tensor` or list of tensors to be used for differentiation. grad_ys: Optional. A `Tensor` or list of tensors the same size as `ys` and holding the gradients computed for each y in `ys`. name: Optional name to use for grouping all the gradient ops together. defaults to 'gradients'. colocate_gradients_with_ops: If True, try colocating gradients with the corresponding op. gate_gradients: If True, add a tuple around the gradients returned for an operations. This avoids some race conditions. aggregation_method: Specifies the method used to combine gradient terms. Accepted values are constants defined in the class `AggregationMethod`. stop_gradients: Optional. A `Tensor` or list of tensors not to differentiate through. unconnected_gradients: Optional. Specifies the gradient value returned when the given input tensors are unconnected. Accepted values are constants defined in the class `tf.UnconnectedGradients` and the default value is `none`. Returns: A list of `sum(dy/dx)` for each x in `xs`. Raises: LookupError: if one of the operations between `x` and `y` does not have a registered gradient function. ValueError: if the arguments are invalid. RuntimeError: if called in Eager mode. """ # Creating the gradient graph for control flow mutates Operations. # _mutation_lock ensures a Session.run call cannot occur between creating and # mutating new ops. # pylint: disable=protected-access with ops.get_default_graph()._mutation_lock(): return gradients_util._GradientsHelper( ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients)