def testGradientFunction(self):
    """py_func ops (stateful and stateless) must have no gradient function."""
    # Input to tf.py_func is necessary, otherwise get_gradient_function()
    # returns None per default.
    a = constant_op.constant(0)
    x, = script_ops.py_func(lambda a: 0, [a], [dtypes.int64])
    y, = script_ops.py_func(lambda a: 0, [a], [dtypes.int64], stateful=False)
    # assertIsNone is the idiomatic identity check (assertEqual(None, ...) is
    # a weaker equality check).
    self.assertIsNone(ops.get_gradient_function(x.op))
    self.assertIsNone(ops.get_gradient_function(y.op))
def testGradientFunction(self):
    """tf.py_func outputs must report no registered gradient function."""
    # Input to tf.py_func is necessary, otherwise get_gradient_function()
    # returns None per default.
    a = tf.constant(0)
    x, = tf.py_func(lambda a: 0, [a], [tf.int64])
    y, = tf.py_func(lambda a: 0, [a], [tf.int64], stateful=False)
    # Prefer assertIsNone over assertEqual(None, ...) — it is the canonical
    # unittest spelling for a None check.
    self.assertIsNone(ops.get_gradient_function(x.op))
    self.assertIsNone(ops.get_gradient_function(y.op))
def testGradientFunction(self):
    """Stateful and stateless py_func ops both lack a gradient function."""
    # Input to tf.compat.v1.py_func is necessary,
    # otherwise get_gradient_function() returns None per default.
    a = constant_op.constant(0)
    x, = script_ops.py_func(lambda a: 0, [a], [dtypes.int64])
    y, = script_ops.py_func(lambda a: 0, [a], [dtypes.int64], stateful=False)
    # assertIsNone replaces the non-idiomatic assertEqual(None, ...).
    self.assertIsNone(ops.get_gradient_function(x.op))
    self.assertIsNone(ops.get_gradient_function(y.op))
def get_grad_results(max_val, pointwise_op, in_size):
    """
    Build and run a TensorFlow graph computing the backprop of a pointwise op.

    An input tensor with ``in_size`` elements is filled with the values
    -n, -n+1, ..., 0, 1, ..., n-1, n and fed through ``pointwise_op``.
    A tensor holding the same values acts as the errors to back-propagate.
    Returns the computed gradient values as a numpy array.
    """
    with tf.Graph().as_default():
        # Keep the generated value range balanced for even/odd sizes.
        min_val = -max_val if in_size % 2 == 0 else -max_val - 1
        data = helpers.get_signed_tensor_data(in_size,
                                              max_val=max_val,
                                              min_val=min_val)
        inp = tf.constant(data, dtype=np.float64)
        pointwise_op(inp, name='pointwise')
        fwd_op = tf.get_default_graph().get_operation_by_name('pointwise')
        grad_fn = get_gradient_function(fwd_op)
        # The pointwise op preserves size, so the error tensor matches input.
        err_data = helpers.get_signed_tensor_data(in_size,
                                                  max_val=max_val,
                                                  min_val=min_val)
        errors = tf.constant(err_data, dtype=np.float64)
        backprop = grad_fn(fwd_op, errors)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.graph.finalize()
            return sess.run(backprop)
def find_non_differentiable(inputs, outputs):
    """
    Searches through a TensorFlow graph to find non-differentiable elements
    between ``inputs`` and ``outputs`` (elements that would prevent us from
    computing ``d_outputs / d_inputs``.

    Parameters
    ----------
    inputs : list of ``tf.Tensor``
        Input tensors
    outputs : list of ``tf.Tensor``
        Output tensors
    """
    for out in outputs:
        # Inputs themselves terminate the search.
        if out in inputs:
            continue
        try:
            grad = get_gradient_function(out.op)
            if grad is None and len(out.op.inputs) > 0:
                # note: technically we're not sure that this op is
                # on the path to inputs. we could wait and propagate this
                # until we find inputs, but that can take a long time for
                # large graphs. it seems more useful to fail quickly, and
                # risk some false positives
                raise LookupError
            find_non_differentiable(inputs, out.op.inputs)
        except LookupError:
            raise SimulationError(
                "Graph contains non-differentiable elements: %s" % out.op)
def testNonExistentOverride(self):
    """Looking up the gradient of an op mapped to an unknown override raises."""
    g = ops.Graph()
    x = an_op(g)
    with g.gradient_override_map({"copy": "unknown_override"}):
        y = copy_op(x)
    # assertRaisesRegex is the non-deprecated spelling of assertRaisesRegexp
    # (deprecated since Python 3.2); the unused `fn` binding is dropped.
    with self.assertRaisesRegex(LookupError, "unknown_override"):
        ops.get_gradient_function(y.op)
def testNonExistentOverride(self):
    """An override map entry naming an unregistered gradient yields LookupError."""
    g = ops.Graph()
    x = an_op(g)
    with g.gradient_override_map({"copy": "unknown_override"}):
        y = copy_op(x)
    # assertRaisesRegexp is deprecated; assertRaisesRegex is the current API.
    # The lookup's return value was never used, so it is not bound.
    with self.assertRaisesRegex(LookupError, "unknown_override"):
        ops.get_gradient_function(y.op)
def testOverrideGradients(self):
    """The gradient override map should swap in the override gradient fn."""
    graph = ops.Graph()
    source = an_op(graph)
    with graph.gradient_override_map({"copy": "copy_override"}):
        result = copy_op(source)
    self.assertEqual(_CopyOverrideGrad, ops.get_gradient_function(result.op))
def go():
    """Compare dense softmax, the -10 masking trick, and sparse softmax,
    densifying the sparse result two ways (sparse_to_dense vs scatter_nd)."""
    dense = tf.Variable([[0, 0, 10, 1, 0, 0], [0, 0, -2, 3, 0, 0]],
                        dtype=tf.float32)
    sm1 = tf.nn.softmax(dense)
    # Replace zeros with a large negative value so softmax effectively
    # ignores them.
    denseReplacing0WithNeg10 = tf.where(
        dense > 0.0, dense, tf.ones(tf.shape(dense), tf.float32) * (-10.0))
    sm2 = tf.nn.softmax(denseReplacing0WithNeg10)
    nz_indices = tf.where(tf.not_equal(dense, tf.constant(0, dtype=tf.float32)))
    nz_values = tf.gather_nd(dense, nz_indices)
    sparse = tf.SparseTensor(nz_indices, nz_values, dense.get_shape())
    sm3 = tf.sparse_softmax(sparse)
    dm3a = tf.sparse_to_dense(sm3.indices, sm3.get_shape(), sm3.values)
    dm3b = tf.scatter_nd(sm3.indices, sm3.values, dense.get_shape())
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    from tensorflow.python.framework import ops
    for v in nz_indices, nz_values, sparse, sm3, dm3a, dm3b:
        print('gradient of op', v, ops.get_gradient_function(v.op))
    print('dense sm - direct', session.run(sm1))
    print('dense sm - with -10 trick', session.run(sm2))
    print('sparse sm', session.run(sm3))
    print('densified sparse sm - old', session.run(dm3a))
    # BUG FIX: the "new" line previously re-evaluated dm3a; it must run dm3b.
    print('densified sparse sm - new', session.run(dm3b))
def find_non_differentiable(inputs, outputs):
    """Searches through a Tensorflow graph to find non-differentiable elements
    between ``inputs`` and ``outputs`` (elements that would prevent us from
    computing ``d_outputs / d_inputs``.

    Parameters
    ----------
    inputs : list of ``tf.Tensor``
        input tensors
    outputs : list of ``tf.Tensor``
        output tensors
    """
    for output in outputs:
        if output in inputs:
            # reached one of the target inputs; nothing to check here
            continue
        op = output.op
        try:
            if get_gradient_function(op) is None and len(op.inputs) > 0:
                # note: technically we're not sure that this op is
                # on the path to inputs. we could wait and propagate this
                # until we find inputs, but that can take a long time for
                # large graphs. it seems more useful to fail quickly, and
                # risk some false positives
                raise LookupError
            find_non_differentiable(inputs, op.inputs)
        except LookupError:
            raise SimulationError("Graph contains non-differentiable "
                                  "elements: %s" % op)
def pyfunc_test():
    """Train a tiny linear model whose middle node is a custom py_func with
    a registered gradient (addone / addone_grad), then run one prediction."""
    # placeholders for the training data
    x_data = tf.placeholder(dtype=tf.float32, shape=[None])
    y_data = tf.placeholder(dtype=tf.float32, shape=[None])
    w = tf.Variable(tf.constant([0.5]))
    b = tf.Variable(tf.zeros([1]))
    y1 = tf.multiply(w, x_data, name='y1')
    # custom op wrapped with its own gradient
    y2 = py_func(addone, [y1], [tf.float32], grad=addone_grad)[0]
    y = tf.add(y2, b)
    loss = tf.reduce_mean(tf.square(y - y_data))
    train = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
    print("Pyfunc grad", ops.get_gradient_function(y2.op))
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for step in range(10):
            ran = np.ones((115)).astype(np.float32)
            ans = ran * 1.5 + 3
            tt, yy, yy1 = sess.run([train, y1, y2],
                                   feed_dict={x_data: ran, y_data: ans})
            if step % 1 == 0:
                print('step {}'.format(step))
                print('{}, {}'.format(w.eval(), b.eval()))
        test = sess.run(y, feed_dict={x_data: [1]})
        print('test = {}'.format(test))
def testOverrideGradients(self):
    """Inside the override map, the op resolves to _CopyOverrideGrad."""
    g = ops.Graph()
    with g.gradient_override_map({"copy": "copy_override"}):
        y = copy_op(an_op(g))
    fn = ops.get_gradient_function(y.op)
    self.assertEqual(_CopyOverrideGrad, fn)
def _Gradient(tensors, devices):
    """Build gradients of an nccl reduce by calling each reduce op's
    registered gradient function with the matching per-device loss tensor.
    Returns (grad_tensors, []) — the empty list mirrors the no-op side list."""
    reduced, _ = nccl_reduce(tensors, devices)
    losses = _DeviceTensors(tensors, devices)
    grad_tensors = []
    for reduced_tensor, loss in zip(reduced, losses):
        op = reduced_tensor.op
        grad_tensors.append(ops.get_gradient_function(op)(op, loss))
    return grad_tensors, []
def _Gradient(tensors, devices):
    """Return per-device gradients of an nccl reduce plus an empty op list."""
    reduced_tensors, _ = nccl_reduce(tensors, devices)
    loss_tensors = _DeviceTensors(tensors, devices)
    grad_tensors = [
        ops.get_gradient_function(rt.op)(rt.op, loss)
        for rt, loss in zip(reduced_tensors, loss_tensors)
    ]
    return grad_tensors, []
def _lstm_grad_op(session, verbose=True):
    """
    :param tf.Session session:
    :return: grad function
    """
    # Locate the forward LSTM op in the session's graph.
    fwd_op = find_op_by_type(session=session, type_name="LstmGenericBase")
    assert fwd_op is not None
    if verbose:
        print("op:", fwd_op)
    from tensorflow.python.framework import ops
    grad_func = ops.get_gradient_function(fwd_op)
    if verbose:
        print("grad_func:", grad_func)
    # The gradient function carries the underlying grad op as an attribute.
    result = grad_func.grad_op
    if verbose:
        print("grad_op:", result, result.__doc__)
    return result
def show_parent_of_tensor(tensor, prefix=""):
    """Recursively print the producing op of `tensor` and its ancestors,
    annotating each op with whether a gradient function is registered.

    Relies on the module-level `active_session` (graph to scan) and the
    module-level list `print_ops_printed` (cycle/duplicate guard).
    Returns the producing op, or None if no op outputs `tensor`.
    """
    global print_ops_printed
    for opr in active_session.graph.get_operations():
        if tensor in opr.outputs:
            # if the parent operating is Indentity then leapfrog this to it's parent
            if opr.type == "Identity":
                show_parent_of_tensor(opr.inputs[0], prefix)
            else:
                # not an Identity op so recurse normally
                # if this op has inputs then find if this operation has a gradients function
                grad_string = ''
                if len(opr.inputs) > 0:
                    grad_string = " \033[92mGrads\033[0m"
                    # NOTE(review): nesting of this try under the inputs check
                    # reconstructed from collapsed source — confirm intent.
                    try:
                        if get_gradient_function(opr) is None:
                            grad_string = ' \033[91mNo Grads\033[0m'
                    except LookupError:
                        grad_string = ' \033[91mlookup Error\033[0m'
                # ANSI colors: magenta op type, blue op name, green/red grads.
                print("%s [\033[35m%s\033[0m \033[34m\"%s\"\033[0m] %s" %
                      (prefix, opr.type, opr.node_def.name, grad_string))
                # test if this operation has already been printed as prior to a previous op
                if opr in print_ops_printed:
                    print("%s . . ." % prefix)
                    return opr
                print_ops_printed += [opr]
                for i in range(len(opr.inputs)):
                    parent_tensor = opr.inputs[i]
                    print(
                        "%s |<\"%s\" with size %s and type %s>" %
                        (prefix, parent_tensor.name, str(parent_tensor.shape),
                         str(parent_tensor.dtype.base_dtype)))
                    # last input gets a plain indent; earlier inputs keep a
                    # vertical bar so siblings line up in the tree rendering
                    if i == len(opr.inputs) - 1:
                        show_parent_of_tensor(parent_tensor, (prefix + " "))
                    else:
                        show_parent_of_tensor(parent_tensor, (prefix + " |"))
                return opr
def get_grad_results(max_val, pool_op, input_shape, window_shape, stride_shape,
                     padding):
    """
    Build and run a TensorFlow graph computing pooling backprop values.

    Creates an input tensor of the requested shape filled with the values
    1, 2, 3, ... and pools it, then uses a tensor holding the same values
    as the errors to back-propagate. Returns the gradients as a numpy array.
    """
    with tf.Graph().as_default():
        n_in = np.product(input_shape)
        in_data = helpers.get_tensor_data(n_in, max_val)
        in_const = tf.constant(in_data, shape=input_shape, dtype=np.float64)
        pooled = tf.nn.pool(in_const,
                            window_shape=window_shape,
                            pooling_type=TF_OPERATOR_MAP[pool_op],
                            strides=stride_shape,
                            padding=padding,
                            name='pool',
                            data_format="NHWC")
        pool_node = tf.get_default_graph().get_operation_by_name('pool')
        grad_fn = get_gradient_function(pool_node)
        out_shape = pooled.shape
        n_out = np.product(out_shape)
        err_data = helpers.get_tensor_data(n_out, max_val)
        err_const = tf.constant(err_data, shape=out_shape, dtype=np.float64)
        backprop = grad_fn(pool_node, err_const)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.graph.finalize()
            return sess.run(backprop)
def gradients(ys, xs, grad_ys=None, stop_grads=None, group_aggregations=8,
              custom_matmul_grad=True):
    """Construct symbolic derivatives of `ys` w.r.t. `xs`.

    Custom reimplementation of tf.gradients with grouped add_n aggregation
    and an optional custom MatMul gradient.

    Args:
      ys: tensor or list of tensors to differentiate.
      xs: tensor/variable or list thereof to differentiate against.
      grad_ys: optional initial gradients for `ys` (defaults to ones).
      stop_grads: optional tensors to treat as constants.
      group_aggregations: max number of gradient terms combined per add_n (1-8).
      custom_matmul_grad: use _MatMulGradNN for non-transposed MatMul ops.

    Returns:
      A list of `sum(dy/dx)` for each x in `xs`.

    Raises:
      ValueError: on bad `group_aggregations` or mismatched gradient counts.
      LookupError: if an op on the path has no registered gradient.
    """
    if group_aggregations > 8 or group_aggregations < 1:
        raise ValueError(
            "gradients: group_aggregation sizes of 1-8 supported.")
    ys = _AsList(ys)
    xs = [x.value() if isinstance(x, tf.Variable) else x for x in _AsList(xs)]
    stop_grads = [] if stop_grads is None else _AsList(stop_grads)
    grad_ys = [None] * len(ys) if grad_ys is None else _AsList(grad_ys)
    assert len(ys) == len(grad_ys)

    with ops.name_scope("gradients"):
        for i, dy in enumerate(grad_ys):
            if dy is None:
                # float grads start at ones by default
                grad_ys[i] = tf.fill(
                    tf.shape(ys[i]),
                    tf.constant(1.0, dtype=ys[i].dtype, name=f"grad_ys_{i}"))

        ys_ops = [t.op for t in ys]
        xs_ops = [t.op for t in xs]
        pending_count, reachable_ys_ops, recompute_ops = _PendingCount(
            ys_ops, xs_ops)

        # The set of ops that terminate the gradient computation.
        # Confirm that our xs tensors are just endpoints in the graph.
        # Also set any externally provided stop grad ops.
        stop_ops = set(t.op for t in stop_grads)
        for op in xs_ops:
            is_stop_op = True
            for x in op.inputs:
                if x.op in pending_count:
                    is_stop_op = False
                    break
            if is_stop_op:
                stop_ops.add(op)

        # Each op output has an associated list of gradient inputs.
        # If more than one, these need to be accumulated.
        # Add the initial gradients for the ys.
        grads = dict()
        for y, dy in zip(ys, grad_ys):
            _SetGrad(grads, y, dy)

        # Add the unique ys ops that are ready into the queue.
        queue = collections.deque()
        for op in reachable_ys_ops:
            # an op is ready if it has no dependencies
            if op not in pending_count:
                queue.append(op)

        while queue:
            op = queue.popleft()
            # only pending_count==0 ops are in the queue so all grad input
            # lists are fully populated; apply any needed add_n ops now.
            dys = _AggregatedGrads(grads, op, group_aggregations)
            # confirm that we have at least one tensor to compute and that
            # this isn't a stop grad op
            if any(dy is not None for dy in dys) and op not in stop_ops:
                # get the grad function for this op
                try:
                    if custom_matmul_grad and op.type == "MatMul" and \
                            not op.get_attr("transpose_a") and \
                            not op.get_attr("transpose_b"):
                        grad_fn = _MatMulGradNN
                    else:
                        grad_fn = ops.get_gradient_function(op)
                except LookupError:
                    raise LookupError(
                        f"No gradient defined for operation '{op.name}' (op type: {op.type})"
                    )
                # for any missing input grads, build a zero input of the
                # right dtype/shape
                for i, dy in enumerate(dys):
                    if dy is None:
                        dys[i] = tf.zeros_like(op.outputs[i])
                # call the grad function with the forward op node and the
                # list of grad inputs
                with ops.name_scope(op.name + "_grad"):
                    dxs = _AsList(grad_fn(op, *dys))
                    if len(dxs) != len(op.inputs):
                        raise ValueError(
                            f"Num gradients {len(dxs)} generated for op {op.node_def} do not match num inputs {len(op.inputs)}"
                        )
            else:
                dxs = [None] * len(op.inputs)

            for i, (x, dx) in enumerate(zip(op.inputs, dxs)):
                if dx is not None:
                    # force unsorted_segment_sum call
                    if isinstance(dx, ops.IndexedSlices):
                        dx = tf.convert_to_tensor(dx)
                    # do some shape sanity checking
                    try:
                        dx.set_shape(x.shape)
                    except ValueError:
                        # BUG FIX: this message was missing its f-prefix, so
                        # the {…} placeholders printed literally.
                        raise ValueError(
                            f"Incompatible shapes between op input {x.shape} and calculated input gradient {dx.shape} for {op.name} (idx:{i})"
                        )
                    # update the input grad list for the consumer of this
                    # gradient
                    _SetGrad(grads, x, dx)

            # Update pending count for the inputs of op and enqueue ready ops.
            for x in op.inputs:
                # only traverse nodes that are in the reachable gradient path
                # (and hence have a pending entry)
                count = pending_count.get(x.op)
                if count is not None:
                    if count == 1:
                        # when count is 1 this should be the last time we
                        # reach this node
                        queue.append(x.op)
                    pending_count[x.op] = count - 1

        # Disconnect the recomputed portion of the graph from the forward
        # pass. This was only needed to direct the gradient flow; leaving
        # these connections in place would create a circular dependency
        # (from added control inputs).
        for op in recompute_ops:
            # Just overwrite the backward inputs with a copy of the forward
            # inputs.
            n_out = len(op.outputs)
            for i, x in enumerate(op.inputs[:n_out]):
                op._update_input(i + n_out, x)

    return [_GetGrad(grads, x) for x in xs]
def create_op(self, *args, **kwargs):
  """Creates an `Operation`.

  For operations of the following form

    orig_value = op(*args, **kwargs)

  this function constructs the following subgraph :

    v = Variable()
    if v is not initialized:
      orig_value = op(*args, **kwargs)
      v.assign(orig_value) # Initializes v
      return orig_value
    else:
      return v

  The above transformation is not performed and the original op is returned
  as is if any of the following is true:
  * `_return_as_is` flag is set to true.
  * op_type is listed in _PASS_THROUGH_OPS
  * op has no outputs.
  * One of the op's return value has a ref type.

  Args:
    *args: Arguments for create_op()
    **kwargs: Keyword arguments for create_op(). Refer to
      tensorflow.python.framework.ops.Graph.create_op() for the mandatory
      and optional arguments.

  Returns:
    An Operation.

  Raises:
    UnimplementedError: if output type is a reference and the op's type
      is not one of the supported types in `_REF_OPS_WHITELIST`.
  """
  # create_op may be called positionally or by keyword; support both.
  op_type = kwargs['op_type'] if 'op_type' in kwargs else args[0]
  output_dtypes = kwargs['dtypes'] if 'dtypes' in kwargs else args[2]
  output_dtypes = [dtypes.as_dtype(d) for d in output_dtypes]

  # Pass-through cases: explicit bypass flag or whitelisted op types.
  if self._return_as_is or op_type in _PASS_THROUGH_OPS:
    return self._wrap(super(ImperativeGraph, self).create_op(*args, **kwargs))

  # Ops with no outputs need no caching variable.
  if not output_dtypes:
    return self._wrap(
        super(ImperativeGraph, self).create_op(*args, **kwargs))

  output_has_ref = any([dtype._is_ref_dtype for dtype in output_dtypes])  # pylint: disable=protected-access

  if output_has_ref:
    # Ref-typed outputs cannot be cached; only whitelisted ref ops pass.
    if op_type not in _REF_OPS_WHITELIST:
      raise errors.UnimplementedError(None, None,
                                      op_type + ' op not supported in '
                                      'imperative graph')

    ret = super(ImperativeGraph, self).create_op(*args, **kwargs)

    if self._in_variable_creation:
      if op_type == 'Assign':
        self.add_pending_init(ret)

    return self._wrap(ret)

  with self.return_as_is():
    # Declares the variables to hold the output values of this op.
    op_output_var = [state_ops.variable_op_v2(
        tensor_shape.TensorShape(None), dtype, container=self._name)
                     for dtype in output_dtypes]
    # Ops to free the resources used by the temporary cache variables.
    # The following two ops are created for each cache variable,
    # having no control dependencies on any other ops :
    # var_handle_op ----> destroy_resource_op
    for dtype, v in zip(output_dtypes, op_output_var):
      with ops.control_dependencies(None):
        self._variable_cleanup_ops += [
            gen_resource_variable_ops.destroy_resource_op(
                gen_resource_variable_ops.var_handle_op(
                    dtype, tensor_shape.TensorShape(None),
                    container=self._name, shared_name=v.op.name),
                ignore_lookup_error=True)]

    # Create the conditional to run the original op only when the variable
    # corresponding to the first output is not initialized.
    inited = state_ops.is_variable_initialized(op_output_var[0])
    v_f, v_t = control_flow_ops.ref_switch(op_output_var[0], inited)
    # pylint: disable=protected-access
    v_f_op = gen_array_ops._ref_identity(v_f)
    v_t_op = gen_array_ops._ref_identity(v_t)
    # pylint: enable=protected-access

    with ops.control_dependencies([v_f_op.op]):
      # Create the original op
      orig_op = self._wrap(
          super(ImperativeGraph, self).create_op(*args, **kwargs))
    shapes = [val.get_shape() for val in orig_op.outputs]

    controls = []
    for var, val in zip(op_output_var, orig_op.outputs):
      if (not val.get_shape().is_fully_defined() or
          val.get_shape().num_elements() > 0):
        assign_op = state_ops.assign(var, val, validate_shape=False)
        assign_op.set_shape(val.get_shape())
        controls.append(assign_op)

    values = []
    if len(controls) > 1:
      if control_flow_ops.IsSwitch(orig_op):
        # pylint: disable=protected-access
        controls = gen_control_flow_ops._ref_merge(controls)
        # pylint: enable=protected-access
      else:
        controls = control_flow_ops.tuple(controls)

    for var, val in zip(op_output_var, orig_op.outputs):
      # On the "not initialized" branch, return the freshly computed value;
      # on the "initialized" branch, read the cached variable instead.
      with ops.control_dependencies(controls):
        with self.colocate_with(v_f_op):
          real_val = array_ops.identity(val)
      with ops.control_dependencies([v_t_op.op]):
        with self.colocate_with(v_t_op):
          stored_val = array_ops.identity(var)
          stored_val.set_shape(val.get_shape())
          real_val, _ = control_flow_ops.merge([real_val, stored_val])
      real_val.op.node_def.attr['_gradient_op_type'].CopyFrom(
          attr_value_pb2.AttrValue(s=compat.as_bytes(self._merge_op_type)))
      values.append(real_val)

    for i, _ in enumerate(shapes):
      values[i].set_shape(shapes[i])
    self._outputs_map[orig_op.name] = values
    try:
      # Remember the original op's gradient function so the merge-wrapped
      # outputs can still be differentiated.
      self._gradient_function_map[orig_op.name] = ops.get_gradient_function(
          orig_op)
    except (KeyError, LookupError):
      pass
    else:
      orig_op.node_def.attr['_gradient_op_type'].CopyFrom(
          attr_value_pb2.AttrValue(
              s=compat.as_bytes(self._imperative_op_type)))

    return MultiOutputOperation(values, orig_op)
def testRegisterGradients(self):
    """A copy op without overrides resolves to the registered _CopyGrad."""
    graph = ops.Graph()
    copied = copy_op(an_op(graph))
    self.assertEqual(_CopyGrad, ops.get_gradient_function(copied.op))
def gradients(ys,
              xs,
              grad_ys=None,
              name="gradients",
              colocate_gradients_with_ops=False,
              gate_gradients=False,
              aggregation_method=None,
              stop_gradients=None):
  """Constructs symbolic derivatives of sum of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
  is a list of `Tensor`, holding the gradients received by the
  `ys`. The list must be the same length as `ys`.

  `gradients()` adds ops to the graph to output the derivatives of `ys` with
  respect to `xs`.  It returns a list of `Tensor` of length `len(xs)` where
  each tensor is the `sum(dy/dx)` for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds
  the initial gradients for each y in `ys`.  When `grad_ys` is None,
  we fill in a tensor of '1's of the shape of y for each y in `ys`.  A
  user can provide their own initial `grad_ys` to compute the
  derivatives using a different initial gradient for each y (e.g., if
  one wanted to weight the gradient differently for each value in
  each y).

  `stop_gradients` is a `Tensor` or a list of tensors to be considered
  constant with respect to all `xs`. These tensors will not be
  backpropagated through, as though they had been explicitly
  disconnected using `stop_gradient`.  Among other things, this allows
  computation of partial derivatives as opposed to total derivatives.
  For example:

  ```python
  a = tf.constant(0.)
  b = 2 * a
  g = tf.gradients(a + b, [a, b], stop_gradients=[a, b])
  ```

  Here the partial derivatives `g` evaluate to `[1.0, 1.0]`, compared
  to the total derivatives `tf.gradients(a + b, [a, b])`, which take
  into account the influence of `a` on `b` and evaluate to
  `[3.0, 1.0]`.  Note that the above is equivalent to:

  ```python
  a = tf.stop_gradient(tf.constant(0.))
  b = tf.stop_gradient(2 * a)
  g = tf.gradients(a + b, [a, b])
  ```

  `stop_gradients` provides a way of stopping gradient after the graph
  has already been constructed, as compared to `tf.stop_gradient`
  which is used during graph construction.  When the two approaches
  are combined, backpropagation stops at both `tf.stop_gradient` nodes
  and nodes in `stop_gradients`, whichever is encountered first.

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as
      `ys` and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned
      for an operations.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.
    stop_gradients: Optional. A `Tensor` or list of tensors not to
      differentiate through.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not
      have a registered gradient function.
    ValueError: if the arguments are invalid.
    RuntimeError: if called in Eager mode.
  """
  if context.in_eager_mode():
    raise RuntimeError("tf.gradients not supported in EAGER mode. Use "
                       "functions in tf.contrib.eager.backprop instead.")
  ys = _AsList(ys)
  xs = _AsList(xs)
  stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)

  with ops.name_scope(
      name, "gradients",
      list(ys) + list(xs) + list(stop_gradients) +
      list(grad_ys)) as grad_scope:
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    # Resource variables are differentiated through their handles.
    xs = [
        x.handle if isinstance(x, resource_variable_ops.ResourceVariable)
        else x for x in xs
    ]
    xs = ops.internal_convert_n_to_tensor_or_indexed_slices(xs, name="x",
                                                            as_ref=True)
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

    # The approach we take here is as follows: Create a list of all ops in
    # the subgraph between the ys and xs.  Visit these ops in reverse order
    # of ids to ensure that when we visit an op the gradients w.r.t its
    # outputs have been collected.  Then aggregate these gradients if
    # needed, call the op's gradient function, and add the generated
    # gradients to the gradients for its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    if len(ys) > 1:
      ys = [array_ops.identity(y) if y.consumers() else y for y in ys]
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    stop_gradient_ops = [t.op for t in stop_gradients]
    pending_count, loop_state = _PendingCount(ops.get_default_graph(),
                                              to_ops, from_ops,
                                              colocate_gradients_with_ops)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of
    # the op.  The gradients for each endpoint are initially collected as a
    # list.  When it is time to call the op's gradient function, for each
    # endpoint we aggregate the list of received gradients into an Add()
    # Operation if there is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      # 'ready' handles the case where one output gradient relies on
      # another output's gradient.
      # pylint: disable=protected-access
      ready = (pending_count[op._id] == 0)
      if ready and op._id not in to_ops_set:
        to_ops_set.add(op._id)
        queue.append(op)
      # pylint: enable=protected-access

    if loop_state:
      loop_exits = loop_state.ProcessUnusedLoopExits(pending_count,
                                                     to_ops_set)
      for y in loop_exits:
        if _IsTrainable(y):
          _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
          queue.append(y.op)

    stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count)
    while queue:
      # generate gradient subgraph for op.
      op = queue.popleft()
      with _maybe_colocate_with(op, colocate_gradients_with_ops):
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=True)
        out_grads = _AggregatedGrads(grads, op, loop_state,
                                     aggregation_method)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=True)

        grad_fn = None
        # pylint: disable=protected-access
        func_call = None
        is_func_call = ops.get_default_graph()._is_function(op.type)
        has_out_grads = any(
            isinstance(g, ops.Tensor) or g for g in out_grads)
        if has_out_grads and (op._id not in stop_ops):
          if is_func_call:
            func_call = ops.get_default_graph()._get_function(op.type)
            grad_fn = func_call.python_grad_func
            # pylint: enable=protected-access
          else:
            # A grad_fn must be defined, either as a function or as None
            # for ops that do not have gradients.
            try:
              grad_fn = ops.get_gradient_function(op)
            except LookupError:
              raise LookupError(
                  "No gradient defined for operation '%s' (op type: %s)" %
                  (op.name, op.type))
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=False)
        if (grad_fn or is_func_call) and has_out_grads:
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if (not isinstance(out_grad, ops.Tensor) and not out_grad) and (
                (not grad_fn and is_func_call) or
                _IsTrainable(op.outputs[i])):
              # Only trainable outputs or outputs for a function call that
              # will use SymbolicGradient get a zero gradient. Gradient
              # functions should ignore the gradient for other outputs.
              # TODO(apassos): gradients of resource handles might be an
              # issue here because of zeros.
              if loop_state:
                out_grads[i] = loop_state.ZerosLike(op, i)
              else:
                out_grads[
                    i] = control_flow_ops.ZerosLikeOutsideLoop(op, i)
          with ops.name_scope(op.name + "_grad"):
            # pylint: disable=protected-access
            with ops.get_default_graph()._original_op(op):
              # pylint: enable=protected-access
              if grad_fn:
                # If grad_fn was found, do not use SymbolicGradient even
                # for functions.
                in_grads = _MaybeCompile(
                    grad_scope, op, func_call,
                    lambda: grad_fn(op, *out_grads))
              else:
                # For function call ops, we add a 'SymbolicGradient'
                # node to the graph to compute gradients.
                in_grads = _MaybeCompile(
                    grad_scope, op, func_call,
                    lambda: _SymGrad(op, out_grads))
              in_grads = _AsList(in_grads)
              _VerifyGeneratedGradients(in_grads, op)
              if gate_gradients and len(
                  [x for x in in_grads if x is not None]) > 1:
                with ops.device(None):
                  with ops.colocate_with(
                      None, ignore_existing=True):
                    in_grads = control_flow_ops.tuple(
                        in_grads)
          _LogOpGradients(op, out_grads, in_grads)
        else:
          # If no grad_fn is defined or none of out_grads is available,
          # just propagate a list of None backwards.
          in_grads = [None] * len(op.inputs)
        for i, (t_in, in_grad) in enumerate(zip(op.inputs, in_grads)):
          if in_grad is not None:
            if (isinstance(in_grad, ops.Tensor) and
                t_in.dtype != dtypes.resource):
              try:
                in_grad.set_shape(t_in.get_shape())
              except ValueError:
                raise ValueError(
                    "Incompatible shapes between op input and calculated "
                    "input gradient. Forward operation: %s. Input index: %d. "
                    "Original input shape: %s. "
                    "Calculated input gradient shape: %s" %
                    (op.name, i, t_in.shape, in_grad.shape))
            _SetGrad(grads, t_in, in_grad)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=False)

      # Update pending count for the inputs of op and enqueue ready ops.
      _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count,
                                    loop_state)

  if loop_state:
    loop_state.PostProcessing()
  return [_GetGrad(grads, x) for x in xs]
import tensorflow as tf
from tensorflow.python.framework.ops import get_gradient_function
import numpy as np

# Probe which Python gradient function (if any) is registered for various ops.
a = tf.add(1, 2, name="Add_these_numbers")
b = tf.multiply(a, 3, name='mult')
mult = tf.get_default_graph().get_operation_by_name('mult')
print(get_gradient_function(mult))  # e.g. <function _MulGrad at 0x...>

# stop_gradient deliberately has no gradient function registered.
tf.stop_gradient(a, name='stop')
stop = tf.get_default_graph().get_operation_by_name('stop')
print(get_gradient_function(stop))  # None

c = tf.squeeze(a, name="c")
mult = tf.get_default_graph().get_operation_by_name('c')
# NOTE(review): prints the gradient registered for Squeeze — the original
# "_MulGrad" comment here was copy-pasted and wrong.
print(get_gradient_function(mult))

indices = np.asarray([[0, 0], [1, 1]])
params = np.asarray([['a', 'b'], ['c', 'd']])
ga = tf.gather_nd(params, indices, name="ga")
mult = tf.get_default_graph().get_operation_by_name('ga')
# NOTE(review): prints the gradient registered for GatherNd, not _MulGrad.
print(get_gradient_function(mult))

am = tf.argmax(a, name="am")
mult = tf.get_default_graph().get_operation_by_name('am')
# NOTE(review): prints whatever is registered for ArgMax, not _MulGrad —
# confirm against the gradient registry.
print(get_gradient_function(mult))

indice = np.asarray([[0.1, 0.3], [1.4, 1.32]])
sm = tf.nn.softmax(indice, name="sm")
def gradients(ys,
              xs,
              grad_ys=None,
              name="gradients",
              colocate_gradients_with_ops=False,
              gate_gradients=False,
              aggregation_method=None,
              stop_gradients=None):
  """Constructs symbolic derivatives of sum of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
  is a list of `Tensor`, holding the gradients received by the
  `ys`. The list must be the same length as `ys`.

  `gradients()` adds ops to the graph to output the derivatives of `ys` with
  respect to `xs`.  It returns a list of `Tensor` of length `len(xs)` where
  each tensor is the `sum(dy/dx)` for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds
  the initial gradients for each y in `ys`.  When `grad_ys` is None,
  we fill in a tensor of '1's of the shape of y for each y in `ys`.  A
  user can provide their own initial `grad_ys` to compute the
  derivatives using a different initial gradient for each y (e.g., if
  one wanted to weight the gradient differently for each value in
  each y).

  `stop_gradients` is a `Tensor` or a list of tensors to be considered constant
  with respect to all `xs`. These tensors will not be backpropagated through,
  as though they had been explicitly disconnected using `stop_gradient`.  Among
  other things, this allows computation of partial derivatives as opposed to
  total derivatives. For example:

  ```python
  a = tf.constant(0.)
  b = 2 * a
  g = tf.gradients(a + b, [a, b], stop_gradients=[a, b])
  ```

  Here the partial derivatives `g` evaluate to `[1.0, 1.0]`, compared to the
  total derivatives `tf.gradients(a + b, [a, b])`, which take into account the
  influence of `a` on `b` and evaluate to `[3.0, 1.0]`.  Note that the above is
  equivalent to:

  ```python
  a = tf.stop_gradient(tf.constant(0.))
  b = tf.stop_gradient(2 * a)
  g = tf.gradients(a + b, [a, b])
  ```

  `stop_gradients` provides a way of stopping gradient after the graph has
  already been constructed, as compared to `tf.stop_gradient` which is used
  during graph construction.  When the two approaches are combined,
  backpropagation stops at both `tf.stop_gradient` nodes and nodes in
  `stop_gradients`, whichever is encountered first.

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as
      `ys` and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned
      for an operations.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.
    stop_gradients: Optional. A `Tensor` or list of tensors not to differentiate
      through.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not
      have a registered gradient function.
    ValueError: if the arguments are invalid.
    RuntimeError: if called in Eager mode.
  """
  if context.in_eager_mode():
    raise RuntimeError("tf.gradients not supported in EAGER mode. Use "
                       "functions in tf.contrib.eager.backprop instead.")
  # Normalize all tensor arguments to lists so the rest of the code can
  # treat single tensors and lists uniformly.
  ys = _AsList(ys)
  xs = _AsList(xs)
  stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)

  with ops.name_scope(
      name, "gradients",
      list(ys) + list(xs) + list(stop_gradients) +
      list(grad_ys)) as grad_scope:
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    # Resource variables are differentiated through their underlying handle
    # tensor rather than the variable object itself.
    xs = [
        x.handle if isinstance(x, resource_variable_ops.ResourceVariable)
        else x for x in xs
    ]
    xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
        xs, name="x", as_ref=True)
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t its outputs have
    # been collected.  Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    if len(ys) > 1:
      # Identity wrappers give each consumed y a distinct op so gradient
      # accumulation per-y stays separable.
      ys = [array_ops.identity(y) if y.consumers() else y for y in ys]
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    stop_gradient_ops = [t.op for t in stop_gradients]
    pending_count, loop_state = _PendingCount(
        ops.get_default_graph(), to_ops, from_ops,
        colocate_gradients_with_ops)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op.  The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into a Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      # 'ready' handles the case where one output gradient relies on
      # another output's gradient.
      # pylint: disable=protected-access
      ready = (pending_count[op._id] == 0)
      if ready and op._id not in to_ops_set:
        to_ops_set.add(op._id)
        queue.append(op)
      # pylint: enable=protected-access

    if loop_state:
      # Unused while-loop exits still need zero gradients so backprop
      # through the loop stays well-defined.
      loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set)
      for y in loop_exits:
        if _IsTrainable(y):
          _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
          queue.append(y.op)

    stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count)
    while queue:
      # generate gradient subgraph for op.
      op = queue.popleft()
      with _maybe_colocate_with(op, colocate_gradients_with_ops):
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=True)
        out_grads = _AggregatedGrads(grads, op, loop_state,
                                     aggregation_method)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=True)

        grad_fn = None
        # pylint: disable=protected-access
        func_call = None
        is_func_call = ops.get_default_graph()._is_function(op.type)
        has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads)
        if has_out_grads and (op._id not in stop_ops):
          if is_func_call:
            func_call = ops.get_default_graph()._get_function(op.type)
            grad_fn = func_call.python_grad_func
            # pylint: enable=protected-access
          else:
            # A grad_fn must be defined, either as a function or as None
            # for ops that do not have gradients.
            try:
              grad_fn = ops.get_gradient_function(op)
            except LookupError:
              raise LookupError(
                  "No gradient defined for operation '%s' (op type: %s)" %
                  (op.name, op.type))
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=False)
        if (grad_fn or is_func_call) and has_out_grads:
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if (not isinstance(out_grad, ops.Tensor) and not out_grad) and (
                (not grad_fn and is_func_call) or
                _IsTrainable(op.outputs[i])):
              # Only trainable outputs or outputs for a function call that
              # will use SymbolicGradient get a zero gradient. Gradient
              # functions should ignore the gradient for other outputs.
              # TODO (apassos) gradients of resource handles might be an id:3152 gh:3153
              # issue here because of zeros.
              if loop_state:
                out_grads[i] = loop_state.ZerosLike(op, i)
              else:
                out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i)
          with ops.name_scope(op.name + "_grad"):
            # pylint: disable=protected-access
            with ops.get_default_graph()._original_op(op):
              # pylint: enable=protected-access
              if grad_fn:
                # If grad_fn was found, do not use SymbolicGradient even for
                # functions.
                in_grads = _MaybeCompile(grad_scope, op, func_call,
                                         lambda: grad_fn(op, *out_grads))
              else:
                # For function call ops, we add a 'SymbolicGradient'
                # node to the graph to compute gradients.
                in_grads = _MaybeCompile(grad_scope, op, func_call,
                                         lambda: _SymGrad(op, out_grads))
              in_grads = _AsList(in_grads)
              _VerifyGeneratedGradients(in_grads, op)
              if gate_gradients and len(
                  [x for x in in_grads if x is not None]) > 1:
                # Gate the gradients behind a tuple to avoid races between
                # concurrent reads/writes of the same variable.
                with ops.device(None):
                  with ops.colocate_with(None, ignore_existing=True):
                    in_grads = control_flow_ops.tuple(in_grads)
          _LogOpGradients(op, out_grads, in_grads)
        else:
          # If no grad_fn is defined or none of out_grads is available,
          # just propagate a list of None backwards.
          in_grads = [None] * len(op.inputs)
        for i, (t_in, in_grad) in enumerate(zip(op.inputs, in_grads)):
          if in_grad is not None:
            if (isinstance(in_grad, ops.Tensor) and
                t_in.dtype != dtypes.resource):
              try:
                in_grad.set_shape(t_in.get_shape())
              except ValueError:
                raise ValueError(
                    "Incompatible shapes between op input and calculated "
                    "input gradient. Forward operation: %s. Input index: %d. "
                    "Original input shape: %s. "
                    "Calculated input gradient shape: %s" %
                    (op.name, i, t_in.shape, in_grad.shape))
            _SetGrad(grads, t_in, in_grad)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=False)

      # Update pending count for the inputs of op and enqueue ready ops.
      _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count,
                                    loop_state)

  if loop_state:
    loop_state.PostProcessing()
  return [_GetGrad(grads, x) for x in xs]
def _GradientsHelper(ys,
                     xs,
                     grad_ys=None,
                     name="gradients",
                     colocate_gradients_with_ops=False,
                     gate_gradients=False,
                     aggregation_method=None,
                     stop_gradients=None,
                     unconnected_gradients=UnconnectedGradients.NONE,
                     src_graph=None):
  """Implementation of gradients().

  Builds the backward graph computing d(sum(ys))/dx for each x in `xs` by
  traversing the forward graph from `ys` to `xs` in reverse topological
  order, invoking each op's registered gradient function (or a
  SymbolicGradient node for function-call ops).

  Args mirror the public `gradients()` API, plus:
    unconnected_gradients: An `UnconnectedGradients` enum (or its string
      value) controlling what is returned for xs not reachable from ys.
    src_graph: The graph to differentiate; defaults to the default graph.

  Returns:
    A list with one entry per x in `xs` (gradient tensor, or the
    `unconnected_gradients` placeholder).

  Raises:
    RuntimeError: if eager execution is enabled.
    ValueError: for an unknown `unconnected_gradients` value.
    LookupError: if an op between x and y has no registered gradient.
  """
  if context.executing_eagerly():
    raise RuntimeError("tf.gradients is not supported when eager execution "
                       "is enabled. Use tf.GradientTape instead.")
  if src_graph is None:
    src_graph = ops.get_default_graph()
  try:
    unconnected_gradients = UnconnectedGradients(unconnected_gradients)
  except ValueError:
    raise ValueError(
        "Unknown value for unconnected_gradients: %r" % unconnected_gradients)

  # If src_graph is a _FuncGraph (i.e. a function body), gather it and all
  # ancestor graphs. This is necessary for correctly handling captured values.
  func_graphs = []
  curr_graph = src_graph
  while _IsFunction(curr_graph):
    func_graphs.append(curr_graph)
    if isinstance(curr_graph, FuncGraph):
      curr_graph = curr_graph.outer_graph
    else:
      assert isinstance(curr_graph, framework_function._FuncGraph)  # pylint: disable=protected-access
      curr_graph = curr_graph._outer_graph  # pylint: disable=protected-access

  # Normalize all tensor arguments to lists.
  ys = _AsList(ys)
  xs = _AsList(xs)
  stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)

  with ops.name_scope(
      name, "gradients",
      list(ys) + list(xs) + list(stop_gradients) +
      list(grad_ys)) as grad_scope:
    # Get a uid for this call to gradients that can be used to help
    # cluster ops for compilation.
    gradient_uid = ops.get_default_graph().unique_name("uid")
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    # Resource variables are differentiated through their handle tensor.
    xs = [
        x.handle if resource_variable_ops.is_resource_variable(x) else x
        for x in xs
    ]
    xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
        xs, name="x", as_ref=True)
    # Identity-based set: tensors may override __eq__, so membership must be
    # by object identity.
    xs_set = object_identity.ObjectIdentitySet(xs)
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops,
                             gradient_uid)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t its outputs have
    # been collected.  Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    stop_gradient_ops = [t.op for t in stop_gradients]
    reachable_to_ops, pending_count, loop_state = _PendingCount(
        to_ops, from_ops, colocate_gradients_with_ops, func_graphs, xs_set)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op.  The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into a Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      # 'ready' handles the case where one output gradient relies on
      # another output's gradient.
      ready = (pending_count[op] == 0)
      if ready and op not in to_ops_set and op in reachable_to_ops:
        to_ops_set.add(op)
        queue.append(op)

    if loop_state:
      # Seed zero gradients for unused while-loop exits so backprop through
      # the loop stays well-defined.
      loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set)
      for y in loop_exits:
        if backprop_util.IsTrainable(y):
          _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
          queue.append(y.op)

    stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count, xs_set)
    while queue:
      # generate gradient subgraph for op.
      op = queue.popleft()
      with _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=True)
        out_grads = _AggregatedGrads(grads, op, gradient_uid, loop_state,
                                     aggregation_method)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=True)

        grad_fn = None
        func_call = None
        is_partitioned_call = _IsPartitionedCall(op)
        # pylint: disable=protected-access
        is_func_call = (
            src_graph._is_function(op.type) or is_partitioned_call)
        # pylint: enable=protected-access
        has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads)
        if has_out_grads and (op not in stop_ops):
          try:
            grad_fn = ops.get_gradient_function(op)
          except LookupError:
            if is_func_call:
              # No registered gradient: fall back to the function's own
              # python_grad_func (SymbolicGradient path).
              if is_partitioned_call:
                func_call = src_graph._get_function(  # pylint: disable=protected-access
                    compat.as_bytes(op.get_attr("f").name))
              else:
                func_call = src_graph._get_function(op.type)  # pylint: disable=protected-access
              # Note that __defun is not set if the graph is
              # imported. If it's set, we prefer to access the original
              # defun.
              func_call = getattr(op, "__defun", func_call)
              grad_fn = func_call.python_grad_func
            else:
              raise LookupError(
                  "No gradient defined for operation '%s' (op type: %s)" %
                  (op.name, op.type))
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=False)

        # NOTE(skyewm): We don't support computing gradients wrt a loop variable
        # unless it's within the context of a single iteration (i.e. the
        # gradient is wrt to the loop parameter in the body function, not wrt or
        # through the initial value). This means if we're in a while loop
        # context, we should never see a switch node from this context.
        # pylint: disable=protected-access
        if (control_flow_util.IsSwitch(op) and
            op._control_flow_context is not None and
            op._control_flow_context.IsWhileContext() and
            op._control_flow_context ==
            ops.get_default_graph()._get_control_flow_context()):
          _RaiseNoGradWrtInitialLoopValError(op, from_ops, xs_set)
        # pylint: enable=protected-access

        if (grad_fn or is_func_call) and has_out_grads:
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if (not isinstance(out_grad, ops.Tensor) and not out_grad) and (
                (not grad_fn and is_func_call) or
                backprop_util.IsTrainable(op.outputs[i])):
              # Only trainable outputs or outputs for a function call that
              # will use SymbolicGradient get a zero gradient. Gradient
              # functions should ignore the gradient for other outputs.
              # TODO(apassos) gradients of resource handles might be an
              # issue here because of zeros.
              if loop_state:
                out_grads[i] = loop_state.ZerosLike(op, i)
              elif default_gradient.supports_default_grad(op.outputs[i]):
                # TODO(b/143286622): The supports_default_grad check is needed
                # because While op emits non-differentiable resource tensors
                # as outputs. Remove this check when that is not the case.
                out_grads[i] = control_flow_state.ZerosLikeOutsideLoop(op, i)
          with ops.name_scope(op.name + "_grad"):
            # pylint: disable=protected-access
            with src_graph._original_op(op):
              # pylint: enable=protected-access
              if grad_fn:
                # If grad_fn was found, do not use SymbolicGradient even for
                # functions.
                in_grads = _MaybeCompile(grad_scope, op, func_call,
                                         lambda: grad_fn(op, *out_grads))
              else:
                # For function call ops, we add a 'SymbolicGradient'
                # node to the graph to compute gradients.
                in_grads = _MaybeCompile(grad_scope, op, func_call,
                                         lambda: _SymGrad(op, out_grads))
              in_grads = _AsList(in_grads)
              _VerifyGeneratedGradients(in_grads, op)
              if gate_gradients and len(
                  [x for x in in_grads if x is not None]) > 1:
                # Gate the gradients behind a tuple to avoid races.
                with ops.device(None):
                  with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
                      None,
                      gradient_uid,
                      ignore_existing=True):
                    in_grads = control_flow_ops.tuple(in_grads)
          _LogOpGradients(op, out_grads, in_grads)
        else:
          # If no grad_fn is defined or none of out_grads is available,
          # just propagate a list of None backwards.
          in_grads = [None] * len(_Inputs(op, xs_set))
        # Note: we don't filter out eager inputs here because the inputs need to
        # line up with in_grads.
        for i, (t_in, in_grad) in enumerate(
            zip(_Inputs(op, xs_set), in_grads)):
          if in_grad is not None:
            if (isinstance(in_grad, ops.Tensor) and
                t_in.dtype != dtypes.resource):
              try:
                in_grad.set_shape(t_in.get_shape())
              except ValueError:
                raise ValueError(
                    "Incompatible shapes between op input and calculated "
                    "input gradient. Forward operation: %s. Input index: %d. "
                    "Original input shape: %s. "
                    "Calculated input gradient shape: %s" %
                    (op.name, i, t_in.shape, in_grad.shape))
            if not isinstance(t_in, ops.EagerTensor):
              _SetGrad(grads, t_in, in_grad)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=False)

      # Update pending count for the inputs of op and enqueue ready ops.
      _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count,
                                    loop_state, xs_set)

  if loop_state:
    loop_state.PostProcessing()
  return [_GetGrad(grads, x, unconnected_gradients) for x in xs]
def gradients(ys,
              xs,
              grad_ys=None,
              name="gradients",
              colocate_gradients_with_ops=False,
              gate_gradients=False,
              aggregation_method=None):
  """Constructs symbolic partial derivatives of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
  is a list of `Tensor`, holding the gradients received by the
  `ys`. The list must be the same length as `ys`.

  `gradients()` adds ops to the graph to output the partial
  derivatives of `ys` with respect to `xs`.  It returns a list of
  `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)`
  for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds
  the initial gradients for each y in `ys`.  When `grad_ys` is None,
  we fill in a tensor of '1's of the shape of y for each y in `ys`.  A
  user can provide their own initial `grad_ys` to compute the
  derivatives using a different initial gradient for each y (e.g., if
  one wanted to weight the gradient differently for each value in
  each y).

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as
      `ys` and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned
      for an operations.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not
      have a registered gradient function.
    ValueError: if the arguments are invalid.
  """
  # Normalize tensor arguments to lists.
  ys = _AsList(ys)
  xs = _AsList(xs)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)

  with ops.op_scope(ys + xs + grad_ys, name, "gradients"):
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x")
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t its outputs have
    # been collected.  Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    pending_count, loop_state = _PendingCount(ops.get_default_graph(), to_ops,
                                             from_ops)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op.  The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into a Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      # 'ready' handles the case where one output gradient relies on
      # another output's gradient.
      # pylint: disable=protected-access
      ready = (pending_count[op._id] == 0)
      if ready and op._id not in to_ops_set:
        to_ops_set.add(op._id)
        queue.append(op)
    if loop_state:
      # The "unused" exits of the loops are added to ys. As an example,
      # people often write:
      #             v1, _ = While(p, b, [x1, x2])
      #             result = gradients(v1, x1)
      # The exit node of x2 is not included by the betweenness analysis.
      # But we need it if x2 is involved in computing v1. So we add it
      # back in backprop with a zeros_like gradient.
      loop_exits = loop_state.GetAllLoopExits()
      for y in loop_exits:
        if pending_count[y.op._id] == 0 and y.op._id not in to_ops_set:
          if _IsFloat(y):
            # Floating-point outputs get a zero gradient.
            _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
          queue.append(y.op)

    # The set of 'from_ops'.
    stop_ops = _StopOps(from_ops, pending_count)
    while queue:
      # generate gradient subgraph for op.
      op = queue.popleft()
      with ops.device(_GetGradsDevice(op, colocate_gradients_with_ops)):
        if loop_state:
          loop_state.EnterGradWhileContext(op)
        out_grads = _AggregatedGrads(grads, op, loop_state,
                                     aggregation_method)

        grad_fn = None
        is_func_call = ops.get_default_graph()._is_function(op.type)
        if not is_func_call and any(out_grads) and op._id not in stop_ops:
          # A grad_fn must be defined, either as a function or as None
          # for ops that do not have gradients.
          try:
            grad_fn = ops.get_gradient_function(op)
          except LookupError:
            raise LookupError(
                "No gradient defined for operation '%s' (op type: %s)" %
                (op.name, op.type))

        if (grad_fn or is_func_call) and any(out_grads):
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if not out_grad and _IsFloat(op.outputs[i]):
              # Only floating-point outputs get a zero gradient. Gradient
              # functions should ignore the gradient for other outputs.
              if loop_state:
                out_grads[i] = loop_state.ZerosLike(op, i)
              else:
                out_grads[i] = array_ops.zeros_like(op.outputs[i])
          with ops.name_scope(op.name + "_grad"):
            # pylint: disable=protected-access
            with ops.get_default_graph()._original_op(op):
              # pylint: enable=protected-access
              wrapped_op = op
              if loop_state:
                wrapped_op = loop_state.MakeWrapper(op)
              if is_func_call:
                # For function call ops, we add a 'SymbolicGradient'
                # node to the graph to compute gradients.
                f_in = [x for x in op.inputs] + out_grads
                f_types = [x.dtype for x in op.inputs]
                # pylint: disable=protected-access
                in_grads = _AsList(
                    functional_ops._symbolic_gradient(f_in, f_types, op.type))
                # pylint: enable=protected-access
              else:
                in_grads = _AsList(grad_fn(wrapped_op, *out_grads))
              _VerifyGeneratedGradients(in_grads, op)
              if gate_gradients and len(tuple(filter(None, in_grads))) > 1:
                in_grads = control_flow_ops.tuple(in_grads)
              logging.vlog(1, "Gradient for '" + op.name + "'")
              logging.vlog(1, "  in  --> %s",
                           ", ".join([x.name for x in out_grads if x]))
              logging.vlog(1, "  out --> %s",
                           ", ".join([x.name for x in in_grads if x]))
        else:
          # If no grad_fn is defined or none of out_grads is available,
          # just propagates a list of None backwards.
          in_grads = [None] * len(op.inputs)
        for t_in, in_grad in zip(op.inputs, in_grads):
          if in_grad:
            _SetGrad(grads, t_in, in_grad)
        if loop_state:
          loop_state.ExitGradWhileContext(op)

      # update pending count for the inputs of op.
      # pylint: disable=protected-access
      for x in op.inputs:
        pending_count[x.op._id] -= 1
        ready = (pending_count[x.op._id] == 0)
        if loop_state and not ready:
          ready = (pending_count[x.op._id] > 0 and
                   control_flow_ops.IsLoopSwitch(x.op))
        if ready:
          queue.append(x.op)
      for x in op.control_inputs:
        pending_count[x._id] -= 1
        # BUGFIX: was `pending_count[x._id] is 0`, which relies on CPython's
        # small-int caching; use value equality for the zero check.
        if pending_count[x._id] == 0:
          queue.append(x)
      # pylint: enable=protected-access
  return [_GetGrad(grads, x) for x in xs]
def gradients(ys,
              xs,
              grad_ys=None,
              name="gradients",
              colocate_gradients_with_ops=False,
              gate_gradients=False,
              aggregation_method=None):
  """Constructs symbolic partial derivatives of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
  is a list of `Tensor`, holding the gradients received by the
  `ys`. The list must be the same length as `ys`.

  `gradients()` adds ops to the graph to output the partial
  derivatives of `ys` with respect to `xs`.  It returns a list of
  `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)`
  for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds
  the initial gradients for each y in `ys`.  When `grad_ys` is None,
  we fill in a tensor of '1's of the shape of y for each y in `ys`.  A
  user can provide their own initial `grad_ys` to compute the
  derivatives using a different initial gradient for each y (e.g., if
  one wanted to weight the gradient differently for each value in
  each y).

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as
      `ys` and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned
      for an operations.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not
      have a registered gradient function.
    ValueError: if the arguments are invalid.
  """
  # Normalize tensor arguments to lists.
  ys = _AsList(ys)
  xs = _AsList(xs)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)

  with ops.op_scope(ys + xs + grad_ys, name, "gradients"):
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x")
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t its outputs have
    # been collected.  Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    pending_count, has_control_flow = _PendingCount(ops.get_default_graph(),
                                                   to_ops, from_ops)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op.  The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into a Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      # 'ready' handles the case where one output gradient relies on
      # another output's gradient.
      ready = (pending_count[op._id] == 0)
      if ready and op._id not in to_ops_set:  # pylint: disable=protected-access
        to_ops_set.add(op._id)
        queue.append(op)
    # The set of 'from_ops'.
    stop_ops = _StopOps(from_ops, pending_count)
    while queue:
      # generate gradient subgraph for op.
      op = queue.popleft()
      with ops.device(_GetGradsDevice(op, colocate_gradients_with_ops)):
        if has_control_flow:
          control_flow_ops.EnterGradWhileContext(op)
        out_grads = _AggregatedGrads(grads, op, has_control_flow,
                                     aggregation_method)

        grad_fn = None
        if any(out_grads) and op._id not in stop_ops:
          # A grad_fn must be defined, either as a function or as None
          # for ops that do not have gradients.
          try:
            grad_fn = ops.get_gradient_function(op)
          except LookupError:
            raise LookupError(
                "No gradient defined for operation '%s' (op type: %s)" %
                (op.name, op.type))

        if grad_fn and any(out_grads):
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if (not out_grad and
                dtypes.as_dtype(op.outputs[i].dtype).base_dtype in
                (dtypes.float32, dtypes.float64)):
              # Only floating-point outputs get a zero gradient. Gradient
              # functions should ignore the gradient for other outputs.
              out_grads[i] = array_ops.zeros_like(op.outputs[i])
          with ops.name_scope(op.name + "_grad"):
            # pylint: disable=protected-access
            with ops.get_default_graph()._original_op(op):
              # pylint: enable=protected-access
              op_wrapper = op
              if has_control_flow:
                op_wrapper = control_flow_ops.MakeWrapper(op)
              in_grads = _AsList(grad_fn(op_wrapper, *out_grads))
              _VerifyGeneratedGradients(in_grads, op)
              if gate_gradients and len(in_grads) > 1:
                in_grads = control_flow_ops.tuple(in_grads)
              logging.vlog(1, "Gradient for '" + op.name + "'")
              logging.vlog(1, "  in  --> %s",
                           ", ".join([x.name for x in out_grads if x]))
              logging.vlog(1, "  out --> %s",
                           ", ".join([x.name for x in in_grads if x]))
        else:
          # If no grad_fn is defined or none of out_grads is available,
          # just propagates a list of None backwards.
          in_grads = [None] * len(op.inputs)
        for t_in, in_grad in zip(op.inputs, in_grads):
          if in_grad:
            _SetGrad(grads, t_in, in_grad)
        if has_control_flow:
          control_flow_ops.ExitGradWhileContext(op)

      # update pending count for the inputs of op.
      for x in op.inputs:
        pending_count[x.op._id] -= 1
        ready = (pending_count[x.op._id] == 0)
        if has_control_flow and not ready:
          ready = (pending_count[x.op._id] > 0 and
                   control_flow_ops.IsLoopSwitch(x.op))
        if ready:
          queue.append(x.op)
      for x in op.control_inputs:
        pending_count[x._id] -= 1
        # BUGFIX: was `pending_count[x._id] is 0`, an identity comparison
        # against an int literal that only works via CPython's small-int
        # caching; use value equality.
        if pending_count[x._id] == 0:
          queue.append(x)
  return [_GetGrad(grads, x) for x in xs]
def create_op(self, *args, **kwargs):
    """Creates an `Operation`.

    For operations of the following form

      orig_value = op(*args, **kwargs)

    this function constructs the following subgraph :

      v = Variable()
      if v is not initialized:
        orig_value = op(*args, **kwargs)
        v.assign(orig_value) # Initializes v
        return orig_value
      else:
        return v

    The above transformation is not performed and the original op is returned
    as is if any of the following is true:
    * `_return_as_is` flag is set to true.
    * op_type is listed in _PASS_THROUGH_OPS
    * op has no outputs.
    * One of the op's return value has a ref type.

    Args:
      *args: Arguments for create_op()
      **kwargs: Keyword arguments for create_op(). Refer to
        tensorflow.python.framework.ops.Graph.create_op() for the mandatory
        and optional arguments.

    Returns:
      An Operation.

    Raises:
      UnimplementedError: if output type is a reference and the op's type
        is not one of the supported types in `_REF_OPS_WHITELIST`.
    """
    # create_op may be called positionally or with keywords; op_type is the
    # first positional argument and the output dtypes the third.
    op_type = kwargs['op_type'] if 'op_type' in kwargs else args[0]
    output_dtypes = kwargs['dtypes'] if 'dtypes' in kwargs else args[2]
    output_dtypes = [dtypes.as_dtype(d) for d in output_dtypes]

    # Pass-through cases: no caching-variable machinery is built.
    if self._return_as_is or op_type in _PASS_THROUGH_OPS:
      return self._wrap(
          super(ImperativeGraph, self).create_op(*args, **kwargs))

    if not output_dtypes:
      return self._wrap(
          super(ImperativeGraph, self).create_op(*args, **kwargs))

    output_has_ref = any([dtype._is_ref_dtype for dtype in output_dtypes])  # pylint: disable=protected-access

    if output_has_ref:
      # Ref-typed outputs cannot be cached in variables; only a small
      # whitelist of ref-producing ops is supported as-is.
      if op_type not in _REF_OPS_WHITELIST:
        raise errors.UnimplementedError(
            None, None, op_type + ' op not supported in '
            'imperative graph')

      ret = super(ImperativeGraph, self).create_op(*args, **kwargs)

      if self._in_variable_creation:
        if op_type == 'Assign':
          self.add_pending_init(ret)

      return self._wrap(ret)

    with self.return_as_is():
      # Declares the variables to hold the output values of this op.
      op_output_var = [
          state_ops.variable_op_v2(
              tensor_shape.TensorShape(None), dtype, container=self._name)
          for dtype in output_dtypes
      ]
      # Ops to free the resources used by the temporary cache variables.
      # The following two ops are created for each cache variable,
      # having no control dependencies on any other ops :
      # var_handle_op ----> destroy_resource_op
      for dtype, v in zip(output_dtypes, op_output_var):
        # control_dependencies(None) clears all deps so cleanup ops are
        # free-standing and can run at any time.
        with ops.control_dependencies(None):
          self._variable_cleanup_ops += [
              gen_resource_variable_ops.destroy_resource_op(
                  gen_resource_variable_ops.var_handle_op(
                      dtype, tensor_shape.TensorShape(None),
                      container=self._name, shared_name=v.op.name),
                  ignore_lookup_error=True)
          ]

      # Create the conditional to run the original op only when the variable
      # corresponding to the first output is not initialized.
      inited = state_ops.is_variable_initialized(op_output_var[0])
      v_f, v_t = control_flow_ops.ref_switch(op_output_var[0], inited)
      # pylint: disable=protected-access
      v_f_op = gen_array_ops._ref_identity(v_f)
      v_t_op = gen_array_ops._ref_identity(v_t)
      # pylint: enable=protected-access

      # The original op runs only on the "not initialized" branch (v_f_op).
      with ops.control_dependencies([v_f_op.op]):
        # Create the original op
        orig_op = self._wrap(
            super(ImperativeGraph, self).create_op(*args, **kwargs))
      shapes = [val.get_shape() for val in orig_op.outputs]

      controls = []
      for var, val in zip(op_output_var, orig_op.outputs):
        # NOTE(review): zero-element fully-defined outputs are presumably
        # skipped because assigning them is unnecessary — confirm.
        if (not val.get_shape().is_fully_defined() or
            val.get_shape().num_elements() > 0):
          assign_op = state_ops.assign(var, val, validate_shape=False)
          assign_op.set_shape(val.get_shape())
          controls.append(assign_op)

      values = []
      if len(controls) > 1:
        if control_flow_ops.IsSwitch(orig_op):
          # pylint: disable=protected-access
          controls = gen_control_flow_ops._ref_merge(controls)
          # pylint: enable=protected-access
        else:
          controls = control_flow_ops.tuple(controls)

      for var, val in zip(op_output_var, orig_op.outputs):
        with ops.control_dependencies(controls):
          with self.colocate_with(v_f_op):
            # Fresh value computed by the original op on first run.
            real_val = array_ops.identity(val)
          with ops.control_dependencies([v_t_op.op]):
            with self.colocate_with(v_t_op):
              # Cached value read from the variable on subsequent runs.
              stored_val = array_ops.identity(var)
            stored_val.set_shape(val.get_shape())
            # Merge picks whichever branch actually executed.
            real_val, _ = control_flow_ops.merge([real_val, stored_val])
        # Tag the merge so its gradient is routed through this graph's
        # custom merge gradient function.
        real_val.op.node_def.attr['_gradient_op_type'].CopyFrom(
            attr_value_pb2.AttrValue(
                s=compat.as_bytes(self._merge_op_type)))
        values.append(real_val)

      for i, _ in enumerate(shapes):
        values[i].set_shape(shapes[i])
      self._outputs_map[orig_op.name] = values
      try:
        self._gradient_function_map[
            orig_op.name] = ops.get_gradient_function(orig_op)
      except (KeyError, LookupError):
        # Ops without a registered gradient simply get no entry.
        pass
      else:
        orig_op.node_def.attr['_gradient_op_type'].CopyFrom(
            attr_value_pb2.AttrValue(
                s=compat.as_bytes(self._imperative_op_type)))

      return MultiOutputOperation(values)
def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
                     gate_gradients, aggregation_method, stop_gradients):
  """Implementation of gradients().

  Walks the graph backwards from `ys` to `xs`: for each op whose output
  gradients are all available, aggregates them, invokes the op's registered
  gradient function (or emits a SymbolicGradient node for function-call ops),
  and propagates the resulting input gradients toward `xs`.

  Raises:
    RuntimeError: if called while eager execution is enabled.
    LookupError: if an op on the path has no registered gradient function.
    ValueError: if a generated input gradient's shape is incompatible with
      the corresponding forward-op input.
  """
  if context.executing_eagerly():
    raise RuntimeError("tf.gradients not supported when eager execution "
                       "is enabled. Use tf.contrib.eager.GradientTape "
                       "instead.")
  ys = _AsList(ys)
  xs = _AsList(xs)
  stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)

  with ops.name_scope(
      name, "gradients",
      list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
    # Get a uid for this call to gradients that can be used to help
    # cluster ops for compilation.
    gradient_uid = ops.get_default_graph().unique_name("uid")
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    # Resource variables are differentiated through their underlying handle.
    xs = [
        x.handle if resource_variable_ops.is_resource_variable(x) else x
        for x in xs
    ]
    xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
        xs, name="x", as_ref=True)
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops,
                             gradient_uid)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t its outputs have
    # been collected.  Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    # NOTE(review): the identity presumably isolates each multiply-consumed y
    # so its injected grad_y is kept distinct from consumer gradients —
    # confirm against _PendingCount's handling.
    if len(ys) > 1:
      ys = [array_ops.identity(y) if y.consumers() else y for y in ys]
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    stop_gradient_ops = [t.op for t in stop_gradients]
    reachable_to_ops, pending_count, loop_state = _PendingCount(
        ops.get_default_graph(), to_ops, from_ops, colocate_gradients_with_ops)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op.  The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into a Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      # 'ready' handles the case where one output gradient relies on
      # another output's gradient.
      # pylint: disable=protected-access
      ready = (pending_count[op._id] == 0)
      if ready and op._id not in to_ops_set and op._id in reachable_to_ops:
        to_ops_set.add(op._id)
        queue.append(op)
      # pylint: enable=protected-access

    if loop_state:
      # Loop exits that receive no gradient still need a zeros-like seed so
      # the while-loop gradient state stays consistent.
      loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set)
      for y in loop_exits:
        if _IsTrainable(y):
          _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
          queue.append(y.op)

    stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count)
    while queue:
      # generate gradient subgraph for op.
      op = queue.popleft()
      with _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=True)
        out_grads = _AggregatedGrads(grads, op, gradient_uid, loop_state,
                                     aggregation_method)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=True)

        grad_fn = None
        func_call = None
        # pylint: disable=protected-access
        is_func_call = ops.get_default_graph()._is_function(op.type)
        # pylint: enable=protected-access
        has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads)
        if has_out_grads and (op._id not in stop_ops):
          if is_func_call:
            func_call = ops.get_default_graph()._get_function(op.type)
            # Note that __defun is not set if the graph is
            # imported. If it's set, we prefer to access the original
            # defun.
            func_call = getattr(op, "__defun", func_call)
            grad_fn = func_call.python_grad_func
          else:
            # A grad_fn must be defined, either as a function or as None
            # for ops that do not have gradients.
            try:
              grad_fn = ops.get_gradient_function(op)
            except LookupError:
              raise LookupError(
                  "No gradient defined for operation '%s' (op type: %s)" %
                  (op.name, op.type))
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=False)
        if (grad_fn or is_func_call) and has_out_grads:
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if (not isinstance(out_grad, ops.Tensor) and not out_grad) and (
                (not grad_fn and is_func_call) or _IsTrainable(op.outputs[i])):
              # Only trainable outputs or outputs for a function call that
              # will use SymbolicGradient get a zero gradient. Gradient
              # functions should ignore the gradient for other outputs.
              # TODO(apassos) gradients of resource handles might be an
              # issue here because of zeros.
              if loop_state:
                out_grads[i] = loop_state.ZerosLike(op, i)
              else:
                out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i)
          with ops.name_scope(op.name + "_grad"):
            # pylint: disable=protected-access
            with ops.get_default_graph()._original_op(op):
              # pylint: enable=protected-access
              if grad_fn:
                # If grad_fn was found, do not use SymbolicGradient even for
                # functions.
                in_grads = _MaybeCompile(grad_scope, op, func_call,
                                         lambda: grad_fn(op, *out_grads))
              else:
                # For function call ops, we add a 'SymbolicGradient'
                # node to the graph to compute gradients.
                in_grads = _MaybeCompile(grad_scope, op, func_call,
                                         lambda: _SymGrad(op, out_grads))
              in_grads = _AsList(in_grads)
              _VerifyGeneratedGradients(in_grads, op)
              if gate_gradients and len(
                  [x for x in in_grads if x is not None]) > 1:
                # Gate gradients on a neutral device/colocation so the tuple
                # does not inherit the forward op's placement.
                with ops.device(None):
                  with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
                      None,
                      gradient_uid,
                      ignore_existing=True):
                    in_grads = control_flow_ops.tuple(in_grads)
          _LogOpGradients(op, out_grads, in_grads)
        else:
          # If no grad_fn is defined or none of out_grads is available,
          # just propagate a list of None backwards.
          in_grads = [None] * len(op.inputs)
        for i, (t_in, in_grad) in enumerate(zip(op.inputs, in_grads)):
          if in_grad is not None:
            if (isinstance(in_grad, ops.Tensor) and
                t_in.dtype != dtypes.resource):
              try:
                in_grad.set_shape(t_in.get_shape())
              except ValueError:
                raise ValueError(
                    "Incompatible shapes between op input and calculated "
                    "input gradient. Forward operation: %s. Input index: %d. "
                    "Original input shape: %s. "
                    "Calculated input gradient shape: %s" %
                    (op.name, i, t_in.shape, in_grad.shape))
            _SetGrad(grads, t_in, in_grad)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=False)

      # Update pending count for the inputs of op and enqueue ready ops.
      _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state)

  if loop_state:
    loop_state.PostProcessing()
  return [_GetGrad(grads, x) for x in xs]
def testRegisterGradients(self):
    """A gradient registered for CopyOp must be returned by lookup."""
    graph = ops.Graph()
    source = an_op(graph)
    copied = copy_op(source)
    resolved = ops.get_gradient_function(copied.op)
    self.assertEqual(_CopyGrad, resolved)
def gradients(ys, xs, grad_ys=None, name="gradients",
              colocate_gradients_with_ops=False, gate_gradients=False,
              aggregation_method=None):
  """Constructs symbolic partial derivatives of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
  is a list of `Tensor`, holding the gradients received by the
  `ys`. The list must be the same length as `ys`.

  `gradients()` adds ops to the graph to output the partial
  derivatives of `ys` with respect to `xs`.  It returns a list of
  `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)`
  for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds
  the initial gradients for each y in `ys`.  When `grad_ys` is None,
  we fill in a tensor of '1's of the shape of y for each y in `ys`.  A
  user can provide their own initial `grad_ys` to compute the
  derivatives using a different initial gradient for each y (e.g., if
  one wanted to weight the gradient differently for each value in
  each y).

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as
      `ys` and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned
      for an operations.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not
      have a registered gradient function.
    ValueError: if the arguments are invalid.

  """
  ys = _AsList(ys)
  xs = _AsList(xs)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)

  with ops.op_scope(ys + xs + grad_ys, name, "gradients"):
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x")
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t its outputs have
    # been collected.  Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    pending_count, has_control_flow = _PendingCount(
        ops.get_default_graph(), to_ops, from_ops)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op.  The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into a Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      if op._id not in to_ops_set:
        to_ops_set.add(op._id)
        queue.append(op)
    # The set of 'from_ops'.
    stop_ops = _StopOps(from_ops, pending_count)
    while queue:
      # generate gradient subgraph for op.
      op = queue.popleft()
      with ops.device(_GetGradsDevice(op, colocate_gradients_with_ops)):
        if has_control_flow:
          control_flow_ops.EnterGradWhileContext(op)
        out_grads = _AggregatedGrads(grads, op, has_control_flow,
                                     aggregation_method)
        grad_fn = None
        if any(out_grads) and op._id not in stop_ops:
          # A grad_fn must be defined, either as a function or as None
          # for ops that do not have gradients.
          try:
            grad_fn = ops.get_gradient_function(op)
          except LookupError:
            raise LookupError(
                "No gradient defined for operation '%s' (op type: %s)" %
                (op.name, op.type))
        if grad_fn and any(out_grads):
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if (not out_grad and
                types.as_dtype(op.outputs[i].dtype).base_dtype in
                # NOTE(review): SOURCE is truncated mid-expression at this
                # point; the remainder of this function is not visible here.
def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
                     gate_gradients, aggregation_method, stop_gradients):
  """Implementation of gradients().

  Traverses the graph in reverse from `ys` toward `xs`, aggregating the
  gradients arriving at each op's outputs and applying the op's registered
  gradient function (or a SymbolicGradient node for function-call ops) to
  produce gradients for the op's inputs.

  Raises:
    RuntimeError: if called while eager execution is enabled.
    LookupError: if an op on the path has no registered gradient function.
    ValueError: if a generated input gradient's shape is incompatible with
      the corresponding forward-op input.
  """
  if context.executing_eagerly():
    raise RuntimeError("tf.gradients not supported when eager execution "
                       "is enabled. Use tf.contrib.eager.GradientTape "
                       "instead.")
  ys = _AsList(ys)
  xs = _AsList(xs)
  stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)

  with ops.name_scope(
      name, "gradients",
      list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    # Resource variables are differentiated through their underlying handle.
    xs = [
        x.handle if resource_variable_ops.is_resource_variable(x) else x
        for x in xs
    ]
    xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
        xs, name="x", as_ref=True)
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t its outputs have
    # been collected.  Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    # NOTE(review): the identity presumably keeps each multiply-consumed y's
    # injected grad_y distinct from its consumers' gradients — confirm.
    if len(ys) > 1:
      ys = [array_ops.identity(y) if y.consumers() else y for y in ys]
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    stop_gradient_ops = [t.op for t in stop_gradients]
    pending_count, loop_state = _PendingCount(
        ops.get_default_graph(), to_ops, from_ops, colocate_gradients_with_ops)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op.  The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into a Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      # 'ready' handles the case where one output gradient relies on
      # another output's gradient.
      # pylint: disable=protected-access
      ready = (pending_count[op._id] == 0)
      if ready and op._id not in to_ops_set:
        to_ops_set.add(op._id)
        queue.append(op)
      # pylint: enable=protected-access

    if loop_state:
      # Loop exits with no incoming gradient still need a zeros-like seed so
      # the while-loop gradient state stays consistent.
      loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set)
      for y in loop_exits:
        if _IsTrainable(y):
          _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
          queue.append(y.op)

    stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count)
    while queue:
      # generate gradient subgraph for op.
      op = queue.popleft()
      with _maybe_colocate_with(op, colocate_gradients_with_ops):
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=True)
        out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=True)

        grad_fn = None
        # pylint: disable=protected-access
        func_call = None
        is_func_call = ops.get_default_graph()._is_function(op.type)
        has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads)
        if has_out_grads and (op._id not in stop_ops):
          if is_func_call:
            func_call = ops.get_default_graph()._get_function(op.type)
            grad_fn = func_call.python_grad_func
            # pylint: enable=protected-access
          else:
            # A grad_fn must be defined, either as a function or as None
            # for ops that do not have gradients.
            try:
              grad_fn = ops.get_gradient_function(op)
            except LookupError:
              raise LookupError(
                  "No gradient defined for operation '%s' (op type: %s)" %
                  (op.name, op.type))
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=False)
        if (grad_fn or is_func_call) and has_out_grads:
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if (not isinstance(out_grad, ops.Tensor) and not out_grad) and (
                (not grad_fn and is_func_call) or _IsTrainable(op.outputs[i])):
              # Only trainable outputs or outputs for a function call that
              # will use SymbolicGradient get a zero gradient. Gradient
              # functions should ignore the gradient for other outputs.
              # TODO(apassos) gradients of resource handles might be an
              # issue here because of zeros.
              if loop_state:
                out_grads[i] = loop_state.ZerosLike(op, i)
              else:
                out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i)
          with ops.name_scope(op.name + "_grad"):
            # pylint: disable=protected-access
            with ops.get_default_graph()._original_op(op):
              # pylint: enable=protected-access
              if grad_fn:
                # If grad_fn was found, do not use SymbolicGradient even for
                # functions.
                in_grads = _MaybeCompile(grad_scope, op, func_call,
                                         lambda: grad_fn(op, *out_grads))
              else:
                # For function call ops, we add a 'SymbolicGradient'
                # node to the graph to compute gradients.
                in_grads = _MaybeCompile(grad_scope, op, func_call,
                                         lambda: _SymGrad(op, out_grads))
              in_grads = _AsList(in_grads)
              _VerifyGeneratedGradients(in_grads, op)
              if gate_gradients and len(
                  [x for x in in_grads if x is not None]) > 1:
                # Gate on a neutral device/colocation so the tuple does not
                # inherit the forward op's placement.
                with ops.device(None):
                  with ops.colocate_with(None, ignore_existing=True):
                    in_grads = control_flow_ops.tuple(in_grads)
          _LogOpGradients(op, out_grads, in_grads)
        else:
          # If no grad_fn is defined or none of out_grads is available,
          # just propagate a list of None backwards.
          in_grads = [None] * len(op.inputs)
        for i, (t_in, in_grad) in enumerate(zip(op.inputs, in_grads)):
          if in_grad is not None:
            if (isinstance(in_grad, ops.Tensor) and
                t_in.dtype != dtypes.resource):
              try:
                in_grad.set_shape(t_in.get_shape())
              except ValueError:
                raise ValueError(
                    "Incompatible shapes between op input and calculated "
                    "input gradient. Forward operation: %s. Input index: %d. "
                    "Original input shape: %s. "
                    "Calculated input gradient shape: %s" %
                    (op.name, i, t_in.shape, in_grad.shape))
            _SetGrad(grads, t_in, in_grad)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=False)

      # Update pending count for the inputs of op and enqueue ready ops.
      _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state)

  if loop_state:
    loop_state.PostProcessing()
  return [_GetGrad(grads, x) for x in xs]
def gradients(ys, xs, grad_ys=None, name="gradients",
              colocate_gradients_with_ops=False, gate_gradients=False,
              aggregation_method=None):
  """Constructs symbolic partial derivatives of sum of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
  is a list of `Tensor`, holding the gradients received by the
  `ys`. The list must be the same length as `ys`.

  `gradients()` adds ops to the graph to output the partial
  derivatives of `ys` with respect to `xs`.  It returns a list of
  `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)`
  for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds
  the initial gradients for each y in `ys`.  When `grad_ys` is None,
  we fill in a tensor of '1's of the shape of y for each y in `ys`.  A
  user can provide their own initial `grad_ys` to compute the
  derivatives using a different initial gradient for each y (e.g., if
  one wanted to weight the gradient differently for each value in
  each y).

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as
      `ys` and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned
      for an operations.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not
      have a registered gradient function.
    ValueError: if the arguments are invalid.

  """
  ys = _AsList(ys)
  xs = _AsList(xs)
  if grad_ys is None:
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)

  with ops.name_scope(name, "gradients", ys + xs + grad_ys):
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x")
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t its outputs have
    # been collected.  Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    pending_count, loop_state = _PendingCount(ops.get_default_graph(), to_ops,
                                              from_ops,
                                              colocate_gradients_with_ops)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op.  The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into a Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      # 'ready' handles the case where one output gradient relies on
      # another output's gradient.
      # pylint: disable=protected-access
      ready = (pending_count[op._id] == 0)
      if ready and op._id not in to_ops_set:
        to_ops_set.add(op._id)
        queue.append(op)
      # pylint: enable=protected-access

    if loop_state:
      # Loop exits with no incoming gradient still need a zeros-like seed so
      # the while-loop gradient state stays consistent.
      loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set)
      for y in loop_exits:
        if _IsTrainable(y):
          _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
          queue.append(y.op)

    # The set of 'from_ops'.
    stop_ops = _StopOps(from_ops, pending_count)
    while queue:
      # generate gradient subgraph for op.
      op = queue.popleft()
      with _maybe_colocate_with(op, colocate_gradients_with_ops):
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=True)
        out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=True)

        grad_fn = None
        # pylint: disable=protected-access
        is_func_call = ops.get_default_graph()._is_function(op.type)
        has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads)
        if has_out_grads and (op._id not in stop_ops):
          if is_func_call:
            grad_fn = ops.get_default_graph()._get_function(
                op.type).python_grad_func
            # pylint: enable=protected-access
          else:
            # A grad_fn must be defined, either as a function or as None
            # for ops that do not have gradients.
            try:
              grad_fn = ops.get_gradient_function(op)
            except LookupError:
              raise LookupError(
                  "No gradient defined for operation '%s' (op type: %s)" %
                  (op.name, op.type))
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=False)
        if (grad_fn or is_func_call) and has_out_grads:
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if (not isinstance(out_grad, ops.Tensor) and
                not out_grad) and _IsTrainable(op.outputs[i]):
              # Only floating-point outputs get a zero gradient. Gradient
              # functions should ignore the gradient for other outputs.
              if loop_state:
                out_grads[i] = loop_state.ZerosLike(op, i)
              else:
                out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i)
          with ops.name_scope(op.name + "_grad"):
            # pylint: disable=protected-access
            with ops.get_default_graph()._original_op(op):
              # pylint: enable=protected-access
              if grad_fn:
                # If grad_fn was found, do not use SymbolicGradient even for
                # functions.
                in_grads = grad_fn(op, *out_grads)
              else:
                # For function call ops, we add a 'SymbolicGradient'
                # node to the graph to compute gradients.
                in_grads = _SymGrad(op, out_grads)
              in_grads = _AsList(in_grads)
              _VerifyGeneratedGradients(in_grads, op)
              if gate_gradients and len(
                  [x for x in in_grads if x is not None]) > 1:
                in_grads = control_flow_ops.tuple(in_grads)
          _LogOpGradients(op, out_grads, in_grads)
        else:
          # If no grad_fn is defined or none of out_grads is available,
          # just propagate a list of None backwards.
          in_grads = [None] * len(op.inputs)
        for t_in, in_grad in zip(op.inputs, in_grads):
          if in_grad is not None:
            if isinstance(in_grad, ops.Tensor):
              in_grad.set_shape(t_in.get_shape())
            _SetGrad(grads, t_in, in_grad)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=False)

      # Update pending count for the inputs of op and enqueue ready ops.
      _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state)

  if loop_state:
    loop_state.PostProcessing()
  return [_GetGrad(grads, x) for x in xs]
def _GradientsHelper(ys,
                     xs,
                     grad_ys=None,
                     name="gradients",
                     colocate_gradients_with_ops=False,
                     gate_gradients=False,
                     aggregation_method=None,
                     stop_gradients=None,
                     unconnected_gradients=UnconnectedGradients.NONE,
                     src_graph=None):
  """Implementation of gradients().

  Builds the symbolic backpropagation subgraph for d(ys)/d(xs): ops between
  `ys` and `xs` are visited in reverse topological order; at each op the
  incoming output gradients are aggregated and the op's registered gradient
  function (or a SymbolicGradient node, for function-call ops) is invoked to
  produce the gradients for its inputs.

  Args:
    ys: Tensor or list of tensors to be differentiated.
    xs: Tensor or list of tensors to differentiate with respect to.
    grad_ys: Optional initial gradients for each y; entries that are None are
      filled in with defaults by `_DefaultGradYs`.
    name: Name scope under which all created gradient ops are grouped.
    colocate_gradients_with_ops: If True, each gradient op is colocated with
      its corresponding forward op.
    gate_gradients: If True, the generated input gradients for an op are
      wrapped in `control_flow_ops.tuple` so all are computed before any is
      consumed.
    aggregation_method: Forwarded to `_AggregatedGrads` to control how
      multiple incoming gradients for the same tensor are combined.
    stop_gradients: Tensors to treat as constants; backprop stops at their
      producing ops.
    unconnected_gradients: `UnconnectedGradients` policy for xs that are not
      connected to any y.
    src_graph: Graph containing ys/xs; defaults to the current default graph.

  Returns:
    A list with one gradient value per tensor in `xs` (possibly None or
    zeros, depending on `unconnected_gradients`).

  Raises:
    RuntimeError: If called while eager execution is enabled.
    ValueError: If `unconnected_gradients` is not a valid policy, or a
      gradient function returns a gradient whose shape is incompatible with
      the corresponding op input.
    LookupError: If an op with incoming gradients has no registered gradient
      function and is not a function call.
  """
  # Symbolic gradient construction requires graph mode; eager callers are
  # directed to tf.GradientTape instead.
  if context.executing_eagerly():
    raise RuntimeError("tf.gradients is not supported when eager execution "
                       "is enabled. Use tf.GradientTape instead.")
  if src_graph is None:
    src_graph = ops.get_default_graph()
  try:
    unconnected_gradients = UnconnectedGradients(unconnected_gradients)
  except ValueError:
    raise ValueError(
        "Unknown value for unconnected_gradients: %r" % unconnected_gradients)

  # If src_graph is a _FuncGraph (i.e. a function body), gather it and all
  # ancestor graphs. This is necessary for correctly handling captured values.
  func_graphs = []
  curr_graph = src_graph
  while _IsFunction(curr_graph):
    func_graphs.append(curr_graph)
    if isinstance(curr_graph, FuncGraph):
      curr_graph = curr_graph.outer_graph
    else:
      # Legacy function representation; its parent graph is only reachable
      # through the private _outer_graph attribute.
      assert isinstance(curr_graph, framework_function._FuncGraph)  # pylint: disable=protected-access
      curr_graph = curr_graph._outer_graph  # pylint: disable=protected-access

  ys = _AsList(ys)
  xs = _AsList(xs)
  stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
  if grad_ys is None:
    # A None entry means "use the default gradient" for that y;
    # _DefaultGradYs fills these in below.
    grad_ys = [None] * len(ys)
  else:
    grad_ys = _AsList(grad_ys)

  with ops.name_scope(
      name, "gradients",
      list(ys) + list(xs) + list(stop_gradients) +
      list(grad_ys)) as grad_scope:
    # Get a uid for this call to gradients that can be used to help
    # cluster ops for compilation.
    gradient_uid = ops.get_default_graph().unique_name("uid")
    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
    # Differentiating w.r.t. a resource variable means differentiating
    # w.r.t. its handle tensor.
    xs = [
        x.handle if resource_variable_ops.is_resource_variable(x) else x
        for x in xs
    ]
    xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
        xs, name="x", as_ref=True)
    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops,
                             gradient_uid)

    # The approach we take here is as follows: Create a list of all ops in the
    # subgraph between the ys and xs. Visit these ops in reverse order of ids
    # to ensure that when we visit an op the gradients w.r.t its outputs have
    # been collected. Then aggregate these gradients if needed, call the op's
    # gradient function, and add the generated gradients to the gradients for
    # its input.

    # Initialize the pending count for ops in the connected subgraph from ys
    # to the xs.
    to_ops = [t.op for t in ys]
    from_ops = [t.op for t in xs]
    stop_gradient_ops = [t.op for t in stop_gradients]
    reachable_to_ops, pending_count, loop_state = _PendingCount(
        to_ops, from_ops, colocate_gradients_with_ops, func_graphs, xs)

    # Iterate over the collected ops.
    #
    # grads: op => list of gradients received on each output endpoint of the
    # op. The gradients for each endpoint are initially collected as a list.
    # When it is time to call the op's gradient function, for each endpoint we
    # aggregate the list of received gradients into a Add() Operation if there
    # is more than one.
    grads = {}

    # Add the initial gradients for the ys.
    for y, grad_y in zip(ys, grad_ys):
      _SetGrad(grads, y, grad_y)

    # Initialize queue with to_ops.
    queue = collections.deque()
    # Add the ops in 'to_ops' into the queue.
    to_ops_set = set()
    for op in to_ops:
      # 'ready' handles the case where one output gradient relies on
      # another output's gradient.
      ready = (pending_count[op] == 0)
      if ready and op not in to_ops_set and op in reachable_to_ops:
        to_ops_set.add(op)
        queue.append(op)

    if loop_state:
      # Seed zero gradients for trainable but otherwise-unused while-loop
      # exits so loop gradient bookkeeping stays consistent.
      loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set)
      for y in loop_exits:
        if IsTrainable(y):
          _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
          queue.append(y.op)

    stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count, xs)
    while queue:
      # generate gradient subgraph for op.
      op = queue.popleft()
      with _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=True)
        out_grads = _AggregatedGrads(grads, op, gradient_uid, loop_state,
                                     aggregation_method)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=True)

        grad_fn = None
        func_call = None
        is_partitioned_call = _IsPartitionedCall(op)
        # pylint: disable=protected-access
        is_func_call = (
            src_graph._is_function(op.type) or is_partitioned_call)
        # pylint: enable=protected-access
        # An out_grad may be an empty list (no gradient received yet) rather
        # than a Tensor, hence the two-sided truthiness test.
        has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads)
        if has_out_grads and (op not in stop_ops):
          try:
            grad_fn = ops.get_gradient_function(op)
          except LookupError:
            if is_func_call:
              if is_partitioned_call:
                func_call = src_graph._get_function(  # pylint: disable=protected-access
                    compat.as_bytes(op.get_attr("f").name))
              else:
                func_call = src_graph._get_function(op.type)  # pylint: disable=protected-access
              # Note that __defun is not set if the graph is
              # imported. If it's set, we prefer to access the original
              # defun.
              func_call = getattr(op, "__defun", func_call)
              grad_fn = func_call.python_grad_func
            else:
              raise LookupError(
                  "No gradient defined for operation '%s' (op type: %s)" %
                  (op.name, op.type))
        if loop_state:
          loop_state.EnterGradWhileContext(op, before=False)

        # NOTE(skyewm): We don't support computing gradients wrt a loop
        # variable unless it's within the context of a single iteration (i.e.
        # the gradient is wrt to the loop parameter in the body function, not
        # wrt or through the initial value). This means if we're in a while
        # loop context, we should never see a switch node from this context.
        # pylint: disable=protected-access
        if (control_flow_util.IsSwitch(op) and
            op._control_flow_context is not None and
            op._control_flow_context.IsWhileContext() and
            op._control_flow_context ==
            ops.get_default_graph()._get_control_flow_context()):
          _RaiseNoGradWrtInitialLoopValError(op, from_ops, xs)
        # pylint: enable=protected-access

        if (grad_fn or is_func_call) and has_out_grads:
          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
          # output, it means that the cost does not depend on output[i],
          # therefore dC/doutput[i] is 0.
          for i, out_grad in enumerate(out_grads):
            if (not isinstance(out_grad, ops.Tensor) and not out_grad) and (
                (not grad_fn and is_func_call) or IsTrainable(op.outputs[i])):
              # Only trainable outputs or outputs for a function call that
              # will use SymbolicGradient get a zero gradient. Gradient
              # functions should ignore the gradient for other outputs.
              # TODO(apassos) gradients of resource handles might be an
              # issue here because of zeros.
              if loop_state:
                out_grads[i] = loop_state.ZerosLike(op, i)
              else:
                out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i)
          with ops.name_scope(op.name + "_grad"):
            # pylint: disable=protected-access
            with src_graph._original_op(op):
              # pylint: enable=protected-access
              if grad_fn:
                # If grad_fn was found, do not use SymbolicGradient even for
                # functions.
                in_grads = _MaybeCompile(grad_scope, op, func_call,
                                         lambda: grad_fn(op, *out_grads))
              else:
                # For function call ops, we add a 'SymbolicGradient'
                # node to the graph to compute gradients.
                in_grads = _MaybeCompile(grad_scope, op, func_call,
                                         lambda: _SymGrad(op, out_grads))
              in_grads = _AsList(in_grads)
              _VerifyGeneratedGradients(in_grads, op)
              if gate_gradients and len([x for x in in_grads
                                         if x is not None]) > 1:
                # Clear device/colocation so the tuple's control gates are
                # not pinned to any single forward op's device.
                with ops.device(None):
                  with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
                      None,
                      gradient_uid,
                      ignore_existing=True):
                    in_grads = control_flow_ops.tuple(in_grads)
          _LogOpGradients(op, out_grads, in_grads)
        else:
          # If no grad_fn is defined or none of out_grads is available,
          # just propagate a list of None backwards.
          in_grads = [None] * len(_NonEagerInputs(op, xs))
        # Propagate each computed input gradient back to the producer of the
        # corresponding input, checking shape compatibility for plain Tensors
        # (resource-dtype inputs are exempt from the shape check).
        for i, (t_in, in_grad) in enumerate(zip(_NonEagerInputs(op, xs),
                                                in_grads)):
          if in_grad is not None:
            if (isinstance(in_grad, ops.Tensor) and
                t_in.dtype != dtypes.resource):
              try:
                in_grad.set_shape(t_in.get_shape())
              except ValueError:
                raise ValueError(
                    "Incompatible shapes between op input and calculated "
                    "input gradient. Forward operation: %s. Input index: %d. "
                    "Original input shape: %s. "
                    "Calculated input gradient shape: %s" %
                    (op.name, i, t_in.shape, in_grad.shape))
            _SetGrad(grads, t_in, in_grad)
        if loop_state:
          loop_state.ExitGradWhileContext(op, before=False)

      # Update pending count for the inputs of op and enqueue ready ops.
      _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count,
                                    loop_state, xs)

  if loop_state:
    loop_state.PostProcessing()
  return [_GetGrad(grads, x, unconnected_gradients) for x in xs]