def _record_gradient(op_name, inputs, attrs, results, ctx, name):
  """Records gradients for a TensorFlow operation.

  Args:
    op_name: Name of the TensorFlow operation (see REGISTER_OP in C++ code) to
      execute.
    inputs: A flat list of Tensor object inputs to the operation.
    attrs: A tuple with alternating string attr names and attr values for this
      operation.
    results: The results of the operation (as a flat list).
    ctx: The value of context.context().
    name: Customized name for the operation.

  Returns:
    None. The operation and its gradient function are recorded on the active
    tape as a side effect.
  """
  if not tape.could_possibly_record():
    return

  if op_name in _ops_which_dont_need_outputs:
    op_outputs = None
  else:
    # TODO(apassos) this line creates a weak circular reference where the
    # backprop function keeps an output alive which in turn keeps the tape
    # entry alive which keeps the backprop function alive. Figure out how to
    # break this up without breaking second derivatives of ops like Exp whose
    # gradients depend only on the outputs.
    op_outputs = results

  if op_name in _ops_which_dont_need_inputs:
    op_inputs = None
  else:
    op_inputs = inputs

  num_inputs = len(inputs)

  def grad_fn(*orig_outputs):
    """Generated gradient function."""
    result = _magic_gradient_function(op_name, attrs, num_inputs, op_inputs,
                                      op_outputs, orig_outputs)
    if _tracing:
      print("Gradient for", (name if name else op_name), "inputs", op_inputs,
            "output_grads", orig_outputs, "gradients", result)
    return result

  inputs = [ops.internal_convert_to_tensor(x, ctx=ctx) for x in inputs]
  tape.record_operation(op_name, results, inputs, [], grad_fn)
  if _tracing:
    print("Computed op", (name if name else op_name), "inputs", inputs,
          "outputs", results)
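# Hedged usage sketch (not from the file above): _record_gradient is the
# private hook behind the public tf.GradientTape API. Each eager op is
# recorded on the active tape together with a grad_fn, and tape.gradient()
# later replays those grad_fns in reverse. Only public TensorFlow APIs are
# used below.
import tensorflow as tf

x = tf.constant(3.0)
with tf.GradientTape() as g:
  g.watch(x)              # make the tape record operations involving x
  y = tf.exp(x)           # the Exp op is recorded with its grad_fn here
dy_dx = g.gradient(y, x)  # replays the recorded grad_fn: d(exp(x))/dx = exp(x)
print(dy_dx.numpy())      # ~20.0855, i.e. exp(3.0)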
def __init__(self):
  if context.in_eager_mode() and tape.could_possibly_record():
    raise ValueError("Cannot isolate Eager execution with an active tape.")
  # In Eager, Graphs set a container which isolates resources, and maintain a
  # VariableStore which caches ResourceVariable objects created through
  # get_variable. So setting the default Graph has the side effect of
  # isolating Eager resources.
  with context.eager_mode():
    # Create the graph in Eager mode, as this provides stricter semantics
    # (i.e. has a unique container prefix). This prevents implicit sharing
    # when a Graph-mode graph is created and then Eager mode is enabled (an
    # error through enable_eager_execution, but common with context managers
    # in unit tests).
    self._graph_as_default_context_manager = ops.Graph().as_default()
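# Hedged sketch of how the __init__ above would typically be completed: the
# class reads like a context manager that isolates Eager resources by holding
# a default-Graph context manager. The class name Isolator and the
# __enter__/__exit__ bodies are assumptions for illustration, not taken from
# the source.
class Isolator(object):

  def __enter__(self):
    # Activate the graph (and hence its unique resource container) stored by
    # __init__ above.
    self._graph_as_default_context_manager.__enter__()
    return self

  def __exit__(self, exc_type, exc_value, traceback):
    # Release the default-graph context, ending the isolation scope.
    return self._graph_as_default_context_manager.__exit__(
        exc_type, exc_value, traceback)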
def __call__(self, device, token, args):
  """Calls `self._func` in eager mode, recording the tape if needed."""
  use_tape_cache = (
      self._support_graph_mode_gradient or tape_lib.could_possibly_record())

  if use_tape_cache:
    with backprop.GradientTape() as tape:
      for tensor in args:
        for t in nest.flatten(tensor):
          if backprop_util.IsTrainable(t):
            tape.watch(t)
      outputs = self._call(device, args)
    tape_cache[compat.as_bytes(token)] = (tape, args, outputs)
  else:
    outputs = self._call(device, args)

  return outputs
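# Hedged usage sketch (public API only): the tape cached by __call__ above is
# what lets tf.py_function participate in backprop even though its body runs
# as opaque Python.
import tensorflow as tf

def square(x):
  return x * x  # executed eagerly inside the py_function

x = tf.constant(2.0)
with tf.GradientTape() as g:
  g.watch(x)
  y = tf.py_function(square, [x], Tout=tf.float32)  # a tape is cached per call
dy_dx = g.gradient(y, x)  # gradient flows through the cached tape
print(dy_dx.numpy())      # 4.0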
def inner(*args, _checkpoint=False, _watch_vars=None, _force_seed=False,
          **kwargs):
  if _force_seed:
    if isinstance(_force_seed, Iterator):
      seed = next(_force_seed)
    else:
      seed = random.randint(1, 1 << 31)
  if _checkpoint and tape.could_possibly_record():
    if _watch_vars is None:
      _watch_vars = []
    # Watch every float32 tensor argument plus any explicitly requested
    # variables, deduplicated by reference.
    flat_inputs = nest.flatten(args) + nest.flatten(list(kwargs.values()))
    flat_inputs = [x for x in flat_inputs if tf.is_tensor(x)]
    flat_inputs = [x for x in flat_inputs if x.dtype == tf.float32]
    unique_inputs = [
        x.deref() for x in set(x.experimental_ref() for x in flat_inputs)
    ]
    unique_vars = [
        v.deref()
        for v in set(v.experimental_ref() for v in _watch_vars)
        if not any(v is inp for inp in flat_inputs)
    ]
    watches = unique_inputs + unique_vars
    tensor_watches = [tf.convert_to_tensor(x) for x in watches]
    # Run the forward pass without recording it; only the recomputation
    # inside grad() below is differentiated.
    with tape.stop_recording():
      if _force_seed:
        tf.random.set_seed(seed)
      result = f(*args, **kwargs)
    flat_result = nest.flatten(result)
    # tf.custom_gradient also wraps its outputs in tf.identity, so that the
    # tensors recorded on the tape are distinct from the raw op outputs.
    flat_result = [tf.identity(x) for x in flat_result]
    output = nest.pack_sequence_as(result, flat_result)
    del flat_inputs
    del result
    del unique_inputs
    del unique_vars

    def grad(*output_grads):
      """Recomputes f under a fresh tape and backpropagates through it."""
      with tf.GradientTape() as g:
        g.watch(watches)
        if _force_seed:
          tf.random.set_seed(seed)
        recomputed_output = f(*args, **kwargs)
        recomputed_output = [
            tf.identity(x) for x in nest.flatten(recomputed_output)
        ]
      grads = g.gradient(
          recomputed_output, watches, output_gradients=output_grads)
      del g
      return grads

    tape.record_operation(str(f), flat_result, tensor_watches, grad)
    return output
  else:
    if _force_seed:
      tf.random.set_seed(seed)
    return f(*args, **kwargs)
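# Hypothetical usage of the decorator above, assuming `inner` is returned by
# a wrapper (called recompute_grad here purely for illustration) that closes
# over the user function f. With _checkpoint=True the forward pass is not
# recorded; grad() re-runs the block during backprop, trading compute for
# memory.
@recompute_grad  # assumed wrapper producing `inner`; name not in the source
def block(x):
  for _ in range(4):
    x = tf.nn.relu(tf.matmul(x, tf.ones([64, 64])))
  return x

x = tf.random.normal([8, 64])
with tf.GradientTape() as g:
  g.watch(x)
  y = block(x, _checkpoint=True)  # forward pass runs under stop_recording()
  loss = tf.reduce_sum(y)
grads = g.gradient(loss, x)  # triggers grad(): block is recomputed here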