def _compute_energy_change(current_target_log_prob,
                           current_momentums,
                           proposed_target_log_prob,
                           proposed_momentums,
                           independent_chain_ndims,
                           name=None):
  """Helper to `kernel` which computes the energy change."""
  with ops.name_scope(
      name, "compute_energy_change",
      ([current_target_log_prob, proposed_target_log_prob,
        independent_chain_ndims] + current_momentums + proposed_momentums)):
    # Abbreviate lk0=log_kinetic_energy and lk1=proposed_log_kinetic_energy
    # since they're a mouthful and this lets us inline more.
    lk0, lk1 = [], []
    for current_momentum, proposed_momentum in zip(current_momentums,
                                                   proposed_momentums):
      axis = math_ops.range(independent_chain_ndims,
                            array_ops.rank(current_momentum))
      lk0.append(_log_sum_sq(current_momentum, axis))
      lk1.append(_log_sum_sq(proposed_momentum, axis))

    lk0 = -np.log(2.) + math_ops.reduce_logsumexp(
        array_ops.stack(lk0, axis=-1), axis=-1)
    lk1 = -np.log(2.) + math_ops.reduce_logsumexp(
        array_ops.stack(lk1, axis=-1), axis=-1)
    lp0 = -current_target_log_prob   # log_potential
    lp1 = -proposed_target_log_prob  # proposed_log_potential
    x = array_ops.stack([lp1, math_ops.exp(lk1), -lp0, -math_ops.exp(lk0)],
                        axis=-1)

    # The sum is NaN if any element is NaN or we see both +Inf and -Inf.
    # Thus we will replace such rows with infinite energy change which implies
    # rejection. Recall that float-comparisons with NaN are always False.
    is_sum_determinate = (
        math_ops.reduce_all(math_ops.is_finite(x) | (x >= 0.), axis=-1) &
        math_ops.reduce_all(math_ops.is_finite(x) | (x <= 0.), axis=-1))
    is_sum_determinate = array_ops.tile(
        is_sum_determinate[..., array_ops.newaxis],
        multiples=array_ops.concat([
            array_ops.ones(array_ops.rank(is_sum_determinate),
                           dtype=dtypes.int32),
            [4],
        ], axis=0))
    x = array_ops.where(
        is_sum_determinate, x,
        array_ops.fill(array_ops.shape(x),
                       value=x.dtype.as_numpy_dtype(np.inf)))

    return math_ops.reduce_sum(x, axis=-1)
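# Hedged illustration (not part of the library code above): the helper works in
# log-space, so the kinetic energy 0.5 * sum(p**2) over all momentum parts is
# assembled as -log(2) plus a logsumexp over per-part log-sum-of-squares terms.
# A minimal NumPy sketch of that identity for a single chain; scipy's logsumexp
# is assumed available only for the check.
import numpy as np
from scipy.special import logsumexp

momentums = [np.array([0.3, -1.2]), np.array([0.7])]
direct = 0.5 * sum(np.sum(p ** 2) for p in momentums)
log_sum_sq = [np.log(np.sum(p ** 2)) for p in momentums]  # per-part log(sum(p**2))
via_log_space = np.exp(-np.log(2.) + logsumexp(log_sum_sq))
assert np.allclose(direct, via_log_space)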
def _is_all_finite(grads):
  """Returns a scalar boolean tensor indicating if all gradients are finite."""
  is_finite_per_grad = [
      math_ops.reduce_all(math_ops.is_finite(g)) for g in grads if g is not None
  ]
  return math_ops.reduce_all(is_finite_per_grad)
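# Hedged usage sketch via the public TF 2.x API (illustrative only, not the
# helper above): the same all-finite check over a list of gradients, e.g. to
# decide whether a loss-scaled update should be applied or skipped.
import tensorflow as tf

grads = [tf.constant([1.0, 2.0]), tf.constant([float('inf')])]
all_finite = tf.reduce_all(
    [tf.reduce_all(tf.math.is_finite(g)) for g in grads if g is not None])
print(all_finite.numpy())  # False: the second gradient contains +inf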
def adaptive_runge_kutta_step(rk_state, history, n_steps):
  """Take an adaptive Runge-Kutta step to integrate the ODE."""
  y0, f0, _, t0, dt, interp_coeff = rk_state

  check_underflow = control_flow_ops.Assert(t0 + dt > t0,
                                            ['underflow in dt', dt])
  check_max_num_steps = control_flow_ops.Assert(
      n_steps < max_num_steps, ['max_num_steps exceeded'])
  check_numerics = control_flow_ops.Assert(
      math_ops.reduce_all(math_ops.is_finite(abs(y0))),
      ['non-finite values in state `y`', y0])

  # The assertions only run if they are attached as control dependencies of
  # the Runge-Kutta step that consumes the checked values.
  with ops.control_dependencies(
      [check_underflow, check_max_num_steps, check_numerics]):
    y1, f1, y1_error, k = _runge_kutta_step(func, y0, f0, t0, dt)

  error_tol = atol + rtol * math_ops.maximum(abs(y0), abs(y1))
  tensor_error_ratio = _abs_square(y1_error) / _abs_square(error_tol)
  error_ratio = math_ops.sqrt(math_ops.reduce_mean(tensor_error_ratio))
  accept_step = error_ratio <= 1

  y_next = control_flow_ops.cond(accept_step, lambda: y1, lambda: y0)
  f_next = control_flow_ops.cond(accept_step, lambda: f1, lambda: f0)
  t_next = control_flow_ops.cond(accept_step, lambda: t0 + dt, lambda: t0)
  interp_coeff = control_flow_ops.cond(
      accept_step, lambda: _interp_fit_rk(y0, y1, k, dt), lambda: interp_coeff)
  dt_next = _optimal_step_size(dt, error_ratio, safety, ifactor, dfactor)
  rk_state = _RungeKuttaState(y_next, f_next, t0, t_next, dt_next,
                              interp_coeff)

  history = _History(_ta_append(history.integrate_points, t0 + dt),
                     _ta_append(history.error_ratio, error_ratio))
  return rk_state, history, n_steps + 1
def adaptive_runge_kutta_step(rk_state, history, n_steps):
  """Take an adaptive Runge-Kutta step to integrate the ODE."""
  ys0, fs0, _, t0, us0, dt, interp_coeff = rk_state
  with ops.name_scope('assertions'):
    check_underflow = control_flow_ops.Assert(
        (t0 + dt > t0 and first_step > 0) or (t0 + dt < t0 and first_step < 0),
        ['underflow in dt', dt])
    check_max_num_steps = control_flow_ops.Assert(
        n_steps < max_num_steps, ['max_num_steps exceeded'])
    check_numerics = _traverse_and_return_flattened(
        ys0,
        lambda y, _: control_flow_ops.Assert(
            math_ops.reduce_all(math_ops.is_finite(abs(y))),
            ['non-finite values in state `y`', y]))
  with ops.control_dependencies(
      [check_underflow, check_max_num_steps] + check_numerics):
    ys1, fs1, ys1_error, ks = _runge_kutta_step(func, ys0, fs0, t0, us0, dt)

  with ops.name_scope('error_ratio'):
    # We use the same approach as the dopri5 fortran code.
    error_tol = _multi_traverse_and_return_nested(
        [ys0, ys1],
        lambda y0, y1, _: atol + rtol * math_ops.maximum(abs(y0), abs(y1)))
    tensor_error_ratio = _multi_traverse_and_return_nested(
        [ys1_error, error_tol],
        lambda err, tol, _: _abs_square(err) / _abs_square(tol))
    # Could also use reduce_maximum here.
    error_ratio = math_ops.sqrt(
        math_ops.reduce_mean(
            _traverse_and_return_flattened(
                tensor_error_ratio, lambda err, _: math_ops.reduce_mean(err))))
    accept_step = error_ratio <= 1

  with ops.name_scope('update/rk_state'):
    # If we don't accept the step, the _RungeKuttaState will be useless
    # (covering a time-interval of size 0), but that's OK, because in such
    # cases we always immediately take another Runge-Kutta step.
    ys_next = control_flow_ops.cond(accept_step, lambda: ys1, lambda: ys0)
    fs_next = control_flow_ops.cond(accept_step, lambda: fs1, lambda: fs0)
    ts_next = control_flow_ops.cond(accept_step, lambda: t0 + dt, lambda: t0)
    us_next = us0
    interp_coeff = control_flow_ops.cond(
        accept_step, lambda: _interp_fit_rk(ys0, ys1, ks, dt),
        lambda: interp_coeff)
    dt_next = _optimal_step_size(dt, error_ratio, safety, ifactor, dfactor)
    rk_state = _RungeKuttaState(ys_next, fs_next, t0, ts_next, us_next,
                                dt_next, interp_coeff)

  with ops.name_scope('update/history'):
    history = _History(_ta_append(history.integrate_points, t0 + dt),
                       _ta_append(history.error_ratio, error_ratio))
  return rk_state, history, n_steps + 1
def _compare(self, x, use_gpu):
  np_finite, np_inf, np_nan = np.isfinite(x), np.isinf(x), np.isnan(x)
  with test_util.device(use_gpu=use_gpu):
    inx = ops.convert_to_tensor(x)
    ofinite = math_ops.is_finite(inx)
    oinf = math_ops.is_inf(inx)
    onan = math_ops.is_nan(inx)
    tf_finite, tf_inf, tf_nan = self.evaluate([ofinite, oinf, onan])
  self.assertAllEqual(np_inf, tf_inf)
  self.assertAllEqual(np_nan, tf_nan)
  self.assertAllEqual(np_finite, tf_finite)
  self.assertShapeEqual(np_inf, oinf)
  self.assertShapeEqual(np_nan, onan)
  self.assertShapeEqual(np_finite, ofinite)
def _compare(self, x, use_gpu):
  np_finite, np_inf, np_nan = np.isfinite(x), np.isinf(x), np.isnan(x)
  with self.test_session(
      use_gpu=use_gpu,
      force_gpu=use_gpu and test_util.is_gpu_available()) as sess:
    inx = ops.convert_to_tensor(x)
    ofinite = math_ops.is_finite(inx)
    oinf = math_ops.is_inf(inx)
    onan = math_ops.is_nan(inx)
    tf_finite, tf_inf, tf_nan = sess.run([ofinite, oinf, onan])
  self.assertAllEqual(np_inf, tf_inf)
  self.assertAllEqual(np_nan, tf_nan)
  self.assertAllEqual(np_finite, tf_finite)
  self.assertShapeEqual(np_inf, oinf)
  self.assertShapeEqual(np_nan, onan)
  self.assertShapeEqual(np_finite, ofinite)
def adaptive_runge_kutta_step(rk_state, history, n_steps):
  """Take an adaptive Runge-Kutta step to integrate the ODE."""
  y0, f0, _, t0, dt, interp_coeff = rk_state
  with ops.name_scope('assertions'):
    check_underflow = control_flow_ops.Assert(t0 + dt > t0,
                                              ['underflow in dt', dt])
    check_max_num_steps = control_flow_ops.Assert(
        n_steps < max_num_steps, ['max_num_steps exceeded'])
    check_numerics = control_flow_ops.Assert(
        math_ops.reduce_all(math_ops.is_finite(abs(y0))),
        ['non-finite values in state `y`', y0])
  with ops.control_dependencies(
      [check_underflow, check_max_num_steps, check_numerics]):
    y1, f1, y1_error, k = _runge_kutta_step(func, y0, f0, t0, dt)

  with ops.name_scope('error_ratio'):
    # We use the same approach as the dopri5 fortran code.
    error_tol = atol + rtol * math_ops.maximum(abs(y0), abs(y1))
    tensor_error_ratio = _abs_square(y1_error) / _abs_square(error_tol)
    # Could also use reduce_maximum here.
    error_ratio = math_ops.sqrt(math_ops.reduce_mean(tensor_error_ratio))
    accept_step = error_ratio <= 1

  with ops.name_scope('update/rk_state'):
    # If we don't accept the step, the _RungeKuttaState will be useless
    # (covering a time-interval of size 0), but that's OK, because in such
    # cases we always immediately take another Runge-Kutta step.
    y_next = control_flow_ops.cond(accept_step, lambda: y1, lambda: y0)
    f_next = control_flow_ops.cond(accept_step, lambda: f1, lambda: f0)
    t_next = control_flow_ops.cond(accept_step, lambda: t0 + dt, lambda: t0)
    interp_coeff = control_flow_ops.cond(
        accept_step, lambda: _interp_fit_rk(y0, y1, k, dt),
        lambda: interp_coeff)
    dt_next = _optimal_step_size(dt, error_ratio, safety, ifactor, dfactor)
    rk_state = _RungeKuttaState(y_next, f_next, t0, t_next, dt_next,
                                interp_coeff)

  with ops.name_scope('update/history'):
    history = _History(_ta_append(history.integrate_points, t0 + dt),
                       _ta_append(history.error_ratio, error_ratio))
  return rk_state, history, n_steps + 1
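# Hedged worked example (NumPy only, values are made up): the acceptance test
# above compares a scaled RMS error against 1 using a mixed absolute/relative
# tolerance, exactly as in the 'error_ratio' block.
import numpy as np

atol, rtol = 1e-6, 1e-3
y0 = np.array([1.0, -2.0])
y1 = np.array([1.1, -2.2])
y1_error = np.array([5e-4, 1e-3])  # per-component error estimate
error_tol = atol + rtol * np.maximum(np.abs(y0), np.abs(y1))
error_ratio = np.sqrt(np.mean((y1_error / error_tol) ** 2))
accept_step = error_ratio <= 1
print(error_ratio, accept_step)  # ~0.45 <= 1, so this step would be accepted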
def _compare(self, x, use_gpu):
  with test_util.device(use_gpu=use_gpu):
    inx = ops.convert_to_tensor(x)
    ofinite = math_ops.is_finite(inx)
    oinf = math_ops.is_inf(inx)
    onan = math_ops.is_nan(inx)
    tf_finite, tf_inf, tf_nan = self.evaluate([ofinite, oinf, onan])
  if x.dtype == dtypes_lib.bfloat16.as_numpy_dtype:
    # Numpy will implicitly convert bfloat16 values to float16, so we cast to
    # float32 to avoid this.
    x = x.astype(np.float32)
  np_finite, np_inf, np_nan = np.isfinite(x), np.isinf(x), np.isnan(x)
  self.assertAllEqual(np_inf, tf_inf)
  self.assertAllEqual(np_nan, tf_nan)
  self.assertAllEqual(np_finite, tf_finite)
  self.assertShapeEqual(np_inf, oinf)
  self.assertShapeEqual(np_nan, onan)
  self.assertShapeEqual(np_finite, ofinite)
def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None):
  """Clips values of multiple tensors by the ratio of the sum of their norms.

  Given a tuple or list of tensors `t_list`, and a clipping ratio `clip_norm`,
  this operation returns a list of clipped tensors `list_clipped` and the
  global norm (`global_norm`) of all tensors in `t_list`. Optionally, if you've
  already computed the global norm for `t_list`, you can specify the global
  norm with `use_norm`.

  To perform the clipping, the values `t_list[i]` are set to:

      t_list[i] * clip_norm / max(global_norm, clip_norm)

  where:

      global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))

  If `clip_norm > global_norm` then the entries in `t_list` remain as they are,
  otherwise they're all shrunk by the global ratio.

  If `global_norm == infinity` then the entries in `t_list` are all set to
  `NaN` to signal that an error occurred.

  Any of the entries of `t_list` that are of type `None` are ignored.

  This is the correct way to perform gradient clipping (for example, see
  [Pascanu et al., 2012](http://arxiv.org/abs/1211.5063)
  ([pdf](http://arxiv.org/pdf/1211.5063.pdf))). However, it is slower than
  `clip_by_norm()` because all the parameters must be ready before the clipping
  operation can be performed.

  Args:
    t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None.
    clip_norm: A 0-D (scalar) `Tensor` > 0. The clipping ratio.
    use_norm: A 0-D (scalar) `Tensor` of type `float` (optional). The global
      norm to use. If not provided, `global_norm()` is used to compute the
      norm.
    name: A name for the operation (optional).

  Returns:
    list_clipped: A list of `Tensors` of the same type as `t_list`.
    global_norm: A 0-D (scalar) `Tensor` representing the global norm.

  Raises:
    TypeError: If `t_list` is not a sequence.
  """
  if (not isinstance(t_list, collections_abc.Sequence) or
      isinstance(t_list, six.string_types)):
    raise TypeError("t_list should be a sequence")
  t_list = list(t_list)
  if use_norm is None:
    use_norm = global_norm(t_list, name)

  with ops.name_scope(name, "clip_by_global_norm",
                      t_list + [clip_norm]) as name:
    # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
    scale_for_finite = clip_norm * math_ops.minimum(
        1.0 / use_norm,
        constant_op.constant(1.0, dtype=use_norm.dtype) / clip_norm)
    scale = array_ops.where(
        math_ops.is_finite(use_norm),
        scale_for_finite,
        # Return NaN if use_norm is not finite.
        constant_op.constant(float("nan"), dtype=use_norm.dtype))

    values = [
        ops.convert_to_tensor(
            t.values if isinstance(t, ops.IndexedSlices) else t,
            name="t_%d" % i) if t is not None else t
        for i, t in enumerate(t_list)
    ]

    values_clipped = []
    for i, v in enumerate(values):
      if v is None:
        values_clipped.append(None)
      else:
        with ops.colocate_with(v):
          values_clipped.append(
              array_ops.identity(v * scale, name="%s_%d" % (name, i)))

    list_clipped = [
        ops.IndexedSlices(c_v, t.indices, t.dense_shape)
        if isinstance(t, ops.IndexedSlices) else c_v
        for (c_v, t) in zip(values_clipped, t_list)
    ]

  return list_clipped, use_norm
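# Hedged usage sketch via the public API (tf.clip_by_global_norm exposes the
# behavior documented above); the tensors and clip value are arbitrary.
import tensorflow as tf

grads = [tf.constant([3.0, 4.0]), tf.constant([0.0])]  # global norm = 5.0
clipped, global_norm = tf.clip_by_global_norm(grads, clip_norm=2.5)
# Each gradient is scaled by clip_norm / max(global_norm, clip_norm) = 0.5.
print(global_norm.numpy())  # 5.0
print(clipped[0].numpy())   # [1.5, 2.0]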
def _assign_if_finite(var, value):
  """Assigns a value to a variable if the value is finite."""
  return control_flow_ops.cond(
      math_ops.is_finite(value),
      lambda: _op_in_graph_mode(var.assign(value)),
      control_flow_ops.no_op)
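# Hedged eager-mode sketch of the same guard (illustrative only; the helper
# above builds the graph-mode equivalent with control_flow_ops.cond):
import tensorflow as tf

var = tf.Variable(1.0)
value = tf.constant(float('nan'))
if tf.math.is_finite(value):
  var.assign(value)
print(var.numpy())  # still 1.0: the non-finite value was not assigned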
def matrix_exponential(input, name=None):  # pylint: disable=redefined-builtin
  r"""Computes the matrix exponential of one or more square matrices.

  exp(A) = \sum_{n=0}^\infty A^n/n!

  The exponential is computed using a combination of the scaling and squaring
  method and the Pade approximation. Details can be found in:
  Nicholas J. Higham, "The scaling and squaring method for the matrix
  exponential revisited," SIAM J. Matrix Anal. Applic., 26:1179-1193, 2005.

  The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
  form square matrices. The output is a tensor of the same shape as the input
  containing the exponential for all input submatrices `[..., :, :]`.

  Args:
    input: A `Tensor`. Must be `float16`, `float32`, `float64`, `complex64`, or
      `complex128` with shape `[..., M, M]`.
    name: A name to give this `Op` (optional).

  Returns:
    the matrix exponential of the input.

  Raises:
    ValueError: An unsupported type is provided as input.

  @compatibility(scipy)
  Equivalent to scipy.linalg.expm
  @end_compatibility
  """
  with ops.name_scope(name, 'matrix_exponential', [input]):
    matrix = ops.convert_to_tensor(input, name='input')
    if matrix.shape[-2:] == [0, 0]:
      return matrix
    batch_shape = matrix.shape[:-2]
    if not batch_shape.is_fully_defined():
      batch_shape = array_ops.shape(matrix)[:-2]

    # reshaping the batch makes the where statements work better
    matrix = array_ops.reshape(
        matrix, array_ops.concat(([-1], array_ops.shape(matrix)[-2:]), axis=0))
    l1_norm = math_ops.reduce_max(
        math_ops.reduce_sum(
            math_ops.abs(matrix),
            axis=array_ops.size(array_ops.shape(matrix)) - 2),
        axis=-1)[..., array_ops.newaxis, array_ops.newaxis]
    const = lambda x: constant_op.constant(x, l1_norm.dtype)

    def _nest_where(vals, cases):
      assert len(vals) == len(cases) - 1
      if len(vals) == 1:
        return array_ops.where_v2(
            math_ops.less(l1_norm, const(vals[0])), cases[0], cases[1])
      else:
        return array_ops.where_v2(
            math_ops.less(l1_norm, const(vals[0])), cases[0],
            _nest_where(vals[1:], cases[1:]))

    if matrix.dtype in [dtypes.float16, dtypes.float32, dtypes.complex64]:
      maxnorm = const(3.925724783138660)
      squarings = math_ops.maximum(
          math_ops.floor(
              math_ops.log(l1_norm / maxnorm) / math_ops.log(const(2.0))), 0)
      u3, v3 = _matrix_exp_pade3(matrix)
      u5, v5 = _matrix_exp_pade5(matrix)
      u7, v7 = _matrix_exp_pade7(
          matrix /
          math_ops.cast(math_ops.pow(const(2.0), squarings), matrix.dtype))
      conds = (4.258730016922831e-001, 1.880152677804762e+000)
      u = _nest_where(conds, (u3, u5, u7))
      v = _nest_where(conds, (v3, v5, v7))
    elif matrix.dtype in [dtypes.float64, dtypes.complex128]:
      maxnorm = const(5.371920351148152)
      squarings = math_ops.maximum(
          math_ops.floor(
              math_ops.log(l1_norm / maxnorm) / math_ops.log(const(2.0))), 0)
      u3, v3 = _matrix_exp_pade3(matrix)
      u5, v5 = _matrix_exp_pade5(matrix)
      u7, v7 = _matrix_exp_pade7(matrix)
      u9, v9 = _matrix_exp_pade9(matrix)
      u13, v13 = _matrix_exp_pade13(
          matrix /
          math_ops.cast(math_ops.pow(const(2.0), squarings), matrix.dtype))
      conds = (1.495585217958292e-002, 2.539398330063230e-001,
               9.504178996162932e-001, 2.097847961257068e+000)
      u = _nest_where(conds, (u3, u5, u7, u9, u13))
      v = _nest_where(conds, (v3, v5, v7, v9, v13))
    else:
      raise ValueError('tf.linalg.expm does not support matrices of type %s' %
                       matrix.dtype)

    is_finite = math_ops.is_finite(math_ops.reduce_max(l1_norm))
    nan = constant_op.constant(np.nan, matrix.dtype)
    result = control_flow_ops.cond(
        is_finite, lambda: linalg_ops.matrix_solve(-u + v, u + v),
        lambda: array_ops.fill(array_ops.shape(matrix), nan))
    max_squarings = math_ops.reduce_max(squarings)
    i = const(0.0)

    def c(i, _):
      return control_flow_ops.cond(is_finite,
                                   lambda: math_ops.less(i, max_squarings),
                                   lambda: constant_op.constant(False))

    def b(i, r):
      return i + 1, array_ops.where_v2(
          math_ops.less(i, squarings), math_ops.matmul(r, r), r)

    _, result = control_flow_ops.while_loop(c, b, [i, result])
    if not matrix.shape.is_fully_defined():
      return array_ops.reshape(
          result,
          array_ops.concat((batch_shape, array_ops.shape(result)[-2:]),
                           axis=0))
    return array_ops.reshape(result,
                             batch_shape.concatenate(result.shape[-2:]))
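# Hedged check against SciPy (the docstring above notes scipy.linalg.expm
# compatibility); the matrix values are arbitrary.
import numpy as np
import tensorflow as tf
from scipy.linalg import expm

a = np.array([[0.0, 1.0], [-1.0, 0.0]], dtype=np.float64)
tf_result = tf.linalg.expm(tf.constant(a))
np.testing.assert_allclose(tf_result.numpy(), expm(a), rtol=1e-6)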
def _is_all_finite(grads):
  """Returns a scalar boolean tensor indicating if all gradients are finite."""
  is_finite_per_grad = [
      math_ops.reduce_all(math_ops.is_finite(g)) for g in grads
  ]
  return math_ops.reduce_all(is_finite_per_grad)
def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None):
  """Clips values of multiple tensors by the ratio of the sum of their norms.

  Given a tuple or list of tensors `t_list`, and a clipping ratio `clip_norm`,
  this operation returns a list of clipped tensors `list_clipped` and the
  global norm (`global_norm`) of all tensors in `t_list`. Optionally, if you've
  already computed the global norm for `t_list`, you can specify the global
  norm with `use_norm`.

  To perform the clipping, the values `t_list[i]` are set to:

      t_list[i] * clip_norm / max(global_norm, clip_norm)

  where:

      global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))

  If `clip_norm > global_norm` then the entries in `t_list` remain as they are,
  otherwise they're all shrunk by the global ratio.

  If `global_norm == infinity` then the entries in `t_list` are all set to
  `NaN` to signal that an error occurred.

  Any of the entries of `t_list` that are of type `None` are ignored.

  This is the correct way to perform gradient clipping (for example, see
  [Pascanu et al., 2012](http://arxiv.org/abs/1211.5063)
  ([pdf](http://arxiv.org/pdf/1211.5063.pdf))). However, it is slower than
  `clip_by_norm()` because all the parameters must be ready before the clipping
  operation can be performed.

  Args:
    t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None.
    clip_norm: A 0-D (scalar) `Tensor` > 0. The clipping ratio.
    use_norm: A 0-D (scalar) `Tensor` of type `float` (optional). The global
      norm to use. If not provided, `global_norm()` is used to compute the
      norm.
    name: A name for the operation (optional).

  Returns:
    list_clipped: A list of `Tensors` of the same type as `t_list`.
    global_norm: A 0-D (scalar) `Tensor` representing the global norm.

  Raises:
    TypeError: If `t_list` is not a sequence.
  """
  if (not isinstance(t_list, collections.Sequence) or
      isinstance(t_list, six.string_types)):
    raise TypeError("t_list should be a sequence")
  t_list = list(t_list)
  if use_norm is None:
    use_norm = global_norm(t_list, name)

  with ops.name_scope(name, "clip_by_global_norm",
                      t_list + [clip_norm]) as name:
    # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
    scale_for_finite = clip_norm * math_ops.minimum(
        1.0 / use_norm,
        constant_op.constant(1.0, dtype=use_norm.dtype) / clip_norm)
    scale = array_ops.where(
        math_ops.is_finite(use_norm),
        scale_for_finite,
        # Return NaN if use_norm is not finite.
        constant_op.constant(float("nan"), dtype=use_norm.dtype))

    values = [
        ops.convert_to_tensor(
            t.values if isinstance(t, ops.IndexedSlices) else t,
            name="t_%d" % i) if t is not None else t
        for i, t in enumerate(t_list)
    ]

    values_clipped = []
    for i, v in enumerate(values):
      if v is None:
        values_clipped.append(None)
      else:
        with ops.colocate_with(v):
          values_clipped.append(
              array_ops.identity(v * scale, name="%s_%d" % (name, i)))

    list_clipped = [
        ops.IndexedSlices(c_v, t.indices, t.dense_shape)
        if isinstance(t, ops.IndexedSlices) else c_v
        for (c_v, t) in zip(values_clipped, t_list)
    ]

  return list_clipped, use_norm