def testVerifyTensorAllFiniteSucceeds(self):
  x_shape = [5, 4]
  x = np.random.random_sample(x_shape).astype(np.float32)
  with test_util.use_gpu():
    t = constant_op.constant(x, shape=x_shape, dtype=dtypes.float32)
    t_verified = numerics.verify_tensor_all_finite(t,
                                                   "Input is not a number.")
    self.assertAllClose(x, self.evaluate(t_verified))
def testVerifyTensorAllFiniteFails(self):
  x_shape = [5, 4]
  x = np.random.random_sample(x_shape).astype(np.float32)
  my_msg = "Input is not a number."

  # Test NaN.
  x[0] = np.nan
  with test_util.use_gpu():
    with self.assertRaisesOpError(my_msg):
      t = constant_op.constant(x, shape=x_shape, dtype=dtypes.float32)
      t_verified = numerics.verify_tensor_all_finite(t, my_msg)
      self.evaluate(t_verified)

  # Test Inf.
  x[0] = np.inf
  with test_util.use_gpu():
    with self.assertRaisesOpError(my_msg):
      t = constant_op.constant(x, shape=x_shape, dtype=dtypes.float32)
      t_verified = numerics.verify_tensor_all_finite(t, my_msg)
      self.evaluate(t_verified)
def verify_tensor_all_finite(labeled_tensor, message, name=None):
  """Asserts a tensor doesn't contain NaNs or Infs.

  See tf.verify_tensor_all_finite.

  Args:
    labeled_tensor: The input tensor.
    message: Message to log on failure.
    name: Optional op name.

  Returns:
    The input tensor.
  """
  with ops.name_scope(name, 'lt_verify_tensor_all_finite',
                      [labeled_tensor]) as scope:
    labeled_tensor = core.convert_to_labeled_tensor(labeled_tensor)
    op = numerics.verify_tensor_all_finite(
        labeled_tensor.tensor, msg=message, name=scope)
    return core.LabeledTensor(op, labeled_tensor.axes)
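# A minimal usage sketch for the wrapper above (not part of the original
# file). It assumes this module's `core.LabeledTensor` accepts plain axis-name
# strings; the axis names and message are made up.
raw = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
labeled = core.LabeledTensor(raw, ['row', 'column'])
# `checked` carries the same values and axes; evaluating it raises an
# InvalidArgumentError at run time if any entry is NaN or Inf.
checked = verify_tensor_all_finite(labeled, 'found NaN or Inf in input')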
def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None):
  """Clips values of multiple tensors by the ratio of the sum of their norms.

  Given a tuple or list of tensors `t_list`, and a clipping ratio `clip_norm`,
  this operation returns a list of clipped tensors `list_clipped`
  and the global norm (`global_norm`) of all tensors in `t_list`. Optionally,
  if you've already computed the global norm for `t_list`, you can specify
  the global norm with `use_norm`.

  To perform the clipping, the values `t_list[i]` are set to:

      t_list[i] * clip_norm / max(global_norm, clip_norm)

  where:

      global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))

  If `clip_norm > global_norm` then the entries in `t_list` remain as they are,
  otherwise they're all shrunk by the global ratio.

  Any of the entries of `t_list` that are of type `None` are ignored.

  This is the correct way to perform gradient clipping (for example, see
  [Pascanu et al., 2012](http://arxiv.org/abs/1211.5063)
  ([pdf](http://arxiv.org/pdf/1211.5063.pdf))).

  However, it is slower than `clip_by_norm()` because all the parameters must
  be ready before the clipping operation can be performed.

  Args:
    t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None.
    clip_norm: A 0-D (scalar) `Tensor` > 0. The clipping ratio.
    use_norm: A 0-D (scalar) `Tensor` of type `float` (optional). The global
      norm to use. If not provided, `global_norm()` is used to compute the
      norm.
    name: A name for the operation (optional).

  Returns:
    list_clipped: A list of `Tensors` of the same type as `t_list`.
    global_norm: A 0-D (scalar) `Tensor` representing the global norm.

  Raises:
    TypeError: If `t_list` is not a sequence.
    InvalidArgumentError: If global norm is not finite.
  """
  if (not isinstance(t_list, collections.Sequence)
      or isinstance(t_list, six.string_types)):
    raise TypeError("t_list should be a sequence")
  t_list = list(t_list)
  if use_norm is None:
    use_norm = global_norm(t_list, name)
  use_norm = numerics.verify_tensor_all_finite(use_norm,
                                               "Found Inf or NaN global norm.")

  with ops.name_scope(name, "clip_by_global_norm",
                      t_list + [clip_norm]) as name:
    # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
    scale = clip_norm * math_ops.minimum(
        1.0 / use_norm,
        constant_op.constant(1.0, dtype=use_norm.dtype) / clip_norm)

    values = [
        ops.convert_to_tensor(
            t.values if isinstance(t, ops.IndexedSlices) else t,
            name="t_%d" % i)
        if t is not None else t
        for i, t in enumerate(t_list)]

    values_clipped = []
    for i, v in enumerate(values):
      if v is None:
        values_clipped.append(None)
      else:
        with ops.colocate_with(v):
          values_clipped.append(
              array_ops.identity(v * scale, name="%s_%d" % (name, i)))

    list_clipped = [
        ops.IndexedSlices(c_v, t.indices, t.dense_shape)
        if isinstance(t, ops.IndexedSlices)
        else c_v
        for (c_v, t) in zip(values_clipped, t_list)]

  return list_clipped, use_norm
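# A small NumPy sketch of the clipping formula in the docstring above
# (illustration only; the gradient values and clip_norm are made up). It
# mirrors what clip_by_global_norm computes for dense tensors: each entry is
# scaled by clip_norm / max(global_norm, clip_norm), so the clipped list has
# global norm at most clip_norm.
import numpy as np

grads = [np.array([3.0, 4.0]), np.array([12.0])]  # hypothetical gradients
clip_norm = 5.0
# global_norm = sqrt(3**2 + 4**2 + 12**2) = 13.0
global_norm_value = np.sqrt(sum(np.sum(g ** 2) for g in grads))
clipped = [g * clip_norm / max(global_norm_value, clip_norm) for g in grads]
assert np.isclose(np.sqrt(sum(np.sum(g ** 2) for g in clipped)), clip_norm)
# In TensorFlow this is typically applied to gradients, e.g.
#   grads, _ = tf.clip_by_global_norm(tf.gradients(loss, var_list), clip_norm)
# before passing `grads` to an optimizer's apply_gradients.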
def do_filter(self, estimated_state, estimated_state_covariance,
              predicted_observation, predicted_observation_covariance,
              observation, observation_model, observation_noise):
  """Convenience function for scoring predictions.

  Scores a prediction against an observation, and computes the updated
  posterior over states.

  Shapes given below for arguments are for single-model Kalman filtering
  (e.g. KalmanFilter). For ensembles, prior_state and prior_state_var are
  same-length tuples of values corresponding to each model.

  Args:
    estimated_state: A prior mean over states [batch size x state dimension]
    estimated_state_covariance: Covariance of state prior [batch size x D x
        D], with D depending on the Kalman filter implementation (typically
        the state dimension).
    predicted_observation: A prediction for the observed value, such as that
        returned by observed_from_state. A [batch size x num features] Tensor.
    predicted_observation_covariance: A covariance matrix corresponding to
        `predicted_observation`, a [batch size x num features x num features]
        Tensor.
    observation: The observed value corresponding to the predictions
        given [batch size x observation dimension]
    observation_model: The [batch size x observation dimension x model state
        dimension] Tensor indicating how a particular state is mapped to
        (pre-noise) observations for each part of the batch.
    observation_noise: A [batch size x observation dimension x observation
        dimension] Tensor or [observation dimension x observation dimension]
        Tensor with covariance matrices to use for each part of the batch (a
        two-dimensional input will be broadcast).

  Returns:
    posterior_state, posterior_state_var: Posterior mean and covariance,
        updated versions of prior_state and prior_state_var.
    log_prediction_prob: Log probability of the observations under the
        priors, suitable for optimization (should be maximized).
  """
  symmetrized_observation_covariance = 0.5 * (
      predicted_observation_covariance +
      array_ops.matrix_transpose(predicted_observation_covariance))
  instability_message = (
      "This may occur due to numerically unstable filtering when there is "
      "a large difference in posterior variances, or when inferences are "
      "near-deterministic. Consider tuning the "
      "'filtering_maximum_posterior_variance_ratio' or "
      "'filtering_minimum_posterior_variance' parameters in your "
      "StateSpaceModelConfiguration, or tuning the transition matrix.")
  symmetrized_observation_covariance = numerics.verify_tensor_all_finite(
      symmetrized_observation_covariance,
      "Predicted observation covariance was not finite. {}".format(
          instability_message))
  diag = array_ops.matrix_diag_part(symmetrized_observation_covariance)
  min_diag = math_ops.reduce_min(diag)
  non_negative_assert = control_flow_ops.Assert(
      min_diag >= 0.,
      [("The predicted observation covariance "
        "has a negative diagonal entry. {}").format(instability_message),
       min_diag])
  with ops.control_dependencies([non_negative_assert]):
    observation_covariance_cholesky = linalg_ops.cholesky(
        symmetrized_observation_covariance)
  log_prediction_prob = distributions.MultivariateNormalTriL(
      predicted_observation,
      observation_covariance_cholesky).log_prob(observation)
  (posterior_state, posterior_state_var) = self.posterior_from_prior_state(
      prior_state=estimated_state,
      prior_state_var=estimated_state_covariance,
      observation=observation,
      observation_model=observation_model,
      predicted_observations=(predicted_observation,
                              predicted_observation_covariance),
      observation_noise=observation_noise)
  return (posterior_state, posterior_state_var, log_prediction_prob)
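# A rough NumPy/SciPy sketch of the scoring step above for a single batch
# element (illustration only; the values are made up and the posterior update
# is omitted since it lives in posterior_from_prior_state).
import numpy as np
from scipy import stats

predicted_observation = np.array([0.5, -1.0])
predicted_observation_covariance = np.array([[1.0, 0.2],
                                             [0.2, 0.5]])
observation = np.array([0.3, -0.8])

# Symmetrize to guard against small asymmetries left over from earlier matmuls.
sym_cov = 0.5 * (predicted_observation_covariance +
                 predicted_observation_covariance.T)
assert np.all(np.isfinite(sym_cov))    # analogue of verify_tensor_all_finite
assert np.all(np.diag(sym_cov) >= 0.)  # analogue of the Assert on the diagonal
# MultivariateNormalTriL(loc, cholesky(cov)).log_prob(obs) corresponds to the
# log density of a multivariate normal with mean `loc` and covariance `cov`:
log_prediction_prob = stats.multivariate_normal(
    mean=predicted_observation, cov=sym_cov).logpdf(observation)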