def testTriL(self):
  with self.cached_session():
    shift = np.array([-1, 0, 1], dtype=np.float32)
    tril = np.array([[[3, 0, 0],
                      [2, -1, 0],
                      [3, 2, 1]],
                     [[2, 0, 0],
                      [3, -2, 0],
                      [4, 3, 2]]],
                    dtype=np.float32)
    scale = linalg.LinearOperatorLowerTriangular(tril, is_non_singular=True)
    affine = AffineLinearOperator(shift=shift, scale=scale, validate_args=True)
    x = np.array([[[1, 0, -1],
                   [2, 3, 4]],
                  [[4, 1, -7],
                   [6, 9, 8]]],
                 dtype=np.float32)
    # If we made the bijector do x*A+b then this would be simplified to:
    # y = np.matmul(x, tril) + shift.
    y = np.squeeze(np.matmul(tril, np.expand_dims(x, -1)), -1) + shift
    ildj = -np.sum(
        np.log(np.abs(np.diagonal(tril, axis1=-2, axis2=-1))))
    self.assertEqual(affine.name, "affine_linear_operator")
    self.assertAllClose(y, affine.forward(x).eval())
    self.assertAllClose(x, affine.inverse(y).eval())
    self.assertAllClose(
        ildj,
        affine.inverse_log_det_jacobian(y, event_ndims=2).eval())
    self.assertAllClose(
        -affine.inverse_log_det_jacobian(y, event_ndims=2).eval(),
        affine.forward_log_det_jacobian(x, event_ndims=2).eval())
def testTriL(self):
  with self.test_session():
    shift = np.array([-1, 0, 1], dtype=np.float32)
    tril = np.array([[[1, 0, 0],
                      [2, -1, 0],
                      [3, 2, 1]],
                     [[2, 0, 0],
                      [3, -2, 0],
                      [4, 3, 2]]],
                    dtype=np.float32)
    scale = linalg.LinearOperatorLowerTriangular(tril, is_non_singular=True)
    affine = AffineLinearOperator(
        shift=shift, scale=scale, validate_args=True)
    x = np.array([[[1, 0, -1],
                   [2, 3, 4]],
                  [[4, 1, -7],
                   [6, 9, 8]]],
                 dtype=np.float32)
    # If we made the bijector do x*A+b then this would be simplified to:
    # y = np.matmul(x, tril) + shift.
    y = np.squeeze(np.matmul(tril, np.expand_dims(x, -1)), -1) + shift
    ildj = -np.sum(np.log(np.abs(np.diagonal(
        tril, axis1=-2, axis2=-1))), axis=-1)
    self.assertEqual(affine.name, "affine_linear_operator")
    self.assertAllClose(y, affine.forward(x).eval())
    self.assertAllClose(x, affine.inverse(y).eval())
    self.assertAllClose(ildj, affine.inverse_log_det_jacobian(y).eval())
    self.assertAllClose(-affine.inverse_log_det_jacobian(y).eval(),
                        affine.forward_log_det_jacobian(x).eval())
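# A minimal NumPy-only sketch (not part of the test class) of the math the two
# testTriL variants above assert: for y = L @ x + shift with L lower
# triangular, the inverse log-det-Jacobian is -sum(log|diag(L)|), since the
# Jacobian determinant of the forward map is prod(diag(L)). The specific
# values below are illustrative.
import numpy as np

tril = np.array([[3., 0., 0.],
                 [2., -1., 0.],
                 [3., 2., 1.]], dtype=np.float32)
shift = np.array([-1., 0., 1.], dtype=np.float32)
x = np.array([1., 0., -1.], dtype=np.float32)

y = tril @ x + shift                           # forward map
x_back = np.linalg.solve(tril, y - shift)      # inverse map
ildj = -np.sum(np.log(np.abs(np.diag(tril))))  # inverse log-det-Jacobian

assert np.allclose(x, x_back)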
def testIdentity(self):
  with self.test_session():
    affine = AffineLinearOperator(validate_args=True)
    x = np.array([[1, 0, -1], [2, 3, 4]], dtype=np.float32)
    y = x
    ildj = 0.
    self.assertEqual(affine.name, "affine_linear_operator")
    self.assertAllClose(y, affine.forward(x).eval())
    self.assertAllClose(x, affine.inverse(y).eval())
    self.assertAllClose(ildj, affine.inverse_log_det_jacobian(y).eval())
    self.assertAllClose(-affine.inverse_log_det_jacobian(y).eval(),
                        affine.forward_log_det_jacobian(x).eval())
def testIdentity(self):
  with self.test_session():
    affine = AffineLinearOperator(
        validate_args=True)
    x = np.array([[1, 0, -1], [2, 3, 4]], dtype=np.float32)
    y = x
    ildj = 0.
    self.assertEqual(affine.name, "affine_linear_operator")
    self.assertAllClose(y, affine.forward(x).eval())
    self.assertAllClose(x, affine.inverse(y).eval())
    self.assertAllClose(ildj, affine.inverse_log_det_jacobian(y).eval())
    self.assertAllClose(-affine.inverse_log_det_jacobian(y).eval(),
                        affine.forward_log_det_jacobian(x).eval())
def testDiag(self):
  with self.test_session():
    shift = np.array([-1, 0, 1], dtype=np.float32)
    diag = np.array([[1, 2, 3], [2, 5, 6]], dtype=np.float32)
    scale = linalg.LinearOperatorDiag(diag, is_non_singular=True)
    affine = AffineLinearOperator(shift=shift, scale=scale, validate_args=True)
    x = np.array([[1, 0, -1], [2, 3, 4]], dtype=np.float32)
    y = diag * x + shift
    ildj = -np.sum(np.log(np.abs(diag)), axis=-1)
    self.assertEqual(affine.name, "affine_linear_operator")
    self.assertAllClose(y, affine.forward(x).eval())
    self.assertAllClose(x, affine.inverse(y).eval())
    self.assertAllClose(ildj, affine.inverse_log_det_jacobian(y).eval())
    self.assertAllClose(-affine.inverse_log_det_jacobian(y).eval(),
                        affine.forward_log_det_jacobian(x).eval())
def testDiag(self):
  with self.test_session():
    shift = np.array([-1, 0, 1], dtype=np.float32)
    diag = np.array([[1, 2, 3], [2, 5, 6]], dtype=np.float32)
    scale = linalg.LinearOperatorDiag(diag, is_non_singular=True)
    affine = AffineLinearOperator(
        shift=shift, scale=scale, validate_args=True)
    x = np.array([[1, 0, -1], [2, 3, 4]], dtype=np.float32)
    y = diag * x + shift
    ildj = -np.sum(np.log(np.abs(diag)), axis=-1)
    self.assertEqual(affine.name, "affine_linear_operator")
    self.assertAllClose(y, affine.forward(x).eval())
    self.assertAllClose(x, affine.inverse(y).eval())
    self.assertAllClose(ildj, affine.inverse_log_det_jacobian(y).eval())
    self.assertAllClose(-affine.inverse_log_det_jacobian(y).eval(),
                        affine.forward_log_det_jacobian(x).eval())
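# A similar NumPy-only sketch for the diagonal case checked by testDiag above:
# y = diag * x + shift acts elementwise, so the per-example inverse
# log-det-Jacobian is -sum(log|diag|) along the last axis, matching the
# `axis=-1` reduction in the test. Values are illustrative.
import numpy as np

shift = np.array([-1., 0., 1.], dtype=np.float32)
diag = np.array([[1., 2., 3.],
                 [2., 5., 6.]], dtype=np.float32)   # batch of two diagonals
x = np.array([[1., 0., -1.],
              [2., 3., 4.]], dtype=np.float32)

y = diag * x + shift                                # forward, shift broadcasts
x_back = (y - shift) / diag                         # inverse
ildj = -np.sum(np.log(np.abs(diag)), axis=-1)       # shape [2], one per example

assert np.allclose(x, x_back)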
def __init__(self,
             mix_loc,
             temperature,
             distribution,
             loc=None,
             scale=None,
             quadrature_size=8,
             quadrature_fn=quadrature_scheme_softmaxnormal_quantiles,
             validate_args=False,
             allow_nan_stats=True,
             name="VectorDiffeomixture"):
  """Constructs the VectorDiffeomixture on `R^d`.

  The vector diffeomixture (VDM) approximates the compound distribution

  ```none
  p(x) = int p(x | z) p(z) dz,
  where z is in the K-simplex, and
  p(x | z) := p(x | loc=sum_k z[k] loc[k], scale=sum_k z[k] scale[k])
  ```

  Args:
    mix_loc: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`. In terms
      of samples, larger `mix_loc[..., k]` ==> `Z` is more likely to put more
      weight on its `kth` component.
    temperature: `float`-like `Tensor`. Broadcastable with `mix_loc`. In terms
      of samples, smaller `temperature` means one component is more likely to
      dominate. I.e., smaller `temperature` makes the VDM look more like a
      standard mixture of `K` components.
    distribution: `tf.Distribution`-like instance. Distribution from which `d`
      iid samples are used as input to the selected affine transformation.
      Must be a scalar-batch, scalar-event distribution. Typically
      `distribution.reparameterization_type = FULLY_REPARAMETERIZED` or it is
      a function of non-trainable parameters. WARNING: If you backprop through
      a VectorDiffeomixture sample and the `distribution` is not
      `FULLY_REPARAMETERIZED` yet is a function of trainable variables, then
      the gradient will be incorrect!
    loc: Length-`K` list of `float`-type `Tensor`s. The `k`-th element
      represents the `shift` used for the `k`-th affine transformation. If the
      `k`-th item is `None`, `loc` is implicitly `0`. When specified, must
      have shape `[B1, ..., Bb, d]` where `b >= 0` and `d` is the event size.
    scale: Length-`K` list of `LinearOperator`s. Each should be
      positive-definite and operate on a `d`-dimensional vector space. The
      `k`-th element represents the `scale` used for the `k`-th affine
      transformation. `LinearOperator`s must have shape `[B1, ..., Bb, d, d]`,
      `b >= 0`, i.e., characterizes `b`-batches of `d x d` matrices.
    quadrature_size: Python `int` scalar representing number of quadrature
      points. Larger `quadrature_size` means `q_N(x)` better approximates
      `p(x)`.
    quadrature_fn: Python callable taking `normal_loc`, `normal_scale`,
      `quadrature_size`, `validate_args` and returning `tuple(grid, probs)`
      representing the SoftmaxNormal grid and corresponding normalized weight.
      Default value: `quadrature_scheme_softmaxnormal_quantiles`.
    validate_args: Python `bool`, default `False`. When `True` distribution
      parameters are checked for validity despite possibly degrading runtime
      performance. When `False` invalid inputs may silently render incorrect
      outputs.
    allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
      (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
      result is undefined. When `False`, an exception is raised if one or
      more of the statistic's batch members are undefined.
    name: Python `str` name prefixed to Ops created by this class.

  Raises:
    ValueError: if `not scale or len(scale) < 2`.
    ValueError: if `len(loc) != len(scale)`
    ValueError: if `quadrature_grid_and_probs is not None` and
      `len(quadrature_grid_and_probs[0]) != len(quadrature_grid_and_probs[1])`
    ValueError: if `validate_args` and any not scale.is_positive_definite.
    TypeError: if any scale.dtype != scale[0].dtype.
    TypeError: if any loc.dtype != scale[0].dtype.
    NotImplementedError: if `len(scale) != 2`.
    ValueError: if `not distribution.is_scalar_batch`.
    ValueError: if `not distribution.is_scalar_event`.
  """
  parameters = dict(locals())
  with ops.name_scope(name, values=[mix_loc, temperature]) as name:
    if not scale or len(scale) < 2:
      raise ValueError(
          "Must specify list (or list-like object) of scale "
          "LinearOperators, one for each component with "
          "num_component >= 2.")

    if loc is None:
      loc = [None] * len(scale)

    if len(loc) != len(scale):
      raise ValueError("loc/scale must be same-length lists "
                       "(or same-length list-like objects).")

    dtype = scale[0].dtype.base_dtype

    loc = [
        ops.convert_to_tensor(
            loc_, dtype=dtype, name="loc{}".format(k))
        if loc_ is not None else None for k, loc_ in enumerate(loc)
    ]

    for k, scale_ in enumerate(scale):
      if validate_args and not scale_.is_positive_definite:
        raise ValueError(
            "scale[{}].is_positive_definite = {} != True".format(
                k, scale_.is_positive_definite))
      if scale_.dtype.base_dtype != dtype:
        raise TypeError(
            "dtype mismatch; scale[{}].base_dtype=\"{}\" != \"{}\""
            .format(k, scale_.dtype.base_dtype.name, dtype.name))

    self._endpoint_affine = [
        AffineLinearOperator(shift=loc_,
                             scale=scale_,
                             validate_args=validate_args,
                             name="endpoint_affine_{}".format(k))
        for k, (loc_, scale_) in enumerate(zip(loc, scale))
    ]

    # TODO(jvdillon): Remove once we support k-mixtures.
    # We make this assertion here because otherwise `grid` would need to be a
    # vector not a scalar.
    if len(scale) != 2:
      raise NotImplementedError("Currently only bimixtures are supported; "
                                "len(scale)={} is not 2.".format(len(scale)))

    mix_loc = ops.convert_to_tensor(mix_loc, dtype=dtype, name="mix_loc")
    temperature = ops.convert_to_tensor(
        temperature, dtype=dtype, name="temperature")
    self._grid, probs = tuple(quadrature_fn(
        mix_loc / temperature, 1. / temperature, quadrature_size,
        validate_args))

    # Note: by creating the logits as `log(prob)` we ensure that
    # `self.mixture_distribution.logits` is equivalent to
    # `math_ops.log(self.mixture_distribution.probs)`.
    self._mixture_distribution = categorical_lib.Categorical(
        logits=math_ops.log(probs),
        validate_args=validate_args,
        allow_nan_stats=allow_nan_stats)

    asserts = distribution_util.maybe_check_scalar_distribution(
        distribution, dtype, validate_args)
    if asserts:
      self._grid = control_flow_ops.with_dependencies(asserts, self._grid)
    self._distribution = distribution

    self._interpolated_affine = [
        AffineLinearOperator(shift=loc_,
                             scale=scale_,
                             validate_args=validate_args,
                             name="interpolated_affine_{}".format(k))
        for k, (loc_, scale_) in enumerate(zip(
            interpolate_loc(self._grid, loc),
            interpolate_scale(self._grid, scale)))
    ]

    [
        self._batch_shape_,
        self._batch_shape_tensor_,
        self._event_shape_,
        self._event_shape_tensor_,
    ] = determine_batch_event_shapes(self._grid, self._endpoint_affine)

    super(VectorDiffeomixture, self).__init__(
        dtype=dtype,
        # We hard-code `FULLY_REPARAMETERIZED` because when
        # `validate_args=True` we verify that indeed
        # `distribution.reparameterization_type == FULLY_REPARAMETERIZED`. A
        # distribution which is a function of only non-trainable parameters
        # also implies we can use `FULLY_REPARAMETERIZED`. However, we cannot
        # easily test for that possibility thus we use `validate_args=False`
        # as a "back-door" to allow users a way to use non
        # `FULLY_REPARAMETERIZED` distribution. In such cases IT IS THE USERS
        # RESPONSIBILITY to verify that the base distribution is a function of
        # non-trainable parameters.
        reparameterization_type=distribution_lib.FULLY_REPARAMETERIZED,
        validate_args=validate_args,
        allow_nan_stats=allow_nan_stats,
        parameters=parameters,
        graph_parents=(
            distribution._graph_parents  # pylint: disable=protected-access
            + [loc_ for loc_ in loc if loc_ is not None]
            + [p for scale_ in scale for p in scale_.graph_parents]),
        name=name)
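# Hypothetical usage sketch for the constructor above (a bimixture on R^3).
# The module aliases (`tf.contrib.distributions`, `tf.linalg`) and the exact
# import location of VectorDiffeomixture are assumptions, not taken from this
# file; the argument shapes follow the docstring (K=2 components, so
# `mix_loc` has a trailing dimension of K-1=1).
import numpy as np
import tensorflow as tf

ds = tf.contrib.distributions   # assumed location of VectorDiffeomixture
la = tf.linalg

dims = 3
vdm = ds.VectorDiffeomixture(
    mix_loc=[[0.]],                   # shape [..., K-1]; 0 => roughly equal weight
    temperature=[1.],                 # smaller => one component dominates
    distribution=ds.Normal(loc=0., scale=1.),   # scalar-batch, scalar-event
    loc=[
        None,                         # component 0: shift implicitly zero
        np.float32([2.] * dims),      # component 1: shift of +2 per dimension
    ],
    scale=[
        la.LinearOperatorScaledIdentity(
            num_rows=dims, multiplier=np.float32(1.1),
            is_positive_definite=True),
        la.LinearOperatorDiag(
            np.float32([2., 2.5, 3.]), is_positive_definite=True),
    ],
    validate_args=True)
samples = vdm.sample(5)               # shape [5, dims]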
def __init__(self,
             mix_loc,
             mix_scale,
             distribution,
             loc=None,
             scale=None,
             quadrature_grid_and_probs=None,
             validate_args=False,
             allow_nan_stats=True,
             name="VectorDiffeomixture"):
  """Constructs the VectorDiffeomixture on `R**k`.

  Args:
    mix_loc: `float`-like `Tensor`. Represents the `location` parameter of
      the SoftmaxNormal used for selecting one of the `K` affine
      transformations.
    mix_scale: `float`-like `Tensor`. Represents the `scale` parameter of the
      SoftmaxNormal used for selecting one of the `K` affine transformations.
    distribution: `tf.Distribution`-like instance. Distribution from which `d`
      iid samples are used as input to the selected affine transformation.
      Must be a scalar-batch, scalar-event distribution. Typically
      `distribution.reparameterization_type = FULLY_REPARAMETERIZED` or it is
      a function of non-trainable parameters. WARNING: If you backprop through
      a VectorDiffeomixture sample and the `distribution` is not
      `FULLY_REPARAMETERIZED` yet is a function of trainable variables, then
      the gradient will be incorrect!
    loc: Length-`K` list of `float`-type `Tensor`s. The `k`-th element
      represents the `shift` used for the `k`-th affine transformation. If the
      `k`-th item is `None`, `loc` is implicitly `0`. When specified, must
      have shape `[B1, ..., Bb, d]` where `b >= 0` and `d` is the event size.
    scale: Length-`K` list of `LinearOperator`s. Each should be
      positive-definite and operate on a `d`-dimensional vector space. The
      `k`-th element represents the `scale` used for the `k`-th affine
      transformation. `LinearOperator`s must have shape `[B1, ..., Bb, d, d]`,
      `b >= 0`, i.e., characterizes `b`-batches of `d x d` matrices.
    quadrature_grid_and_probs: Python pair of `list`-like objects representing
      the sample points and the corresponding (possibly normalized) weight.
      When `None`, defaults to: `np.polynomial.hermite.hermgauss(deg=8)`.
    validate_args: Python `bool`, default `False`. When `True` distribution
      parameters are checked for validity despite possibly degrading runtime
      performance. When `False` invalid inputs may silently render incorrect
      outputs.
    allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
      (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
      result is undefined. When `False`, an exception is raised if one or
      more of the statistic's batch members are undefined.
    name: Python `str` name prefixed to Ops created by this class.

  Raises:
    ValueError: if `not scale or len(scale) < 2`.
    ValueError: if `len(loc) != len(scale)`
    ValueError: if `quadrature_grid_and_probs is not None` and
      `len(quadrature_grid_and_probs[0]) != len(quadrature_grid_and_probs[1])`
    ValueError: if `validate_args` and any not scale.is_positive_definite.
    TypeError: if any scale.dtype != scale[0].dtype.
    TypeError: if any loc.dtype != scale[0].dtype.
    NotImplementedError: if `len(scale) != 2`.
    ValueError: if `not distribution.is_scalar_batch`.
    ValueError: if `not distribution.is_scalar_event`.
""" parameters = locals() with ops.name_scope(name, values=[mix_loc, mix_scale]): if not scale or len(scale) < 2: raise ValueError( "Must specify list (or list-like object) of scale " "LinearOperators, one for each component with " "num_component >= 2.") if loc is None: loc = [None] * len(scale) if len(loc) != len(scale): raise ValueError("loc/scale must be same-length lists " "(or same-length list-like objects).") dtype = scale[0].dtype.base_dtype loc = [ ops.convert_to_tensor( loc_, dtype=dtype, name="loc{}".format(k)) if loc_ is not None else None for k, loc_ in enumerate(loc) ] for k, scale_ in enumerate(scale): if validate_args and not scale_.is_positive_definite: raise ValueError( "scale[{}].is_positive_definite = {} != True".format( k, scale_.is_positive_definite)) if scale_.dtype.base_dtype != dtype: raise TypeError( "dtype mismatch; scale[{}].base_dtype=\"{}\" != \"{}\"" .format(k, scale_.dtype.base_dtype.name, dtype.name)) self._endpoint_affine = [ AffineLinearOperator(shift=loc_, scale=scale_, event_ndims=1, validate_args=validate_args, name="endpoint_affine_{}".format(k)) for k, (loc_, scale_) in enumerate(zip(loc, scale)) ] # TODO(jvdillon): Remove once we support k-mixtures. # We make this assertion here because otherwise `grid` would need to be a # vector not a scalar. if len(scale) != 2: raise NotImplementedError( "Currently only bimixtures are supported; " "len(scale)={} is not 2.".format(len(scale))) if quadrature_grid_and_probs is None: grid, probs = np.polynomial.hermite.hermgauss(deg=8) else: grid, probs = tuple(quadrature_grid_and_probs) if len(grid) != len(probs): raise ValueError( "`quadrature_grid_and_probs` must be a `tuple` of " "same-length list-like objects") grid = grid.astype(dtype.as_numpy_dtype) probs = probs.astype(dtype.as_numpy_dtype) probs /= np.linalg.norm(probs, ord=1) self._quadrature_grid = grid self._quadrature_probs = probs # Note: by creating the logits as `log(prob)` we ensure that # `self.mixture_distribution.logits` is equivalent to # `math_ops.log(self.mixture_distribution.probs)`. self._mixture_distribution = categorical_lib.Categorical( logits=np.log(probs), validate_args=validate_args, allow_nan_stats=allow_nan_stats) mix_loc = maybe_check_mix_param(mix_loc, "mix_loc", dtype, validate_args) mix_scale = maybe_check_mix_param(mix_scale, "mix_scale", dtype, validate_args) asserts = distribution_util.maybe_check_scalar_distribution( distribution, dtype, validate_args) if asserts: mix_loc = control_flow_ops.with_dependencies(asserts, mix_loc) self._distribution = distribution # shape: [B, deg] self._interpolate_weight = math_ops.sigmoid(mix_loc + np.sqrt(2.) * mix_scale * grid) self._interpolated_affine = [ AffineLinearOperator(shift=loc_, scale=scale_, event_ndims=1, validate_args=validate_args, name="interpolated_affine_{}".format(k)) for k, (loc_, scale_) in enumerate( zip( interpolate_loc(len(self._quadrature_grid), self._interpolate_weight, loc), interpolate_scale(len(self._quadrature_grid), self._interpolate_weight, scale))) ] self._batch_shape_, self._event_shape_ = determine_batch_event_shapes( mix_loc, mix_scale, self._endpoint_affine) super(VectorDiffeomixture, self).__init__( dtype=dtype, # We hard-code `FULLY_REPARAMETERIZED` because when # `validate_args=True` we verify that indeed # `distribution.reparameterization_type == FULLY_REPARAMETERIZED`. A # distribution which is a function of only non-trainable parameters # also implies we can use `FULLY_REPARAMETERIZED`. 
However, we cannot # easily test for that possibility thus we use `validate_args=False` # as a "back-door" to allow users a way to use non # `FULLY_REPARAMETERIZED` distribution. In such cases IT IS THE USERS # RESPONSIBILITY to verify that the base distribution is a function of # non-trainable parameters. reparameterization_type=distribution_lib.FULLY_REPARAMETERIZED, validate_args=validate_args, allow_nan_stats=allow_nan_stats, parameters=parameters, graph_parents=( [mix_loc, mix_scale] + distribution._graph_parents # pylint: disable=protected-access + [loc_ for loc_ in loc if loc_ is not None] + [p for scale_ in scale for p in scale_.graph_parents]), name=name)
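# Standalone NumPy sketch of the quadrature step this older constructor
# performs when `quadrature_grid_and_probs` is None: Gauss-Hermite
# abscissae/weights, weights normalized to sum to one, then mapped to
# per-grid-point interpolation weights via
# sigmoid(mix_loc + sqrt(2) * mix_scale * grid). The scalar mixing values
# below are made up for illustration.
import numpy as np

grid, probs = np.polynomial.hermite.hermgauss(deg=8)
probs /= np.linalg.norm(probs, ord=1)   # normalize the quadrature weights

mix_loc, mix_scale = 0.5, 1.0           # hypothetical scalar mixing parameters
interpolate_weight = 1. / (1. + np.exp(-(mix_loc + np.sqrt(2.) * mix_scale * grid)))
# `interpolate_weight` has shape [8]; each entry blends the two endpoint affines.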