def _forward(self, x):
  """Apply the forward transform to `x`.

  When `self.power` equals zero the result is `exp(x)`. Otherwise the result
  is `exp(log1p(x * power) / power)`, which is `(1 + x * power)**(1 / power)`
  evaluated in log space.

  Args:
    x: Input passed to `self._maybe_assert_valid_x` and then transformed.

  Returns:
    The transformed value.
  """
  validity_checks = self._maybe_assert_valid_x(x)
  with tf.control_dependencies(validity_checks):
    if self.power == 0.:
      return tf.exp(x)
    # If large x accuracy is an issue, consider using:
    #   (1. + x * self.power)**(1. / self.power) when x >> 1.
    log_result = tf.math.log1p(x * self.power) / self.power
    return tf.exp(log_result)
def _marginal_hidden_probs(self):
  """Propagate the initial log-probabilities through every step.

  The initial log-probabilities (`self._log_init`) are broadcast to
  `batch_shape + [num_states]` and then advanced `num_steps - 1` times with
  `_log_vector_matrix` and `self._log_trans`. The initial vector and all
  advanced vectors are stacked along a new leading axis and exponentiated.

  Returns:
    A tensor laid out as `num_steps, batch_shape, num_states`.
  """
  state_shape = tf.concat(
      [self.batch_shape_tensor(), [self._num_states]], axis=0)
  initial_log_probs = tf.broadcast_to(self._log_init, state_shape)
  # initial_log_probs :: batch_shape num_states

  def _single_step():
    """Only the initial step exists; just add the leading step axis."""
    return initial_log_probs[tf.newaxis, ...]

  def _scan_multiple_steps():
    """Perform `scan` operation when `num_steps` > 1."""
    log_transitions = self._log_trans

    def advance(log_probs, _):
      return _log_vector_matrix(log_probs, log_transitions)

    # The scanned values are ignored; only their count drives the iteration.
    step_placeholders = tf.zeros(self._num_steps - 1, dtype=tf.float32)
    later_log_probs = tf.scan(
        advance,
        step_placeholders,
        initializer=initial_log_probs,
        name="forward_log_probs")
    return tf.concat([[initial_log_probs], later_log_probs], axis=0)

  forward_log_probs = prefer_static.cond(
      self._num_steps > 1, _scan_multiple_steps, _single_step)
  return tf.exp(forward_log_probs)
def loop_body(should_continue, k):
  """Draw fresh candidates for the entries that are not yet accepted.

  Args:
    should_continue: Boolean tensor marking entries that still need a sample.
    k: Current candidate samples; entries where `should_continue` is False
      are left untouched.

  Returns:
    A two-element list `[still_rejected, k]` with the updated mask and
    candidates.
  """
  # The range of U is chosen so that the resulting sample K lies in
  # [0, tf.int64.max). The final sample, if accepted, is K + 1.
  u = tf.random.uniform(
      shape,
      minval=minval_u,
      maxval=maxval_u,
      dtype=power.dtype,
      seed=seed())

  # Sample the point X from the continuous density h(x) \propto x^(-power).
  x = self._hat_integral_inverse(u, power=power)

  # Rejection-inversion requires a `hat` function, h(x) such that
  # \int_{k - .5}^{k + .5} h(x) dx >= pmf(k + 1) for points k in the
  # support. A natural hat function for us is h(x) = x^(-power).
  #
  # After sampling X from h(x), suppose it lies in the interval
  # (K - .5, K + .5) for integer K. Then the corresponding K is accepted
  # if it lies to the left of x_K, where x_K is defined by:
  #   \int_{x_k}^{K + .5} h(x) dx = H(x_K) - H(K + .5) = pmf(K + 1),
  # where H(x) = \int_x^inf h(x) dx.
  #
  # Solving for x_K, we find that x_K = H_inverse(H(K + .5) + pmf(K + 1)).
  # Or, the acceptance condition is X <= H_inverse(H(K + .5) + pmf(K + 1)).
  # Since X = H_inverse(U), this simplifies to U <= H(K + .5) + pmf(K + 1).

  # Update the non-accepted points.
  # Since X \in (K - .5, K + .5), the sample K is chosen as floor(X + 0.5).
  candidate = tf.floor(x + 0.5)
  k = tf.where(should_continue, candidate, k)

  hat_mass = self._hat_integral(k + .5, power=power)
  pmf_mass = tf.exp(self._log_prob(k + 1, power=power))
  accept = (u <= hat_mass + pmf_mass)

  still_rejected = should_continue & (~accept)
  return [still_rejected, k]
def _inverse(self, y):
  """Undo the affine map: subtract the shift, then divide out the scales.

  Each of `self.shift`, `self.scale` and `self.log_scale` is optional; a
  component that is `None` is skipped.

  Args:
    y: Value to map back.

  Returns:
    The inverse-transformed value.
  """
  shift, scale, log_scale = self.shift, self.scale, self.log_scale
  x = tf.identity(y)
  if shift is not None:
    x = x - shift
  if scale is not None:
    x = x / scale
  if log_scale is not None:
    # Dividing by exp(log_scale) is done as multiplying by exp(-log_scale).
    x = x * tf.exp(-log_scale)
  return x
def _forward(self, x):
  """Apply the affine map: multiply by the scales, then add the shift.

  Each of `self.scale`, `self.log_scale` and `self.shift` is optional; a
  component that is `None` is skipped.

  Args:
    x: Value to transform.

  Returns:
    The transformed value.
  """
  scale, log_scale, shift = self.scale, self.log_scale, self.shift
  y = tf.identity(x)
  if scale is not None:
    y = y * scale
  if log_scale is not None:
    y = y * tf.exp(log_scale)
  if shift is not None:
    y = y + shift
  return y
def _sample_n(self, n, seed=None):
  """Draw `n` samples by transforming uniform draws in log space.

  Computes `exp(log(scale) - log1p(-u) / concentration)` for uniform `u`,
  i.e. `scale * (1 - u)**(-1 / concentration)`.

  Args:
    n: Number of samples; becomes the leading dimension of the result.
    seed: Seed forwarded to `tf.random.uniform`.

  Returns:
    Samples shaped `[n] + batch_shape`.
  """
  concentration = tf.convert_to_tensor(self.concentration)
  scale = tf.convert_to_tensor(self.scale)
  batch_shape = self._batch_shape_tensor(
      concentration=concentration, scale=scale)
  sample_shape = tf.concat([[n], batch_shape], axis=0)
  uniform_draws = tf.random.uniform(
      sample_shape, maxval=1., seed=seed, dtype=self.dtype)
  log_scale = tf.math.log(scale)
  log_one_minus_u = tf.math.log1p(-uniform_draws)
  return tf.exp(log_scale - log_one_minus_u / concentration)
def grad(dy):
  """Computes a derivative for the min and max parameters.

  This function implements the derivative wrt the truncation bounds, which
  get blocked by the sampler. We use a custom expression for numerical
  stability instead of automatic differentiation on CDF for implicit
  gradients.

  Args:
    dy: output gradients

  Returns:
    A list `[grad_l, grad_u]`: the gradient wrt the lower bound followed by
    the gradient wrt the upper bound, each summed over the final (sample)
    axis.
  """
  # std_samples has an extra dimension (the sample dimension), expand
  # lower and upper so they broadcast along this dimension.
  # See note above regarding parameterized_truncated_normal, the sample
  # dimension is the final dimension.
  lower_b = lower[..., tf.newaxis]
  upper_b = upper[..., tf.newaxis]

  numerator = special_math.ndtr(std_samples) - special_math.ndtr(lower_b)
  denominator = special_math.ndtr(upper_b) - special_math.ndtr(lower_b)
  cdf_samples = numerator / denominator

  # tiny, eps are tolerance parameters to ensure we stay away from giving
  # a zero arg to the log CDF expression.
  float_info = np.finfo(dtype_util.as_numpy_dtype(self.dtype))
  tiny = float_info.tiny
  eps = float_info.eps
  cdf_samples = tf.clip_by_value(cdf_samples, tiny, 1 - eps)

  sq_samples_minus_upper = std_samples**2 - upper_b**2
  du = tf.exp(0.5 * sq_samples_minus_upper + tf.math.log(cdf_samples))
  sq_samples_minus_lower = std_samples**2 - lower_b**2
  dl = tf.exp(0.5 * sq_samples_minus_lower + tf.math.log1p(-cdf_samples))

  # Reduce the gradient across the samples
  grad_u = tf.reduce_sum(dy * du, axis=-1)
  grad_l = tf.reduce_sum(dy * dl, axis=-1)
  return [grad_l, grad_u]
def _cdf(self, x):
  """Evaluate the CDF at `x`.

  With `c = concentration` and `m = loc`, this computes
  `ndtr(sqrt(c / x) * (x / m - 1)) +
   exp(2 * c / m) * ndtr(-sqrt(c / x) * (x / m + 1))`.

  Args:
    x: Points at which to evaluate; checked by
      `self._maybe_assert_valid_sample`.

  Returns:
    The CDF values.
  """
  sample_checks = self._maybe_assert_valid_sample(x)
  with tf.control_dependencies(sample_checks):
    concentration = tf.convert_to_tensor(self.concentration)
    loc = tf.convert_to_tensor(self.loc)
    first_term = special_math.ndtr(
        (concentration / x) ** 0.5 * (x / loc - 1.))
    tail_weight = tf.exp(2. * concentration / loc)
    tail_term = special_math.ndtr(
        -(concentration / x) ** 0.5 * (x / loc + 1))
    return first_term + tail_weight * tail_term
def _log_cdf(self, x):
  """Evaluate the log-CDF at `x`, with a dedicated zero-concentration branch.

  For non-zero concentration `k` the result is
  `log1p(-(1 + k * z)**(-1 / k))`; where the concentration is exactly zero
  the result is `log1p(-exp(-z))`. Here `z = self._z(x, scale, k)`.

  Args:
    x: Points at which to evaluate.

  Returns:
    The log-CDF values.
  """
  scale = tf.convert_to_tensor(self.scale)
  concentration = tf.convert_to_tensor(self.concentration)
  z = self._z(x, scale, concentration)

  # Concentration = 0 ==> Exponential.
  is_zero_conc = tf.equal(concentration, 0)
  # Substitute 1 for zero concentrations so the general formula never
  # divides by zero; those entries are discarded by the final `where`.
  safe_conc = tf.where(
      is_zero_conc, tf.constant(1, self.dtype), concentration)

  general_case = tf.math.log1p(-(1 + safe_conc * z)**(-1 / safe_conc))
  exponential_case = tf.math.log1p(-tf.exp(-z))
  return tf.where(is_zero_conc, exponential_case, general_case)
def _bessel_ive(v, z, cache=None): """Computes I_v(z)*exp(-abs(z)) using a recurrence relation, where z > 0.""" # TODO(b/67497980): Switch to a more numerically faithful implementation. z = tf.convert_to_tensor(z) wrap = lambda result: tf.debugging.check_numerics(result, 'besseli{}'.format(v )) if float(v) >= 2: raise ValueError( 'Evaluating bessel_i by recurrence becomes imprecise for large v') cache = cache or {} safe_z = tf.where(z > 0, z, tf.ones_like(z)) if v in cache: return wrap(cache[v]) if v == 0: cache[v] = tf.math.bessel_i0e(z) elif v == 1: cache[v] = tf.math.bessel_i1e(z) elif v == 0.5: # sinh(x)*exp(-abs(x)), sinh(x) = (e^x - e^{-x}) / 2 sinhe = lambda x: (tf.exp(x - tf.abs(x)) - tf.exp(-x - tf.abs(x))) / 2 cache[v] = ( np.sqrt(2 / np.pi) * sinhe(z) * tf.where(z > 0, tf.math.rsqrt(safe_z), tf.ones_like(safe_z))) elif v == -0.5: # cosh(x)*exp(-abs(x)), cosh(x) = (e^x + e^{-x}) / 2 coshe = lambda x: (tf.exp(x - tf.abs(x)) + tf.exp(-x - tf.abs(x))) / 2 cache[v] = ( np.sqrt(2 / np.pi) * coshe(z) * tf.where(z > 0, tf.math.rsqrt(safe_z), tf.ones_like(safe_z))) if v <= 1: return wrap(cache[v]) # Recurrence relation: cache[v] = (_bessel_ive(v - 2, z, cache) - (2 * (v - 1)) * _bessel_ive(v - 1, z, cache) / z) return wrap(cache[v])
def _log_prob(self, x):
  """Approximate log-density at `x`.

  The exact HalfCauchy-Normal marginal log-density is analytically
  intractable; we compute a (relatively accurate) numerical approximation.
  This is a log space version of ref[2] from class docstring.

  Args:
    x: Points at which to evaluate.

  Returns:
    The approximate log-density values.
  """
  scale = tf.convert_to_tensor(self.scale)

  # Precomputed constants of the approximation.
  g = 0.5614594835668851  # tf.exp(-0.5772156649015328606)
  b = 1.0420764938351215  # tf.sqrt(2 * (1-g) / (g * (2-g)))
  h_inf = 1.0801359952503342  # (1-g)*(g*g-6*g+12) / (3*g * (2-g)**2 * b)

  xx = (x / scale)**2 / 2
  q = 20. / 47. * xx**1.0919284281983377
  h = 1. / (1 + xx**(1.5)) + h_inf * q / (1 + q)
  c = -.5 * np.log(2 * np.pi**3) - tf.math.log(g * scale)

  exp_correction = tf.math.log1p((1 - g) / g * tf.exp(-xx / (1 - g)))
  log_of_log_term = tf.math.log(
      tf.math.log1p(g / xx - (1 - g) / (h + b * xx)**2))
  return -exp_correction + log_of_log_term + c
def _logsum_expbig_minus_expsmall(big, small):
  """Stable evaluation of `Log[exp{big} - exp{small}]`.

  Factors `exp{big}` out of the difference so that only `exp{small - big}`
  is ever exponentiated. To work correctly, we should have the pointwise
  relation: `small <= big`.

  Args:
    big: Floating-point `Tensor`
    small: Floating-point `Tensor` with same `dtype` as `big` and
      broadcastable shape.

  Returns:
    `Tensor` of same `dtype` of `big` and broadcast shape.
  """
  with tf.name_scope("logsum_expbig_minus_expsmall"):
    gap = small - big
    log_one_minus_ratio = tf.math.log1p(-tf.exp(gap))
    return log_one_minus_ratio + big
def _sample_n(self, n, seed=None):
  """Draw `n` samples as a Poisson whose rate is Gamma distributed.

  Here we use the fact that if:
    lam ~ Gamma(concentration=total_count, rate=(1-probs)/probs)
  then X ~ Poisson(lam) is Negative Binomially distributed.

  Args:
    n: Number of samples to draw.
    seed: Seed used to build the `SeedStream` that feeds both samplers.

  Returns:
    Samples with dtype `self.dtype`.
  """
  logits = self._logits_parameter_no_checks()
  stream = SeedStream(seed, salt='NegativeBinomial')

  # The gamma draw consumes the first seed of the stream, the Poisson draw
  # the second.
  gamma_seed = stream()
  rate = tf.random.gamma(
      shape=[n],
      alpha=self.total_count,
      beta=tf.exp(-logits),
      dtype=self.dtype,
      seed=gamma_seed)

  poisson_seed = stream()
  return tf.random.poisson(
      lam=rate, shape=[], dtype=self.dtype, seed=poisson_seed)
def _finish_prob_for_one_fiber(self, y, x, ildj, event_ndims,
                               **distribution_kwargs):
  """Finish computation of prob on one element of the inverse image.

  Args:
    y: The original argument; only its static shape is used here.
    x: One pre-image of `y`.
    ildj: Inverse log-det-Jacobian associated with `x`.
    event_ndims: Number of event dimensions; the static shape is only set
      when this is a Python `int`.
    **distribution_kwargs: Forwarded to `self.distribution.prob`.

  Returns:
    The base distribution's prob at `x` scaled by `exp(ildj)`.
  """
  rotated_x = self._maybe_rotate_dims(x, rotate_right=True)
  prob = self.distribution.prob(rotated_x, **distribution_kwargs)

  event_override = self._is_maybe_event_override
  if event_override:
    prob = tf.reduce_prod(prob, axis=self._reduce_event_indices)

  jacobian_factor = tf.exp(tf.cast(ildj, prob.dtype))
  prob = prob * jacobian_factor

  if event_override and isinstance(event_ndims, int):
    y_shape = tensorshape_util.with_rank_at_least(y.shape, 1)
    static_shape = tf.broadcast_static_shape(
        y_shape[:-event_ndims], self.batch_shape)
    tensorshape_util.set_shape(prob, static_shape)
  return prob
def log_cdf_laplace(x, name="log_cdf_laplace"):
  """Log Laplace distribution function.

  This function calculates `Log[L(x)]`, where `L(x)` is the cumulative
  distribution function of the Laplace distribution, i.e.

  ```L(x) := 0.5 * int_{-infty}^x e^{-|t|} dt```

  For numerical accuracy, `L(x)` is computed in different ways depending on
  `x`,

  ```
  x <= 0:
    Log[L(x)] = Log[0.5] + x, which is exact

  0 < x:
    Log[L(x)] = Log[1 - 0.5 * e^{-x}], which is exact
  ```

  Args:
    x: `Tensor` of type `float32`, `float64`.
    name: Python string. A name for the operation
      (default="log_cdf_laplace").

  Returns:
    `Tensor` with `dtype=x.dtype`.
  """
  with tf.name_scope(name):
    x = tf.convert_to_tensor(x, name="x")

    # Negative branch: L(x) = 0.5 * exp{x} exactly, so
    # Log[L(x)] = log(0.5) + x.
    negative_branch = -np.log(2.) + x

    # exp{-|x|} equals exp{-x} for x > 0 but stays bounded above by 1, which
    # avoids
    #   log[1 - 1] = -inf for x = log(1/2), AND
    #   exp{-x} --> inf, for x << -1
    bounded_exp = tf.exp(-tf.abs(x))

    # log1p(z) = log(1 + z) approx z for |z| << 1. This approximation is
    # used internally by log1p, rather than being done explicitly here.
    nonnegative_branch = tf.math.log1p(-0.5 * bounded_exp)

    is_negative = x < 0.
    return tf.where(is_negative, negative_branch, nonnegative_branch)
def _hat_integral(self, x, power):
  """Integral of the `hat` function, used for sampling.

  We choose a `hat` function, h(x) = x^(-power), which is a continuous
  (unnormalized) density touching each positive integer at the
  (unnormalized) pmf. This function implements `hat` integral:
    H(x) = int_x^inf h(t) dt;
  which is needed for sampling purposes.

  The value is assembled in log space as
  `exp(-(power - 1) * log1p(x) - log(power - 1))`.

  Arguments:
    x: A Tensor of points x at which to evaluate H(x); cast to
      `power.dtype`.
    power: Power that parameterized hat function.

  Returns:
    A Tensor containing evaluation H(x) at x.
  """
  x = tf.cast(x, power.dtype)
  t = power - 1.
  log_hat_integral = (-t) * tf.math.log1p(x) - tf.math.log(t)
  return tf.exp(log_hat_integral)
def _kl_bernoulli_bernoulli(a, b, name=None):
  """Calculate the batched KL divergence KL(a || b) with a and b ProbitBernoulli.

  The divergence is the expectation under `a` of the difference between the
  outcome log-probabilities of `a` and `b`.

  Args:
    a: instance of a ProbitBernoulli distribution object.
    b: instance of a ProbitBernoulli distribution object.
    name: Python `str` name to use for created operations.
      Default value: `None` (i.e.,
      `'kl_probit_bernoulli_probit_bernoulli'`).

  Returns:
    Batchwise KL(a || b)
  """
  scope = name or 'kl_probit_bernoulli_probit_bernoulli'
  with tf.name_scope(scope):
    a_log0, a_log1 = a._outcome_log_probs()  # pylint: disable=protected-access
    b_log0, b_log1 = b._outcome_log_probs()  # pylint: disable=protected-access
    prob_one = tf.exp(a_log1)
    zero_term = (1. - prob_one) * (a_log0 - b_log0)
    one_term = prob_one * (a_log1 - b_log1)
    return zero_term + one_term
def _prob(self, y, **kwargs):
  """Evaluate the probability at `y`.

  Falls back to `exp(log_prob(y))` when the base distribution has no
  `_prob`. Otherwise `y` is pulled back through the bijector; for a
  non-injective bijector the contributions of all pre-images are summed.

  Args:
    y: Point(s) at which to evaluate.
    **kwargs: Split by `self._kwargs_split_fn` into distribution and
      bijector keyword arguments.

  Returns:
    The probability at `y`.
  """
  if not hasattr(self.distribution, "_prob"):
    return tf.exp(self.log_prob(y, **kwargs))

  distribution_kwargs, bijector_kwargs = self._kwargs_split_fn(kwargs)
  bijector = self.bijector
  x = bijector.inverse(y, **bijector_kwargs)
  event_ndims = self._maybe_get_static_event_ndims()
  ildj = bijector.inverse_log_det_jacobian(
      y, event_ndims=event_ndims, **bijector_kwargs)

  def finish(x_fiber, ildj_fiber):
    return self._finish_prob_for_one_fiber(
        y, x_fiber, ildj_fiber, event_ndims, **distribution_kwargs)

  if bijector._is_injective:  # pylint: disable=protected-access
    return finish(x, ildj)

  # Non-injective: `x` and `ildj` hold one entry per pre-image.
  fiber_probs = [finish(x_i, ildj_i) for x_i, ildj_i in zip(x, ildj)]
  return sum(fiber_probs)
def _kl_laplace_laplace(a, b, name=None):
  """Calculate the batched KL divergence KL(a || b) with a and b Laplace.

  Args:
    a: instance of a Laplace distribution object.
    b: instance of a Laplace distribution object.
    name: Python `str` name to use for created operations.
      Default value: `None` (i.e., `'kl_laplace_laplace'`).

  Returns:
    kl_div: Batchwise KL(a || b)
  """
  scope = name or 'kl_laplace_laplace'
  with tf.name_scope(scope):
    # Consistent with
    # http://www.mast.queensu.ca/~communications/Papers/gil-msc11.pdf, page 38
    distance = tf.abs(a.loc - b.loc)
    a_scale = tf.convert_to_tensor(a.scale)
    b_scale = tf.convert_to_tensor(b.scale)
    log_scale_ratio = tf.math.log(a_scale) - tf.math.log(b_scale)

    scaled_distance = distance / b_scale
    decay_term = tf.exp(-distance / a_scale + log_scale_ratio)
    return -log_scale_ratio + scaled_distance - 1. + decay_term
def _inverse(self, y):
  """Map `y` back through the inverse transform.

  Computes `exp(log1p(-(1 - y**concentration1)**concentration0))`, i.e.
  `1 - (1 - y**concentration1)**concentration0`.

  Args:
    y: Value to invert; first passed through `self._maybe_assert_valid`.

  Returns:
    The inverse-transformed value.
  """
  y = self._maybe_assert_valid(y)
  complement = (1 - y**self.concentration1)**self.concentration0
  return tf.exp(tf.math.log1p(-complement))
def _forward(self, x):
  """Apply the forward transform to `x`.

  Computes `(1 - (1 - x)**(1 / concentration0))**(1 / concentration1)` with
  both powers evaluated through `exp`/`log1p`.

  Args:
    x: Value to transform; first passed through `self._maybe_assert_valid`.

  Returns:
    The transformed value.
  """
  x = self._maybe_assert_valid(x)
  inner_power = tf.exp(tf.math.log1p(-x) / self.concentration0)
  outer_log = tf.math.log1p(-inner_power) / self.concentration1
  return tf.exp(outer_log)
def _stddev(self):
  """Return the standard deviation as `exp(0.5 * log_variance)`."""
  log_variance = self._log_variance()
  return tf.exp(0.5 * log_variance)
def _variance(self):
  """Return the variance by exponentiating `self._log_variance()`."""
  log_variance = self._log_variance()
  return tf.exp(log_variance)
def _mean(self, distributions=None):
  """Return the mean of the mixture.

  Computed as `exp(logsumexp(mixture_logits + log_rate))` over the last
  axis.

  Args:
    distributions: Optional `(dist, mixture_dist)` pair. When `None`, the
      pair is obtained from `self.poisson_and_mixture_distributions()`.

  Returns:
    The mean.
  """
  components = (
      self.poisson_and_mixture_distributions()
      if distributions is None else distributions)
  dist, mixture_dist = components
  log_weighted_rates = mixture_dist.logits + dist.log_rate
  log_mean = tf.reduce_logsumexp(log_weighted_rates, axis=-1)
  return tf.exp(log_mean)
def _variance(self):
  """Return the variance, computed as `exp(-logits) / probs`."""
  logits, probs = self._logits_and_probs_no_checks()
  inverse_odds = tf.exp(-logits)
  return inverse_odds / probs
def _mean(self):
  """Return the mean, computed as `exp(-logits)`."""
  logits = self._logits_parameter_no_checks()
  return tf.exp(-logits)
def _mean(self, df=None):
  """Return the mean for the given degrees of freedom.

  Computes `sqrt(2) * Gamma((df + 1) / 2) / Gamma(df / 2)`, with the Gamma
  ratio evaluated via `lgamma`.

  Args:
    df: Optional degrees of freedom; `self.df` is used when `None`.

  Returns:
    The mean.
  """
  df = tf.convert_to_tensor(self.df if df is None else df)
  log_gamma_ratio = (
      tf.math.lgamma(0.5 * (df + 1.)) - tf.math.lgamma(0.5 * df))
  return np.sqrt(2.) * tf.exp(log_gamma_ratio)
def _rate_parameter_no_checks(self):
  """Return the rate, deriving it from `_log_rate` when `_rate` is unset."""
  if self._rate is not None:
    return tf.identity(self._rate)
  return tf.exp(self._log_rate)
def _log_normalization(self, log_rate):
  """Return the log-normalizer, which equals `exp(log_rate)`.

  Args:
    log_rate: Log of the rate.

  Returns:
    `exp(log_rate)`.
  """
  rate = tf.exp(log_rate)
  return rate
def _entropy(self):
  """Return the entropy over the two outcomes.

  Computes `-(1 - p1) * log_p0 - p1 * log_p1`, where the log-probabilities
  come from `self._outcome_log_probs()` and `p1 = exp(log_p1)`.
  """
  log_p0, log_p1 = self._outcome_log_probs()
  p1 = tf.exp(log_p1)
  return -(1. - p1) * log_p0 - p1 * log_p1