def _cdf(self, x):
  low = tf.convert_to_tensor(self.low)
  high = tf.convert_to_tensor(self.high)
  peak = tf.convert_to_tensor(self.peak)
  interval_length = high - low
  # Because the PDF is not smooth at the peak, we have to treat each side
  # somewhat differently. The PDF is two line segments, and thus we get
  # quadratics here for the CDF.
  result_inside_interval = tf.where(
      (x >= low) & (x <= peak),
      # (x - low) ** 2 / ((high - low) * (peak - low))
      tf.math.squared_difference(x, low) / (interval_length * (peak - low)),
      # 1 - (high - x) ** 2 / ((high - low) * (high - peak))
      1. - tf.math.squared_difference(high, x) /
      (interval_length * (high - peak)))
  # We now add that the left tail is 0 and the right tail is 1.
  result_if_not_big = tf.where(x < low, tf.zeros_like(x),
                               result_inside_interval)
  return tf.where(x >= high, tf.ones_like(x), result_if_not_big)
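# Illustrative sketch (not part of the library): a plain-NumPy version of the
# same piecewise-quadratic CDF, useful for sanity-checking the branches above.
# Assumes scalar inputs with low < peak < high.
import numpy as np

def _triangular_cdf_reference(x, low, peak, high):
  interval_length = high - low
  if x < low:
    return 0.
  if x <= peak:
    return (x - low) ** 2 / (interval_length * (peak - low))
  if x < high:
    return 1. - (high - x) ** 2 / (interval_length * (high - peak))
  return 1.

# At the peak both quadratic branches evaluate to (peak - low) / (high - low).
assert np.isclose(_triangular_cdf_reference(0.5, 0., 0.5, 2.), 0.25)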
def _mode(self):
  concentration = tf.convert_to_tensor(self.concentration)
  rate = tf.convert_to_tensor(self.rate)
  mode = (concentration - 1.) / rate
  if self.allow_nan_stats:
    assertions = []
  else:
    assertions = [assert_util.assert_less(
        tf.ones([], self.dtype), concentration,
        message='Mode not defined when any concentration <= 1.')]
  with tf.control_dependencies(assertions):
    return tf.where(
        concentration > 1., mode,
        dtype_util.as_numpy_dtype(self.dtype)(np.nan))
def _prob(self, x):
  low = tf.convert_to_tensor(self.low)
  high = tf.convert_to_tensor(self.high)
  peak = tf.convert_to_tensor(self.peak)

  if self.validate_args:
    with tf.control_dependencies([
        assert_util.assert_greater_equal(x, low),
        assert_util.assert_less_equal(x, high)
    ]):
      x = tf.identity(x)

  interval_length = high - low
  # This is the PDF when low <= x <= high. The PDF looks like a triangle, so
  # we have to treat each line segment separately.
  result_inside_interval = tf.where(
      (x >= low) & (x <= peak),
      # Line segment from (low, 0) to (peak, 2 / (high - low)).
      2. * (x - low) / (interval_length * (peak - low)),
      # Line segment from (peak, 2 / (high - low)) to (high, 0).
      2. * (high - x) / (interval_length * (high - peak)))

  return tf.where((x < low) | (x > high), tf.zeros_like(x),
                  result_inside_interval)
def _mode(self):
  df = tf.convert_to_tensor(self.df)
  mode = df - 2.
  if self.allow_nan_stats:
    assertions = []
  else:
    assertions = [
        assert_util.assert_less(
            2. * tf.ones([], self.dtype), df,
            message='Mode not defined when df <= 2.')
    ]
  with tf.control_dependencies(assertions):
    return tf.where(df > 2., mode,
                    dtype_util.as_numpy_dtype(self.dtype)(np.nan))
def _cdf(self, x):
  x = tf.convert_to_tensor(x, name='x')
  flat_x = tf.reshape(x, shape=[-1])
  upper_bound = tf.searchsorted(self.outcomes, values=flat_x, side='right')
  values_at_ub = tf.gather(
      self.outcomes,
      indices=tf.minimum(
          upper_bound,
          dist_util.prefer_static_shape(self.outcomes)[-1] - 1))
  should_use_upper_bound = self._is_equal_or_close(flat_x, values_at_ub)
  indices = tf.where(should_use_upper_bound, upper_bound, upper_bound - 1)
  return self._categorical.cdf(
      tf.reshape(indices, shape=dist_util.prefer_static_shape(x)))
def _mean(self):
  concentration = tf.convert_to_tensor(self.concentration)
  scale = tf.convert_to_tensor(self.scale)
  mean = scale / (concentration - 1.)
  if self.allow_nan_stats:
    assertions = []
  else:
    assertions = [
        assert_util.assert_less(
            tf.ones([], self.dtype), concentration,
            message='mean undefined when any concentration <= 1')
    ]
  with tf.control_dependencies(assertions):
    return tf.where(concentration > 1., mean,
                    dtype_util.as_numpy_dtype(self.dtype)(np.nan))
def _cdf(self, x):
  # CDF(x) at positive integer x is the probability that the Zipf variable is
  # less than or equal to x; given by the formula:
  #     CDF(x) = 1 - (zeta(power, x + 1) / Z)
  # For fractional x, the CDF is equal to the CDF at n = floor(x).
  # For x < 1, the CDF is zero.

  # If interpolate_nondiscrete is True, we return a continuous relaxation
  # which agrees with the CDF at integer points.
  power = tf.convert_to_tensor(self.power)
  x = tf.cast(x, power.dtype)
  safe_x = tf.maximum(x if self.interpolate_nondiscrete else tf.floor(x), 0.)
  cdf = 1. - (tf.math.zeta(power, safe_x + 1.) / tf.math.zeta(power, 1.))
  return tf.where(x < 1., tf.zeros_like(cdf), cdf)
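# Illustrative sketch (not part of the library): the Hurwitz-zeta form used
# above, checked against a direct normalized partial sum with SciPy. Assumes
# scipy is available; `power` and `k` are arbitrary test values.
import numpy as np
from scipy.special import zeta

power, k = 2.5, 4
cdf_via_zeta = 1. - zeta(power, k + 1.) / zeta(power, 1.)
cdf_via_sum = sum(n ** -power for n in range(1, k + 1)) / zeta(power, 1.)
assert np.isclose(cdf_via_zeta, cdf_via_sum)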
def _mean(self):
  df = tf.convert_to_tensor(self.df)
  loc = tf.convert_to_tensor(self.loc)
  mean = loc * tf.ones(self._batch_shape_tensor(loc=loc), dtype=self.dtype)
  if self.allow_nan_stats:
    return tf.where(
        df > 1., mean,
        dtype_util.as_numpy_dtype(self.dtype)(np.nan))
  else:
    return distribution_util.with_dependencies([
        assert_util.assert_less(
            tf.ones([], dtype=self.dtype), df,
            message='mean not defined for components of df <= 1'),
    ], mean)
def _mode(self):
  concentration1 = tf.convert_to_tensor(self.concentration1)
  concentration0 = tf.convert_to_tensor(self.concentration0)
  mode = (concentration1 - 1.) / (concentration1 + concentration0 - 2.)
  with tf.control_dependencies([] if self.allow_nan_stats else [  # pylint: disable=g-long-ternary
      assert_util.assert_less(
          tf.ones([], dtype=self.dtype), concentration1,
          message="Mode undefined for concentration1 <= 1."),
      assert_util.assert_less(
          tf.ones([], dtype=self.dtype), concentration0,
          message="Mode undefined for concentration0 <= 1.")
  ]):
    return tf.where((concentration1 > 1.) & (concentration0 > 1.),
                    mode,
                    dtype_util.as_numpy_dtype(self.dtype)(np.nan))
def body_fn(w, should_continue):
  z = beta.sample(sample_shape=sample_batch_shape, seed=seed())
  # set_shape needed here because of b/139013403
  z.set_shape(w.shape)
  w = tf.where(should_continue,
               (1 - (1 + b) * z) / (1 - (1 - b) * z),
               w)
  w = tf.debugging.check_numerics(w, 'w')
  unif = tf.random.uniform(
      sample_batch_shape, seed=seed(), dtype=self.dtype)
  # set_shape needed here because of b/139013403
  unif.set_shape(w.shape)
  should_continue = tf.logical_and(
      should_continue,
      self.concentration * w + dim * tf.math.log1p(-x * w) - c <
      tf.math.log(unif))
  return w, should_continue
def log_cdf_laplace(x, name="log_cdf_laplace"):
  """Log Laplace distribution function.

  This function calculates `Log[L(x)]`, where `L(x)` is the cumulative
  distribution function of the Laplace distribution, i.e.

  ```L(x) := 0.5 * int_{-infty}^x e^{-|t|} dt```

  For numerical accuracy, `L(x)` is computed in different ways depending on
  `x`,

  ```
  x <= 0:
    Log[L(x)] = Log[0.5] + x, which is exact

  0 < x:
    Log[L(x)] = Log[1 - 0.5 * e^{-x}], which is exact
  ```

  Args:
    x: `Tensor` of type `float32`, `float64`.
    name: Python string. A name for the operation (default="log_cdf_laplace").

  Returns:
    `Tensor` with `dtype=x.dtype`.

  Raises:
    TypeError: if `x.dtype` is not handled.
  """
  with tf.name_scope(name):
    x = tf.convert_to_tensor(x, name="x")

    # For x < 0, L(x) = 0.5 * exp{x} exactly, so Log[L(x)] = log(0.5) + x.
    lower_solution = -np.log(2.) + x

    # safe_exp_neg_x = exp{-x} for x > 0, but is
    # bounded above by 1, which avoids
    #   log[1 - 1] = -inf for x = log(1/2), AND
    #   exp{-x} --> inf, for x << -1
    safe_exp_neg_x = tf.exp(-tf.abs(x))

    # log1p(z) = log(1 + z) approx z for |z| << 1. This approximation is used
    # internally by log1p, rather than being done explicitly here.
    upper_solution = tf.math.log1p(-0.5 * safe_exp_neg_x)

    return tf.where(x < 0., lower_solution, upper_solution)
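# Illustrative sketch (not part of the library): a NumPy rendering of the two
# branches above, handy for spot-checking log_cdf_laplace on a few points.
import numpy as np

def _log_cdf_laplace_reference(x):
  x = np.asarray(x, dtype=np.float64)
  # x < 0: log(0.5) + x exactly; x >= 0: log1p(-0.5 * exp(-x)).
  return np.where(x < 0., np.log(0.5) + x, np.log1p(-0.5 * np.exp(-np.abs(x))))

print(_log_cdf_laplace_reference([-50., -1., 0., 1., 50.]))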
def _mode(self):
  a = tf.convert_to_tensor(self.concentration1)
  b = tf.convert_to_tensor(self.concentration0)
  mode = ((a - 1) / (a * b - 1))**(1. / a)
  if self.allow_nan_stats:
    return tf.where((a > 1.) & (b > 1.),
                    mode,
                    dtype_util.as_numpy_dtype(self.dtype)(np.nan))

  return distribution_util.with_dependencies([
      assert_util.assert_less(
          tf.ones([], dtype=a.dtype), a,
          message="Mode undefined for concentration1 <= 1."),
      assert_util.assert_less(
          tf.ones([], dtype=b.dtype), b,
          message="Mode undefined for concentration0 <= 1.")
  ], mode)
def _mean(self):
  concentration = tf.convert_to_tensor(self.concentration)
  mixing_concentration = tf.convert_to_tensor(self.mixing_concentration)
  mixing_rate = tf.convert_to_tensor(self.mixing_rate)
  mean = concentration * mixing_rate / (mixing_concentration - 1.)
  if self.allow_nan_stats:
    return tf.where(mixing_concentration > 1., mean,
                    dtype_util.as_numpy_dtype(self.dtype)(np.nan))
  else:
    with tf.control_dependencies([
        assert_util.assert_less(
            tf.ones([], self.dtype), mixing_concentration,
            message='mean undefined when `mixing_concentration` <= 1'),
    ]):
      return tf.identity(mean)
def _mode(self):
  concentration = tf.convert_to_tensor(self.concentration)
  k = tf.cast(tf.shape(concentration)[-1], self.dtype)
  total_concentration = tf.reduce_sum(concentration, axis=-1)
  mode = (concentration - 1.) / (total_concentration[..., tf.newaxis] - k)
  if self.allow_nan_stats:
    return tf.where(
        tf.reduce_all(concentration > 1., axis=-1, keepdims=True),
        mode,
        dtype_util.as_numpy_dtype(self.dtype)(np.nan))
  assertions = [
      assert_util.assert_less(
          tf.ones([], self.dtype), concentration,
          message='Mode undefined when any concentration <= 1')
  ]
  with tf.control_dependencies(assertions):
    return tf.identity(mode)
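# Illustrative sketch (not part of the library): the Dirichlet mode formula
# above, (alpha_i - 1) / (sum(alpha) - k), evaluated at a concrete point.
import numpy as np

concentration = np.array([2., 3., 5.])
mode = (concentration - 1.) / (concentration.sum() - concentration.size)
print(mode)        # [1/7, 2/7, 4/7]
print(mode.sum())  # 1.0 -- the mode lies on the simplex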
def _variance(self):
  concentration = tf.convert_to_tensor(self.concentration)
  scale = tf.convert_to_tensor(self.scale)
  var = (
      tf.square(scale) / tf.square(concentration - 1.) /
      (concentration - 2.))
  if self.allow_nan_stats:
    assertions = []
  else:
    assertions = [
        assert_util.assert_less(
            tf.constant(2., dtype=self.dtype),
            concentration,
            message='variance undefined when any concentration <= 2')
    ]
  with tf.control_dependencies(assertions):
    return tf.where(concentration > 2., var,
                    dtype_util.as_numpy_dtype(self.dtype)(np.nan))
def non_negative_axis(axis, rank, name=None):  # pylint:disable=redefined-outer-name
  """Make (possibly negatively indexed) `axis` argument non-negative."""
  with tf.name_scope(name or 'non_negative_axis'):
    if axis is None:
      return None
    if rank is None:
      raise ValueError('Argument `rank` cannot be `None`.')
    dtype = dtype_util.as_numpy_dtype(
        dtype_util.common_dtype([axis, rank], dtype_hint=tf.int32))
    rank_ = tf.get_static_value(rank)
    axis_ = tf.get_static_value(axis)
    if rank_ is None or axis_ is None:
      axis = tf.convert_to_tensor(axis, dtype=dtype, name='axis')
      rank = tf.convert_to_tensor(rank, dtype=dtype, name='rank')
      return tf.where(axis < 0, rank + axis, axis)
    axis_ = np.array(axis_, dtype=dtype)
    rank_ = np.array(rank_, dtype=dtype)
    return np.where(axis_ < 0, axis_ + rank_, axis_)
def _variance(self):
  concentration = tf.convert_to_tensor(self.concentration)
  mixing_concentration = tf.convert_to_tensor(self.mixing_concentration)
  mixing_rate = tf.convert_to_tensor(self.mixing_rate)
  variance = (
      tf.square(concentration * mixing_rate / (mixing_concentration - 1.)) /
      (mixing_concentration - 2.))
  if self.allow_nan_stats:
    return tf.where(mixing_concentration > 2., variance,
                    dtype_util.as_numpy_dtype(self.dtype)(np.nan))
  else:
    with tf.control_dependencies([
        assert_util.assert_less(
            tf.ones([], self.dtype) * 2., mixing_concentration,
            message='variance undefined when `mixing_concentration` <= 2'),
    ]):
      return tf.identity(variance)
def _sqrtx2p1(x):
  """Implementation of `sqrt(1 + x**2)` which is stable despite large `x`."""
  sqrt_eps = np.sqrt(np.finfo(dtype_util.as_numpy_dtype(x.dtype)).eps)
  return tf.where(
      tf.abs(x) * sqrt_eps <= 1.,
      tf.sqrt(x**2. + 1.),
      # For large x, calculating x**2 can overflow. This can be alleviated by
      # considering:
      #   sqrt(1 + x**2)
      #   = exp(0.5 log(1 + x**2))
      #   = exp(0.5 log(x**2 * (1 + x**-2)))
      #   = exp(log(x) + 0.5 * log(1 + x**-2))
      #   = |x| * exp(0.5 log(1 + x**-2))
      #   = |x| * sqrt(1 + x**-2)
      # We omit the last term in this approximation.
      # When |x| > 1 / sqrt(machine epsilon), the second term will be 1,
      # due to sqrt(1 + x**-2) = 1. This is also true with the gradient term,
      # and higher order gradients, since the first order derivative of
      # sqrt(1 + x**-2) is -2 * x**-3 / (1 + x**-2) = -2 / (x**3 + x),
      # and all nth-order derivatives will be O(x**-(n + 2)). This makes any
      # gradient terms that contain any derivatives of sqrt(1 + x**-2) vanish.
      tf.abs(x))
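# Illustrative sketch (not part of the library): why the large-|x| branch just
# returns |x|. In float32 the naive form overflows, while |x| (or np.hypot) is
# already correct to machine precision at that scale.
import numpy as np

x = np.float32(1e20)
naive = np.sqrt(x * x + np.float32(1.))   # inf: x * x overflows float32
stable = np.abs(x)                        # 1e20
print(naive, stable, np.hypot(x, np.float32(1.)))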
def pinv(a, rcond=None, validate_args=False, name=None):
  """Compute the Moore-Penrose pseudo-inverse of a matrix.

  Calculate the [generalized inverse of a matrix](
  https://en.wikipedia.org/wiki/Moore%E2%80%93Penrose_inverse) using its
  singular-value decomposition (SVD) and including all large singular values.

  The pseudo-inverse of a matrix `A`, is defined as: 'the matrix that 'solves'
  [the least-squares problem] `A @ x = b`,' i.e., if `x_hat` is a solution,
  then `A_pinv` is the matrix such that `x_hat = A_pinv @ b`. It can be shown
  that if `U @ Sigma @ V.T = A` is the singular value decomposition of `A`,
  then `A_pinv = V @ inv(Sigma) @ U^T`. [(Strang, 1980)][1]

  This function is analogous to [`numpy.linalg.pinv`](
  https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.pinv.html).
  It differs only in default value of `rcond`. In `numpy.linalg.pinv`, the
  default `rcond` is `1e-15`. Here the default is
  `10. * max(num_rows, num_cols) * np.finfo(dtype).eps`.

  Args:
    a: (Batch of) `float`-like matrix-shaped `Tensor`(s) which are to be
      pseudo-inverted.
    rcond: `Tensor` of small singular value cutoffs. Singular values smaller
      (in modulus) than `rcond` * largest_singular_value (again, in modulus)
      are set to zero. Must broadcast against `tf.shape(a)[:-2]`.
      Default value: `10. * max(num_rows, num_cols) * np.finfo(a.dtype).eps`.
    validate_args: When `True`, additional assertions might be embedded in the
      graph.
      Default value: `False` (i.e., no graph assertions are added).
    name: Python `str` prefixed to ops created by this function.
      Default value: 'pinv'.

  Returns:
    a_pinv: The pseudo-inverse of input `a`. Has same shape as `a` except
      rightmost two dimensions are transposed.

  Raises:
    TypeError: if input `a` does not have `float`-like `dtype`.
    ValueError: if input `a` has fewer than 2 dimensions.

  #### Examples

  ```python
  from tensorflow_probability.python.internal.backend import numpy as tf
  import tensorflow_probability as tfp; tfp = tfp.experimental.substrates.numpy

  a = tf.constant([[1.,  0.4,  0.5],
                   [0.4, 0.2,  0.25],
                   [0.5, 0.25, 0.35]])
  tf.matmul(tfp.math.pinv(a), a)
  # ==> array([[1., 0., 0.],
  #            [0., 1., 0.],
  #            [0., 0., 1.]], dtype=float32)

  a = tf.constant([[1.,  0.4,  0.5,  1.],
                   [0.4, 0.2,  0.25, 2.],
                   [0.5, 0.25, 0.35, 3.]])
  tf.matmul(tfp.math.pinv(a), a)
  # ==> array([[ 0.76,  0.37,  0.21, -0.02],
  #            [ 0.37,  0.43, -0.33,  0.02],
  #            [ 0.21, -0.33,  0.81,  0.01],
  #            [-0.02,  0.02,  0.01,  1.  ]], dtype=float32)
  ```

  #### References

  [1]: G. Strang. 'Linear Algebra and Its Applications, 2nd Ed.' Academic
       Press, Inc., 1980, pp. 139-142.
  """
  with tf.name_scope(name or 'pinv'):
    a = tf.convert_to_tensor(a, name='a')

    assertions = _maybe_validate_matrix(a, validate_args)
    if assertions:
      with tf.control_dependencies(assertions):
        a = tf.identity(a)

    dtype = dtype_util.as_numpy_dtype(a.dtype)

    if rcond is None:
      def get_dim_size(dim):
        if tf.compat.dimension_value(a.shape[dim]) is not None:
          return tf.compat.dimension_value(a.shape[dim])
        return tf.shape(a)[dim]

      num_rows = get_dim_size(-2)
      num_cols = get_dim_size(-1)
      if isinstance(num_rows, int) and isinstance(num_cols, int):
        max_rows_cols = float(max(num_rows, num_cols))
      else:
        max_rows_cols = tf.cast(tf.maximum(num_rows, num_cols), dtype)
      rcond = 10. * max_rows_cols * np.finfo(dtype).eps

    rcond = tf.convert_to_tensor(rcond, dtype=dtype, name='rcond')

    # Calculate pseudo inverse via SVD.
    # Note: if a is symmetric then u == v. (We might observe additional
    # performance by explicitly setting `v = u` in such cases.)
    [
        singular_values,         # Sigma
        left_singular_vectors,   # U
        right_singular_vectors,  # V
    ] = tf.linalg.svd(a, full_matrices=False, compute_uv=True)

    # Saturate small singular values to inf. This has the effect of making
    # `1. / s = 0.` while not resulting in `NaN` gradients.
    cutoff = rcond * tf.reduce_max(singular_values, axis=-1)
    singular_values = tf.where(singular_values > cutoff[..., tf.newaxis],
                               singular_values,
                               np.array(np.inf, dtype))

    # Although `a == tf.matmul(u, s * v, transpose_b=True)` we swap
    # `u` and `v` here so that `tf.matmul(pinv(A), A) = tf.eye()`, i.e.,
    # a matrix inverse has 'transposed' semantics.
    a_pinv = tf.matmul(
        right_singular_vectors / singular_values[..., tf.newaxis, :],
        left_singular_vectors,
        adjoint_b=True)

    if tensorshape_util.rank(a.shape) is not None:
      a_pinv.set_shape(a.shape[:-2].concatenate(
          [a.shape[-1], a.shape[-2]]))

    return a_pinv
def _slice_single_param(param, param_event_ndims, slices, dist_batch_shape):
  """Slices a single parameter of a distribution.

  Args:
    param: A `Tensor`, the original parameter to slice.
    param_event_ndims: `int` event parameterization rank for this parameter.
    slices: A `tuple` of normalized slices.
    dist_batch_shape: The distribution's batch shape `Tensor`.

  Returns:
    new_param: A `Tensor`, batch-sliced according to slices.
  """
  # Extend param shape with ones on the left to match dist_batch_shape.
  param_shape = tf.shape(input=param)
  insert_ones = tf.ones(
      [tf.size(input=dist_batch_shape) + param_event_ndims - tf.rank(param)],
      dtype=param_shape.dtype)
  new_param_shape = tf.concat([insert_ones, param_shape], axis=0)
  full_batch_param = tf.reshape(param, new_param_shape)
  param_slices = []
  # We separately track the batch axis from the parameter axis because we want
  # them to align for positive indexing, and be offset by param_event_ndims
  # for negative indexing.
  param_dim_idx = 0
  batch_dim_idx = 0
  for slc in slices:
    if slc is tf.newaxis:
      param_slices.append(slc)
      continue
    if slc is Ellipsis:
      if batch_dim_idx < 0:
        raise ValueError('Found multiple `...` in slices {}'.format(slices))
      param_slices.append(slc)
      # Switch over to negative indexing for the broadcast check.
      num_remaining_non_newaxis_slices = sum(
          [s is not tf.newaxis
           for s in slices[slices.index(Ellipsis) + 1:]])
      batch_dim_idx = -num_remaining_non_newaxis_slices
      param_dim_idx = batch_dim_idx - param_event_ndims
      continue
    # Find the batch dimension sizes for both parameter and distribution.
    param_dim_size = new_param_shape[param_dim_idx]
    batch_dim_size = dist_batch_shape[batch_dim_idx]
    is_broadcast = batch_dim_size > param_dim_size
    # Slices are denoted by start:stop:step.
    if isinstance(slc, slice):
      start, stop, step = slc.start, slc.stop, slc.step
      if start is not None:
        start = tf.where(is_broadcast, 0, start)
      if stop is not None:
        stop = tf.where(is_broadcast, 1, stop)
      if step is not None:
        step = tf.where(is_broadcast, 1, step)
      param_slices.append(slice(start, stop, step))
    else:  # int, or int Tensor, e.g. d[d.batch_shape_tensor()[0] // 2]
      param_slices.append(tf.where(is_broadcast, 0, slc))
    param_dim_idx += 1
    batch_dim_idx += 1
  param_slices.extend([ALL_SLICE] * param_event_ndims)
  return full_batch_param.__getitem__(param_slices)
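# Illustrative sketch (not part of the library) of the broadcast handling
# above: when a parameter's batch dimension has size 1 but the distribution's
# batch dimension has size 5, a user slice like [2:4] must collapse to [0:1]
# on the parameter so it keeps broadcasting, instead of slicing the size-1
# dimension down to nothing.
import numpy as np

dist_batch_size = 5
param = np.zeros([1, 3])                  # leading batch dim is broadcast
user_slice = slice(2, 4)
is_broadcast = dist_batch_size > param.shape[0]
adjusted = slice(0, 1) if is_broadcast else user_slice
print(param[adjusted].shape)              # (1, 3): broadcast dim stays size 1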
def _mode(self):
  s = self.df - self.dimension - 1.
  s = tf.where(s < 0., dtype_util.as_numpy_dtype(s.dtype)(np.nan), s)
  if self.input_output_cholesky:
    return tf.sqrt(s) * self.scale_operator.to_dense()
  return s * self._square_scale_operator()
def _sample_n(self, n, seed=None):
  power = tf.convert_to_tensor(self.power)
  shape = tf.concat([[n], tf.shape(power)], axis=0)
  has_seed = seed is not None
  seed = SeedStream(seed, salt='zipf')

  minval_u = self._hat_integral(0.5, power=power) + 1.
  maxval_u = self._hat_integral(tf.int64.max - 0.5, power=power)

  def loop_body(should_continue, k):
    """Resample the non-accepted points."""
    # The range of U is chosen so that the resulting sample K lies in
    # [0, tf.int64.max). The final sample, if accepted, is K + 1.
    u = tf.random.uniform(
        shape,
        minval=minval_u,
        maxval=maxval_u,
        dtype=power.dtype,
        seed=seed())
    # Sample the point X from the continuous density h(x) \propto x^(-power).
    x = self._hat_integral_inverse(u, power=power)

    # Rejection-inversion requires a `hat` function, h(x) such that
    # \int_{k - .5}^{k + .5} h(x) dx >= pmf(k + 1) for points k in the
    # support. A natural hat function for us is h(x) = x^(-power).
    #
    # After sampling X from h(x), suppose it lies in the interval
    # (K - .5, K + .5) for integer K. Then the corresponding K is accepted if
    # it lies to the left of x_K, where x_K is defined by:
    #   \int_{x_K}^{K + .5} h(x) dx = H(x_K) - H(K + .5) = pmf(K + 1),
    # where H(x) = \int_x^inf h(x) dx.

    # Solving for x_K, we find that x_K = H_inverse(H(K + .5) + pmf(K + 1)).
    # Or, the acceptance condition is X <= H_inverse(H(K + .5) + pmf(K + 1)).
    # Since X = H_inverse(U), this simplifies to U <= H(K + .5) + pmf(K + 1).

    # Update the non-accepted points.
    # Since X \in (K - .5, K + .5), the sample K is chosen as floor(X + 0.5).
    k = tf.where(should_continue, tf.floor(x + 0.5), k)
    accept = (u <= self._hat_integral(k + .5, power=power) + tf.exp(
        self._log_prob(k + 1, power=power)))

    return [should_continue & (~accept), k]

  should_continue, samples = tf.while_loop(
      cond=lambda should_continue, *ignore: tf.reduce_any(should_continue),
      body=loop_body,
      loop_vars=[
          tf.ones(shape, dtype=tf.bool),       # should_continue
          tf.zeros(shape, dtype=power.dtype),  # k
      ],
      parallel_iterations=1 if has_seed else 10,
      maximum_iterations=self.sample_maximum_iterations,
  )
  samples = samples + 1.

  if self.validate_args and dtype_util.is_integer(self.dtype):
    samples = distribution_util.embed_check_integer_casting_closed(
        samples, target_dtype=self.dtype, assert_positive=True)

  samples = tf.cast(samples, self.dtype)

  if self.validate_args:
    npdt = dtype_util.as_numpy_dtype(self.dtype)
    v = npdt(dtype_util.min(npdt) if dtype_util.is_integer(npdt) else np.nan)
    samples = tf.where(should_continue, v, samples)

  return samples
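# Illustrative sketch (not part of the library): one common closed form for
# the hat integral used in rejection-inversion with h(x) = x**(-power). The
# actual _hat_integral / _hat_integral_inverse helpers may use a shifted
# variant, so treat these formulas as an assumption for illustration only.
import numpy as np

power = 3.
H = lambda x: x ** (1. - power) / (power - 1.)             # int_x^inf t**-power dt
H_inv = lambda u: ((power - 1.) * u) ** (1. / (1. - power))
x = 2.5
assert np.isclose(H_inv(H(x)), x)                          # inverse round-trips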
def reduce_weighted_logsumexp(logx,
                              w=None,
                              axis=None,
                              keep_dims=False,
                              return_sign=False,
                              name=None):
  """Computes `log(abs(sum(weight * exp(elements across tensor dimensions))))`.

  If all weights `w` are known to be positive, it is more efficient to directly
  use `reduce_logsumexp`, i.e., `tf.reduce_logsumexp(logx + tf.log(w))` is more
  efficient than `du.reduce_weighted_logsumexp(logx, w)`.

  Reduces `input_tensor` along the dimensions given in `axis`. Unless
  `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
  `axis`. If `keep_dims` is true, the reduced dimensions are retained with
  length 1.

  If `axis` has no entries, all dimensions are reduced, and a tensor with a
  single element is returned.

  This function is more numerically stable than `log(sum(w * exp(input)))`. It
  avoids overflows caused by taking the exp of large inputs and underflows
  caused by taking the log of small inputs.

  For example:

  ```python
  x = tf.constant([[0., 0, 0],
                   [0, 0, 0]])

  w = tf.constant([[-1., 1, 1],
                   [1, 1, 1]])

  du.reduce_weighted_logsumexp(x, w)
  # ==> log(-1*1 + 1*1 + 1*1 + 1*1 + 1*1 + 1*1) = log(4)

  du.reduce_weighted_logsumexp(x, w, axis=0)
  # ==> [log(-1+1), log(1+1), log(1+1)]

  du.reduce_weighted_logsumexp(x, w, axis=1)
  # ==> [log(-1+1+1), log(1+1+1)]

  du.reduce_weighted_logsumexp(x, w, axis=1, keep_dims=True)
  # ==> [[log(-1+1+1)], [log(1+1+1)]]

  du.reduce_weighted_logsumexp(x, w, axis=[0, 1])
  # ==> log(-1+5)
  ```

  Args:
    logx: The tensor to reduce. Should have numeric type.
    w: The weight tensor. Should have numeric type identical to `logx`.
    axis: The dimensions to reduce. If `None` (the default), reduces all
      dimensions. Must be in the range `[-rank(input_tensor),
      rank(input_tensor))`.
    keep_dims: If true, retains reduced dimensions with length 1.
    return_sign: If `True`, returns the sign of the result.
    name: A name for the operation (optional).

  Returns:
    lswe: The `log(abs(sum(weight * exp(x))))` reduced tensor.
    sign: (Optional) The sign of `sum(weight * exp(x))`.
  """
  with tf.name_scope(name or 'reduce_weighted_logsumexp'):
    logx = tf.convert_to_tensor(logx, name='logx')
    if w is None:
      lswe = tf.reduce_logsumexp(logx, axis=axis, keepdims=keep_dims)
      if return_sign:
        sgn = tf.ones_like(lswe)
        return lswe, sgn
      return lswe
    w = tf.convert_to_tensor(w, dtype=logx.dtype, name='w')
    log_absw_x = logx + tf.math.log(tf.abs(w))
    max_log_absw_x = tf.reduce_max(log_absw_x, axis=axis, keepdims=True)
    # If the largest element is `-inf` or `inf` then we don't bother
    # subtracting off the max. We do this because otherwise we'd get
    # `inf - inf = NaN`. That this is ok follows from the fact that we're
    # actually free to subtract any value we like, so long as we add it back
    # after taking the `log(sum(...))`.
    max_log_absw_x = tf.where(
        tf.math.is_inf(max_log_absw_x),
        tf.zeros([], max_log_absw_x.dtype),
        max_log_absw_x)
    wx_over_max_absw_x = (tf.sign(w) * tf.exp(log_absw_x - max_log_absw_x))
    sum_wx_over_max_absw_x = tf.reduce_sum(
        wx_over_max_absw_x, axis=axis, keepdims=keep_dims)
    if not keep_dims:
      max_log_absw_x = tf.squeeze(max_log_absw_x, axis)
    sgn = tf.sign(sum_wx_over_max_absw_x)
    lswe = max_log_absw_x + tf.math.log(sgn * sum_wx_over_max_absw_x)
    if return_sign:
      return lswe, sgn
    return lswe
def _mode(self):
  total_count = tf.convert_to_tensor(self.total_count)
  adjusted_count = tf.where(1. < total_count, total_count - 1.,
                            tf.zeros_like(total_count))
  return tf.floor(adjusted_count * tf.exp(self._logits_parameter_no_checks()))
def _variance(self):
  concentration = tf.convert_to_tensor(self.concentration)
  valid_variance = (
      self.scale**2 * concentration /
      ((concentration - 1.)**2 * (concentration - 2.)))
  return tf.where(concentration > 2., valid_variance,
                  dtype_util.as_numpy_dtype(self.dtype)(np.inf))
def _mean(self):
  concentration = tf.convert_to_tensor(self.concentration)
  return tf.where(concentration > 1.,
                  concentration * self.scale / (concentration - 1),
                  dtype_util.as_numpy_dtype(self.dtype)(np.inf))
def _ndtri(p):
  """Implements ndtri core logic."""

  # Constants used in piece-wise rational approximations. Taken from the
  # cephes library:
  # https://root.cern.ch/doc/v608/SpecFuncCephesInv_8cxx_source.html
  p0 = list(reversed([-5.99633501014107895267E1,
                      9.80010754185999661536E1,
                      -5.66762857469070293439E1,
                      1.39312609387279679503E1,
                      -1.23916583867381258016E0]))
  q0 = list(reversed([1.0,
                      1.95448858338141759834E0,
                      4.67627912898881538453E0,
                      8.63602421390890590575E1,
                      -2.25462687854119370527E2,
                      2.00260212380060660359E2,
                      -8.20372256168333339912E1,
                      1.59056225126211695515E1,
                      -1.18331621121330003142E0]))
  p1 = list(reversed([4.05544892305962419923E0,
                      3.15251094599893866154E1,
                      5.71628192246421288162E1,
                      4.40805073893200834700E1,
                      1.46849561928858024014E1,
                      2.18663306850790267539E0,
                      -1.40256079171354495875E-1,
                      -3.50424626827848203418E-2,
                      -8.57456785154685413611E-4]))
  q1 = list(reversed([1.0,
                      1.57799883256466749731E1,
                      4.53907635128879210584E1,
                      4.13172038254672030440E1,
                      1.50425385692907503408E1,
                      2.50464946208309415979E0,
                      -1.42182922854787788574E-1,
                      -3.80806407691578277194E-2,
                      -9.33259480895457427372E-4]))
  p2 = list(reversed([3.23774891776946035970E0,
                      6.91522889068984211695E0,
                      3.93881025292474443415E0,
                      1.33303460815807542389E0,
                      2.01485389549179081538E-1,
                      1.23716634817820021358E-2,
                      3.01581553508235416007E-4,
                      2.65806974686737550832E-6,
                      6.23974539184983293730E-9]))
  q2 = list(reversed([1.0,
                      6.02427039364742014255E0,
                      3.67983563856160859403E0,
                      1.37702099489081330271E0,
                      2.16236993594496635890E-1,
                      1.34204006088543189037E-2,
                      3.28014464682127739104E-4,
                      2.89247864745380683936E-6,
                      6.79019408009981274425E-9]))

  def _create_polynomial(var, coeffs):
    """Compute n_th order polynomial via Horner's method."""
    coeffs = np.array(coeffs, dtype_util.as_numpy_dtype(var.dtype))
    if not coeffs.size:
      return tf.zeros_like(var)
    return coeffs[0] + _create_polynomial(var, coeffs[1:]) * var

  maybe_complement_p = tf.where(p > -np.expm1(-2.), 1. - p, p)
  # Write in an arbitrary value in place of 0 for p since 0 will cause NaNs
  # later on. The result from the computation when p == 0 is not used so any
  # number that doesn't result in NaNs is fine.
  sanitized_mcp = tf.where(
      maybe_complement_p <= 0.,
      dtype_util.as_numpy_dtype(p.dtype)(0.5),
      maybe_complement_p)

  # Compute x for p > exp(-2): x/sqrt(2pi) = w + w**3 P0(w**2)/Q0(w**2).
  w = sanitized_mcp - 0.5
  ww = w ** 2
  x_for_big_p = w + w * ww * (_create_polynomial(ww, p0)
                              / _create_polynomial(ww, q0))
  x_for_big_p *= -np.sqrt(2. * np.pi)

  # Compute x for p <= exp(-2): x = z - log(z)/z - (1/z) P(1/z) / Q(1/z),
  # where z = sqrt(-2. * log(p)), and P/Q are chosen between two different
  # arrays based on whether p < exp(-32).
  z = tf.sqrt(-2. * tf.math.log(sanitized_mcp))
  first_term = z - tf.math.log(z) / z
  second_term_small_p = (
      _create_polynomial(1. / z, p2)
      / _create_polynomial(1. / z, q2)
      / z)
  second_term_otherwise = (
      _create_polynomial(1. / z, p1)
      / _create_polynomial(1. / z, q1)
      / z)
  x_for_small_p = first_term - second_term_small_p
  x_otherwise = first_term - second_term_otherwise

  x = tf.where(sanitized_mcp > np.exp(-2.), x_for_big_p,
               tf.where(z >= 8.0, x_for_small_p, x_otherwise))

  x = tf.where(p > 1. - np.exp(-2.), x, -x)
  infinity_scalar = tf.constant(np.inf, dtype=p.dtype)
  x_nan_replaced = tf.where(p <= 0.0, -infinity_scalar,
                            tf.where(p >= 1.0, infinity_scalar, x))
  return x_nan_replaced
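# Illustrative sketch (not part of the library): the rational approximations
# above target the standard-normal quantile function, for which
# scipy.special.ndtri is a convenient reference. Assumes scipy is available.
import numpy as np
from scipy.special import ndtri

p = np.array([1e-10, 0.1, 0.5, 0.9, 1. - 1e-10])
print(ndtri(p))   # symmetric about 0; ndtri(0.5) == 0.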
def log_ndtr(x, series_order=3, name="log_ndtr"):
  """Log Normal distribution function.

  For details of the Normal distribution function see `ndtr`.

  This function calculates `(log o ndtr)(x)` by either calling `log(ndtr(x))`
  or using an asymptotic series. Specifically:
  - For `x > upper_segment`, use the approximation `-ndtr(-x)` based on
    `log(1-x) ~= -x, x << 1`.
  - For `lower_segment < x <= upper_segment`, use the existing `ndtr` technique
    and take a log.
  - For `x <= lower_segment`, we use the series approximation of erf to compute
    the log CDF directly.

  The `lower_segment` is set based on the precision of the input:

  ```
  lower_segment = { -20,  x.dtype=float64
                  { -10,  x.dtype=float32
  upper_segment = {   8,  x.dtype=float64
                  {   5,  x.dtype=float32
  ```

  When `x < lower_segment`, the `ndtr` asymptotic series approximation is:

  ```
  ndtr(x) = scale * (1 + sum) + R_N
  scale   = exp(-0.5 x**2) / (-x sqrt(2 pi))
  sum     = Sum{(-1)^n (2n-1)!! / (x**2)^n, n=1:N}
  R_N     = O(exp(-0.5 x**2) (2N+1)!! / |x|^{2N+3})
  ```

  where `(2n-1)!! = (2n-1) (2n-3) (2n-5) ... (3) (1)` is a
  [double-factorial](https://en.wikipedia.org/wiki/Double_factorial).

  Args:
    x: `Tensor` of type `float32`, `float64`.
    series_order: Positive Python `integer`. Maximum depth to evaluate the
      asymptotic expansion. This is the `N` above.
    name: Python string. A name for the operation (default="log_ndtr").

  Returns:
    log_ndtr: `Tensor` with `dtype=x.dtype`.

  Raises:
    TypeError: if `x.dtype` is not handled.
    TypeError: if `series_order` is not a Python `integer`.
    ValueError: if `series_order` is not in `[0, 30]`.
  """
  if not isinstance(series_order, int):
    raise TypeError("series_order must be a Python integer.")
  if series_order < 0:
    raise ValueError("series_order must be non-negative.")
  if series_order > 30:
    raise ValueError("series_order must be <= 30.")

  with tf.name_scope(name):
    x = tf.convert_to_tensor(x, name="x")

    if dtype_util.base_equal(x.dtype, tf.float64):
      lower_segment = LOGNDTR_FLOAT64_LOWER
      upper_segment = LOGNDTR_FLOAT64_UPPER
    elif dtype_util.base_equal(x.dtype, tf.float32):
      lower_segment = LOGNDTR_FLOAT32_LOWER
      upper_segment = LOGNDTR_FLOAT32_UPPER
    else:
      raise TypeError("x.dtype=%s is not supported." % x.dtype)

    # The basic idea here was ported from:
    #   https://root.cern.ch/doc/v608/SpecFuncCephesInv_8cxx_source.html
    # We copy the main idea, with a few changes
    # * For x >> 1, and X ~ Normal(0, 1),
    #     Log[P[X < x]] = Log[1 - P[X < -x]] approx -P[X < -x],
    #   which extends the range of validity of this function.
    # * We use one fixed series_order for all of 'x', rather than adaptive.
    # * Our docstring properly reflects that this is an asymptotic series, not
    #   a Taylor series. We also provided a correct bound on the remainder.
    # * We need to use the max/min in the _log_ndtr_lower arg to avoid nan when
    #   x=0. This happens even though the branch is unchosen because when x=0
    #   the gradient of a select involves the calculation 1*dy+0*(-inf)=nan
    #   regardless of whether dy is finite. Note that the minimum is a NOP if
    #   the branch is chosen.
    return tf.where(
        x > upper_segment,
        -_ndtr(-x),  # log(1-x) ~= -x, x << 1
        tf.where(x > lower_segment,
                 tf.math.log(_ndtr(tf.maximum(x, lower_segment))),
                 _log_ndtr_lower(tf.minimum(x, lower_segment),
                                 series_order)))
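# Illustrative sketch (not part of the library): the lower-tail series branch
# above exists because log(ndtr(x)) underflows for very negative x.
# scipy.special.log_ndtr is a convenient reference. Assumes scipy is available.
import numpy as np
from scipy.stats import norm
from scipy.special import log_ndtr as scipy_log_ndtr

x = -40.
print(np.log(norm.cdf(x)))   # -inf: the CDF underflowed to 0 in float64
print(scipy_log_ndtr(x))     # approx -804.6, still finite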
def _get_safe_input(self, x, loc, scale):
  safe_value = 0.5 * scale + loc
  return tf.where(x < loc, safe_value, x)
def _observation_log_probs(self, observations, mask):
  """Compute and shape tensor of log probs associated with observations."""

  # Let E be the underlying event shape
  #     M the number of steps in the HMM
  #     N the number of states of the HMM
  #
  # Then the incoming observations have shape
  #
  #   observations : batch_o [M] E
  #
  # and the mask (if present) has shape
  #
  #   mask : batch_m [M]
  #
  # Let this HMM distribution have batch shape batch_d.
  # We need to broadcast all three of these batch shapes together
  # into the shape batch.
  #
  # We need to move the step dimension to the first dimension to make
  # them suitable for folding or scanning over.
  #
  # When we call `log_prob` for our observations we need to
  # do this for each state the observation could correspond to.
  # We do this by expanding the dimensions by 1 so we end up with:
  #
  #   observations : [M] batch [1] [E]
  #
  # After calling `log_prob` we get
  #
  #   observation_log_probs : [M] batch [N]
  #
  # We wish to use `mask` to select from this so we also
  # reshape and broadcast it up to shape
  #
  #   mask : [M] batch [N]

  observation_tensor_shape = tf.shape(observations)
  observation_batch_shape = observation_tensor_shape[
      :-1 - self._underlying_event_rank]
  observation_event_shape = observation_tensor_shape[
      -1 - self._underlying_event_rank:]

  if mask is not None:
    mask_tensor_shape = tf.shape(mask)
    mask_batch_shape = mask_tensor_shape[:-1]

  batch_shape = tf.broadcast_dynamic_shape(observation_batch_shape,
                                           self.batch_shape_tensor())

  if mask is not None:
    batch_shape = tf.broadcast_dynamic_shape(batch_shape,
                                             mask_batch_shape)
  observations = tf.broadcast_to(
      observations,
      tf.concat([batch_shape, observation_event_shape], axis=0))
  observation_rank = tf.rank(observations)
  underlying_event_rank = self._underlying_event_rank
  observations = distribution_util.move_dimension(
      observations, observation_rank - underlying_event_rank - 1, 0)
  observations = tf.expand_dims(
      observations, observation_rank - underlying_event_rank)
  observation_log_probs = self._observation_distribution.log_prob(
      observations)

  if mask is not None:
    mask = tf.broadcast_to(
        mask, tf.concat([batch_shape, [self._num_steps]], axis=0))
    mask = distribution_util.move_dimension(mask, -1, 0)
    observation_log_probs = tf.where(
        mask[..., tf.newaxis],
        tf.zeros_like(observation_log_probs),
        observation_log_probs)

  return observation_log_probs