Example #1
    def _sample_n(self, n, seed=None):
        low = tf.convert_to_tensor(self.low)
        high = tf.convert_to_tensor(self.high)
        peak = tf.convert_to_tensor(self.peak)

        stream = SeedStream(seed, salt='triangular')
        shape = tf.concat(
            [[n], self._batch_shape_tensor(low=low, high=high, peak=peak)],
            axis=0)
        samples = tf.random.uniform(shape=shape,
                                    dtype=self.dtype,
                                    seed=stream())
        # We use Inverse CDF sampling here. Because the CDF is a quadratic function,
        # we must use sqrts here.
        interval_length = high - low
        return tf.where(
            # Note the CDF on the left side of the peak is
            # (x - low) ** 2 / ((high - low) * (peak - low)).
            # If we plug in peak for x, we get that the CDF at the peak
            # is (peak - low) / (high - low). Because of this we decide
            # which part of the piecewise CDF we should use based on the cdf samples
            # we drew.
            samples < (peak - low) / interval_length,
            # Inverse of (x - low) ** 2 / ((high - low) * (peak - low)).
            low + tf.sqrt(samples * interval_length * (peak - low)),
            # Inverse of 1 - (high - x) ** 2 / ((high - low) * (high - peak))
            high - tf.sqrt((1. - samples) * interval_length * (high - peak)))
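The two branches above invert the piecewise-quadratic triangular CDF. A minimal NumPy sketch of the same transform (the helper name below is ours, not part of the example) can be used to sanity-check the formulas:

import numpy as np

def triangular_icdf(u, low, high, peak):
    # Inverse CDF of Triangular(low, peak, high); u is uniform in [0, 1).
    interval_length = high - low
    left = low + np.sqrt(u * interval_length * (peak - low))
    right = high - np.sqrt((1. - u) * interval_length * (high - peak))
    return np.where(u < (peak - low) / interval_length, left, right)

# e.g. triangular_icdf(np.random.uniform(size=5), low=0., high=4., peak=1.)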
Example #2
 def _stddev(self):
     if distribution_util.is_diagonal_scale(self.scale):
         return tf.abs(self.scale.diag_part())
     elif (isinstance(self.scale, tf.linalg.LinearOperatorLowRankUpdate)
           and self.scale.is_self_adjoint):
         return tf.sqrt(
             tf.linalg.diag_part(self.scale.matmul(self.scale.to_dense())))
     else:
         return tf.sqrt(
             tf.linalg.diag_part(
                 self.scale.matmul(self.scale.to_dense(),
                                   adjoint_arg=True)))
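For the non-diagonal branches, the identity being used is that an MVN with scale matrix S has covariance S @ S^T, so the marginal standard deviations are the square roots of that product's diagonal. A small NumPy illustration (the matrix is made up):

import numpy as np

S = np.array([[2., 0.],
              [1., 3.]])
stddev = np.sqrt(np.diag(S @ S.T))  # ==> [2., sqrt(10.)]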
Example #3
 def _mean(self):
     # Because df is a scalar, we need to expand dimensions to match
     # scale_operator. We use ellipses notation (...) to select all dimensions
     # and add two dimensions to the end.
     df = self.df[..., tf.newaxis, tf.newaxis]
     if self.input_output_cholesky:
         return tf.sqrt(df) * self.scale_operator.to_dense()
     return df * self._square_scale_operator()
Example #4
        def body(m, pchol, perm, matrix_diag):
            """Body of a single `tf.while_loop` iteration."""
            # Here is roughly a numpy, non-batched version of what's going to happen.
            # (See also Algorithm 1 of Harbrecht et al.)
            # 1: maxi = np.argmax(matrix_diag[perm[m:]]) + m
            # 2: maxval = matrix_diag[perm][maxi]
            # 3: perm[m], perm[maxi] = perm[maxi], perm[m]
            # 4: row = matrix[perm[m]][perm[m + 1:]]
            # 5: row -= np.sum(pchol[:m][perm[m + 1:]] * pchol[:m][perm[m]], axis=-2)
            # 6: pivot = np.sqrt(maxval); row /= pivot
            # 7: row = np.concatenate([[[pivot]], row], -1)
            # 8: matrix_diag[perm[m:]] -= row**2
            # 9: pchol[m, perm[m:]] = row

            # Find the maximal position of the (remaining) permuted diagonal.
            # Steps 1, 2 above.
            permuted_diag = batch_gather(matrix_diag, perm[..., m:])
            maxi = tf.argmax(permuted_diag, axis=-1,
                             output_type=tf.int64)[..., tf.newaxis]
            maxval = batch_gather(permuted_diag, maxi)
            maxi = maxi + m
            maxval = maxval[..., 0]
            # Update perm: Swap perm[...,m] with perm[...,maxi]. Step 3 above.
            perm = _swap_m_with_i(perm, m, maxi)
            # Step 4.
            row = batch_gather(matrix, perm[..., m:m + 1], axis=-2)
            row = batch_gather(row, perm[..., m + 1:])
            # Step 5.
            prev_rows = pchol[..., :m, :]
            prev_rows_perm_m_onward = batch_gather(prev_rows, perm[...,
                                                                   m + 1:])
            prev_rows_pivot_col = batch_gather(prev_rows, perm[..., m:m + 1])
            row -= tf.reduce_sum(prev_rows_perm_m_onward * prev_rows_pivot_col,
                                 axis=-2)[..., tf.newaxis, :]
            # Step 6.
            pivot = tf.sqrt(maxval)[..., tf.newaxis, tf.newaxis]
            # Step 7.
            row = tf.concat([pivot, row / pivot], axis=-1)
            # TODO(b/130899118): Pad grad fails with int64 paddings.
            # Step 8.
            paddings = tf.concat([
                tf.zeros([prefer_static.rank(pchol) - 1, 2], dtype=tf.int32),
                [[tf.cast(m, tf.int32), 0]]
            ],
                                 axis=0)
            diag_update = tf.pad(row**2, paddings=paddings)[..., 0, :]
            reverse_perm = _invert_permutation(perm)
            matrix_diag -= batch_gather(diag_update, reverse_perm)
            # Step 9.
            row = tf.pad(row, paddings=paddings)
            # TODO(bjp): Defer the reverse permutation all-at-once at the end?
            row = batch_gather(row, reverse_perm)
            pchol_shape = pchol.shape
            pchol = tf.concat([pchol[..., :m, :], row, pchol[..., m + 1:, :]],
                              axis=-2)
            tensorshape_util.set_shape(pchol, pchol_shape)
            return m + 1, pchol, perm, matrix_diag
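The numbered comments translate almost line for line into a non-batched NumPy routine. Below is a sketch under that reading (the function name and the fixed-rank loop are ours); it runs steps 1-9 for max_rank iterations and returns pchol with matrix ≈ pchol.T @ pchol:

import numpy as np

def pivoted_cholesky_np(matrix, max_rank):
    n = matrix.shape[0]
    matrix_diag = np.diag(matrix).copy()
    perm = np.arange(n)
    pchol = np.zeros((max_rank, n))
    for m in range(max_rank):
        # Steps 1-3: largest remaining permuted diagonal entry, swapped into slot m.
        maxi = np.argmax(matrix_diag[perm[m:]]) + m
        maxval = matrix_diag[perm][maxi]
        perm[m], perm[maxi] = perm[maxi], perm[m]
        # Steps 4-5: new row minus the contribution of the previous rows.
        row = matrix[perm[m]][perm[m + 1:]]
        row = row - np.sum(
            pchol[:m, perm[m + 1:]] * pchol[:m, perm[m]][..., np.newaxis],
            axis=-2)
        # Steps 6-7: scale by the pivot and prepend it.
        pivot = np.sqrt(maxval)
        row = np.concatenate([[pivot], row / pivot])
        # Steps 8-9: update the residual diagonal and store the row.
        matrix_diag[perm[m:]] -= row**2
        pchol[m, perm[m:]] = row
    return pchol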
Example #5
 def _variance(self):
     # Because df is a scalar, we need to expand dimensions to match
     # scale_operator. We use ellipses notation (...) to select all dimensions
     # and add two dimensions to the end.
     df = self.df[..., tf.newaxis, tf.newaxis]
     x = tf.sqrt(df) * self._square_scale_operator()
     d = tf.expand_dims(tf.linalg.diag_part(x), -1)
     v = tf.square(x) + tf.matmul(d, d, adjoint_b=True)
     return v
Example #6
def normal_conjugates_known_scale_posterior(prior, scale, s, n):
    """Posterior Normal distribution with conjugate prior on the mean.

  This model assumes that `n` observations (with sum `s`) come from a
  Normal with unknown mean `loc` (described by the Normal `prior`)
  and known variance `scale**2`. The "known scale posterior" is
  the distribution of the unknown `loc`.

  Accepts a prior Normal distribution object, having parameters
  `loc0` and `scale0`, as well as known `scale` values of the predictive
  distribution(s) (also assumed Normal),
  and statistical estimates `s` (the sum(s) of the observations) and
  `n` (the number(s) of observations).

  Returns a posterior (also Normal) distribution object, with parameters
  `(loc', scale'**2)`, where:

  ```
  mu ~ N(mu', sigma'**2)
  sigma'**2 = 1/(1/sigma0**2 + n/sigma**2),
  mu' = (mu0/sigma0**2 + s/sigma**2) * sigma'**2.
  ```

  Distribution parameters from `prior`, as well as `scale`, `s`, and `n`,
  will broadcast in the case of multidimensional sets of parameters.

  Args:
    prior: `Normal` object of type `dtype`:
      the prior distribution having parameters `(loc0, scale0)`.
    scale: tensor of type `dtype`, taking values `scale > 0`.
      The known stddev parameter(s).
    s: Tensor of type `dtype`. The sum(s) of observations.
    n: Tensor of type `int`. The number(s) of observations.

  Returns:
    A new Normal posterior distribution object for the unknown observation
    mean `loc`.

  Raises:
    TypeError: if dtype of `s` does not match `dtype`, or `prior` is not a
      Normal object.
  """
    if not isinstance(prior, normal.Normal):
        raise TypeError("Expected prior to be an instance of type Normal")

    if s.dtype != prior.dtype:
        raise TypeError(
            "Observation sum s.dtype does not match prior dtype: %s vs. %s" %
            (s.dtype, prior.dtype))

    n = tf.cast(n, prior.dtype)
    scale0_2 = tf.square(prior.scale)
    scale_2 = tf.square(scale)
    scalep_2 = 1.0 / (1 / scale0_2 + n / scale_2)
    return normal.Normal(loc=(prior.loc / scale0_2 + s / scale_2) * scalep_2,
                         scale=tf.sqrt(scalep_2))
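A worked instance of the update formulas above (the numbers are illustrative): with prior loc0 = 0, scale0 = 2, known scale = 1, and n = 4 observations summing to s = 8, the posterior variance is 1 / (1/4 + 4) = 4/17 and the posterior mean is (0 + 8) * 4/17 = 32/17 ≈ 1.88:

loc0, scale0, scale, s, n = 0., 2., 1., 8., 4.
scalep_2 = 1. / (1. / scale0**2 + n / scale**2)      # posterior variance, 4/17
locp = (loc0 / scale0**2 + s / scale**2) * scalep_2  # posterior mean, 32/17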
Example #7
  def _variance_scale_term(self, total_count=None, concentration=None):
    """Helper to `_covariance` and `_variance` which computes a shared scale."""
    if total_count is None:
      total_count = tf.convert_to_tensor(self._total_count)
    if concentration is None:
      concentration = tf.convert_to_tensor(self._concentration)

    # Expand back the last dim so the shape of _variance_scale_term matches the
    # shape of self.concentration.
    c0 = self._compute_total_concentration(concentration)[..., tf.newaxis]
    return tf.sqrt((1. + c0 / total_count[..., tf.newaxis]) / (1. + c0))
Example #8
def _sqrtx2p1(x):
    """Implementation of `sqrt(1 + x**2)` which is stable despite large `x`."""
    sqrt_eps = np.sqrt(np.finfo(dtype_util.as_numpy_dtype(x.dtype)).eps)
    return tf.where(
        tf.abs(x) * sqrt_eps <= 1.,
        tf.sqrt(x**2. + 1.),
        # For large x, calculating x**2 can overflow. This can be alleviated by
        # considering:
        # sqrt(1 + x**2)
        # = exp(0.5 log(1 + x**2))
        # = exp(0.5 log(x**2 * (1 + x**-2)))
        # = exp(log(x) + 0.5 * log(1 + x**-2))
        # = |x| * exp(0.5 log(1 + x**-2))
        # = |x| * sqrt(1 + x**-2)
        # We omit the last factor, sqrt(1 + x**-2), in this approximation.
        # When |x| > 1 / sqrt(machine epsilon), that factor rounds to 1, since
        # sqrt(1 + x**-2) == 1 to machine precision. The same holds for the
        # gradient and higher-order gradients: the first derivative of
        # sqrt(1 + x**-2) is -x**-3 / sqrt(1 + x**-2), which is O(x**-3), and
        # the nth-order derivative is O(x**-(n + 2)). This makes any gradient
        # terms that contain derivatives of sqrt(1 + x**-2) vanish.
        tf.abs(x))
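The effect of the large-`x` branch is easy to see in float32, where squaring overflows long before the true result does. A small NumPy illustration (the value is arbitrary):

import numpy as np

x = np.float32(1e20)
naive = np.sqrt(x**2 + np.float32(1.))  # inf: x**2 overflows float32
stable = np.abs(x)                      # 1e20; sqrt(1 + x**-2) == 1 here anyway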
Example #9
def _undo_batch_normalization(x,
                              mean,
                              variance,
                              offset,
                              scale,
                              variance_epsilon,
                              name=None):
  r"""Inverse of tf.nn.batch_normalization.

  Args:
    x: Input `Tensor` of arbitrary dimensionality.
    mean: A mean `Tensor`.
    variance: A variance `Tensor`.
    offset: An offset `Tensor`, often denoted `beta` in equations, or
      None. If present, will be added to the normalized tensor.
    scale: A scale `Tensor`, often denoted `gamma` in equations, or
      `None`. If present, the scale is applied to the normalized tensor.
    variance_epsilon: A small `float` added to the minibatch `variance` to
      prevent dividing by zero.
    name: A name for this operation (optional).

  Returns:
    batch_unnormalized: The de-normalized, de-scaled, de-offset `Tensor`.
  """
  with tf.name_scope(name or 'undo_batch_normalization'):
    # inv = tf.rsqrt(variance + variance_epsilon)
    # if scale is not None:
    #   inv *= scale
    # return x * inv + (
    #     offset - mean * inv if offset is not None else -mean * inv)
    rescale = tf.sqrt(variance + variance_epsilon)
    if scale is not None:
      rescale = rescale / scale
    batch_unnormalized = x * rescale + (
        mean - offset * rescale if offset is not None else mean)
    return batch_unnormalized
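A NumPy round trip makes it easy to check that this undoes tf.nn.batch_normalization: normalize with some made-up statistics, then apply the inverse above (values are illustrative):

import numpy as np

x = np.array([1., 2., 3.])
mean, variance, offset, scale, eps = 2., 4., 0.5, 3., 1e-3

inv = scale / np.sqrt(variance + eps)
y = (x - mean) * inv + offset                 # forward batch normalization

rescale = np.sqrt(variance + eps) / scale     # same algebra as the code above
x_back = y * rescale + (mean - offset * rescale)
np.testing.assert_allclose(x_back, x)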
Example #10
 def _inverse(self, y):
     with tf.control_dependencies(self._assertions(y)):
         return tf.sqrt(y)
Example #11
 def _mode(self):
     s = self.df - self.dimension - 1.
     s = tf.where(s < 0., dtype_util.as_numpy_dtype(s.dtype)(np.nan), s)
     if self.input_output_cholesky:
         return tf.sqrt(s) * self.scale_operator.to_dense()
     return s * self._square_scale_operator()
Example #12
    def _sample_n(self, n, seed):
        batch_shape = self.batch_shape_tensor()
        event_shape = self.event_shape_tensor()
        batch_ndims = tf.shape(batch_shape)[0]

        ndims = batch_ndims + 3  # sample_ndims=1, event_ndims=2
        shape = tf.concat([[n], batch_shape, event_shape], 0)
        stream = SeedStream(seed, salt="Wishart")

        # Complexity: O(nbk**2)
        x = tf.random.normal(shape=shape,
                             mean=0.,
                             stddev=1.,
                             dtype=self.dtype,
                             seed=stream())

        # Complexity: O(nbk)
        # This parametrization is equivalent to Chi2, i.e.,
        # ChiSquared(k) == Gamma(alpha=k/2, beta=1/2)
        expanded_df = self.df * tf.ones(
            self.scale_operator.batch_shape_tensor(),
            dtype=dtype_util.base_dtype(self.df.dtype))

        g = tf.random.gamma(shape=[n],
                            alpha=self._multi_gamma_sequence(
                                0.5 * expanded_df, self.dimension),
                            beta=0.5,
                            dtype=self.dtype,
                            seed=stream())

        # Complexity: O(nbk**2)
        x = tf.linalg.band_part(x, -1, 0)  # Tri-lower.

        # Complexity: O(nbk)
        x = tf.linalg.set_diag(x, tf.sqrt(g))

        # Make batch-op ready.
        # Complexity: O(nbk**2)
        perm = tf.concat([tf.range(1, ndims), [0]], 0)
        x = tf.transpose(a=x, perm=perm)
        shape = tf.concat(
            [batch_shape, [event_shape[0]], [event_shape[1] * n]], 0)
        x = tf.reshape(x, shape)

        # Complexity: O(nbM) where M is the complexity of the operator solving a
        # vector system. For LinearOperatorLowerTriangular, each matmul is O(k^3) so
        # this step has complexity O(nbk^3).
        x = self.scale_operator.matmul(x)

        # Undo make batch-op ready.
        # Complexity: O(nbk**2)
        shape = tf.concat([batch_shape, event_shape, [n]], 0)
        x = tf.reshape(x, shape)
        perm = tf.concat([[ndims - 1], tf.range(0, ndims - 1)], 0)
        x = tf.transpose(a=x, perm=perm)

        if not self.input_output_cholesky:
            # Complexity: O(nbk**3)
            x = tf.matmul(x, x, adjoint_b=True)

        return x
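This is the Bartlett decomposition: a lower-triangular factor with standard normals below the diagonal and square roots of chi-squared(df - i) variates on it, pushed through the scale operator. A non-batched NumPy sketch under that reading (the helper is ours, not the class API):

import numpy as np

def sample_wishart_np(df, scale_tril, rng=None):
    rng = np.random.default_rng() if rng is None else rng
    k = scale_tril.shape[-1]
    a = np.tril(rng.normal(size=(k, k)), k=-1)             # strictly lower N(0, 1)
    chi2 = rng.gamma(shape=0.5 * (df - np.arange(k)), scale=2.)
    a[np.diag_indices(k)] = np.sqrt(chi2)                   # sqrt(chi2(df - i)) diagonal
    factor = scale_tril @ a
    return factor @ factor.T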
Example #13
 def _stddev(self):
   return tf.sqrt(self.concentration) / self.rate
Example #14
 def _stddev(self):
   samples = tf.convert_to_tensor(self._samples)
   axis = self._samples_axis
   r = samples - tf.expand_dims(self._mean(samples), axis=axis)
   var = tf.reduce_mean(tf.square(r), axis=axis)
   return tf.sqrt(var)
Example #15
def _ndtri(p):
  """Implements ndtri core logic."""

  # Constants used in piece-wise rational approximations. Taken from the cephes
  # library:
  # https://root.cern.ch/doc/v608/SpecFuncCephesInv_8cxx_source.html
  p0 = list(reversed([-5.99633501014107895267E1,
                      9.80010754185999661536E1,
                      -5.66762857469070293439E1,
                      1.39312609387279679503E1,
                      -1.23916583867381258016E0]))
  q0 = list(reversed([1.0,
                      1.95448858338141759834E0,
                      4.67627912898881538453E0,
                      8.63602421390890590575E1,
                      -2.25462687854119370527E2,
                      2.00260212380060660359E2,
                      -8.20372256168333339912E1,
                      1.59056225126211695515E1,
                      -1.18331621121330003142E0]))
  p1 = list(reversed([4.05544892305962419923E0,
                      3.15251094599893866154E1,
                      5.71628192246421288162E1,
                      4.40805073893200834700E1,
                      1.46849561928858024014E1,
                      2.18663306850790267539E0,
                      -1.40256079171354495875E-1,
                      -3.50424626827848203418E-2,
                      -8.57456785154685413611E-4]))
  q1 = list(reversed([1.0,
                      1.57799883256466749731E1,
                      4.53907635128879210584E1,
                      4.13172038254672030440E1,
                      1.50425385692907503408E1,
                      2.50464946208309415979E0,
                      -1.42182922854787788574E-1,
                      -3.80806407691578277194E-2,
                      -9.33259480895457427372E-4]))
  p2 = list(reversed([3.23774891776946035970E0,
                      6.91522889068984211695E0,
                      3.93881025292474443415E0,
                      1.33303460815807542389E0,
                      2.01485389549179081538E-1,
                      1.23716634817820021358E-2,
                      3.01581553508235416007E-4,
                      2.65806974686737550832E-6,
                      6.23974539184983293730E-9]))
  q2 = list(reversed([1.0,
                      6.02427039364742014255E0,
                      3.67983563856160859403E0,
                      1.37702099489081330271E0,
                      2.16236993594496635890E-1,
                      1.34204006088543189037E-2,
                      3.28014464682127739104E-4,
                      2.89247864745380683936E-6,
                      6.79019408009981274425E-9]))

  def _create_polynomial(var, coeffs):
    """Compute n_th order polynomial via Horner's method."""
    coeffs = np.array(coeffs, dtype_util.as_numpy_dtype(var.dtype))
    if not coeffs.size:
      return tf.zeros_like(var)
    return coeffs[0] + _create_polynomial(var, coeffs[1:]) * var

  maybe_complement_p = tf.where(p > -np.expm1(-2.), 1. - p, p)
  # Write in an arbitrary value in place of 0 for p since 0 will cause NaNs
  # later on. The result from the computation when p == 0 is not used so any
  # number that doesn't result in NaNs is fine.
  sanitized_mcp = tf.where(
      maybe_complement_p <= 0.,
      dtype_util.as_numpy_dtype(p.dtype)(0.5), maybe_complement_p)

  # Compute x for p > exp(-2): x/sqrt(2pi) = w + w**3 P0(w**2)/Q0(w**2).
  w = sanitized_mcp - 0.5
  ww = w ** 2
  x_for_big_p = w + w * ww * (_create_polynomial(ww, p0)
                              / _create_polynomial(ww, q0))
  x_for_big_p *= -np.sqrt(2. * np.pi)

  # Compute x for p <= exp(-2): x = z - log(z)/z - (1/z) P(1/z) / Q(1/z),
  # where z = sqrt(-2. * log(p)), and P/Q are chosen between two different
  # arrays based on whether p < exp(-32).
  z = tf.sqrt(-2. * tf.math.log(sanitized_mcp))
  first_term = z - tf.math.log(z) / z
  second_term_small_p = (
      _create_polynomial(1. / z, p2) /
      _create_polynomial(1. / z, q2) / z)
  second_term_otherwise = (
      _create_polynomial(1. / z, p1) /
      _create_polynomial(1. / z, q1) / z)
  x_for_small_p = first_term - second_term_small_p
  x_otherwise = first_term - second_term_otherwise

  x = tf.where(sanitized_mcp > np.exp(-2.), x_for_big_p,
               tf.where(z >= 8.0, x_for_small_p, x_otherwise))

  x = tf.where(p > 1. - np.exp(-2.), x, -x)
  infinity_scalar = tf.constant(np.inf, dtype=p.dtype)
  x_nan_replaced = tf.where(p <= 0.0, -infinity_scalar,
                            tf.where(p >= 1.0, infinity_scalar, x))
  return x_nan_replaced
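Note that the cephes coefficient lists are reversed so that the constant term comes first; the recursive Horner helper is then equivalent to np.polyval on the original highest-degree-first ordering. A tiny check with made-up coefficients:

import numpy as np

coeffs = [3., -1., 0.5]                        # 3 - x + 0.5 * x**2
x = 0.7
horner = coeffs[0] + (coeffs[1] + coeffs[2] * x) * x
assert np.isclose(horner, np.polyval(coeffs[::-1], x))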
Example #16
def fill_triangular(x, upper=False, name=None):
    """Creates a (batch of) triangular matrix from a vector of inputs.

  Created matrix can be lower- or upper-triangular. (It is more efficient to
  create the matrix as upper or lower, rather than transpose.)

  Triangular matrix elements are filled in a clockwise spiral. See example,
  below.

  If `x.shape` is `[b1, b2, ..., bB, d]` then the output shape is
  `[b1, b2, ..., bB, n, n]` where `n` is such that `d = n(n+1)/2`, i.e.,
  `n = int(np.sqrt(0.25 + 2. * d) - 0.5)`.

  Example:

  ```python
  fill_triangular([1, 2, 3, 4, 5, 6])
  # ==> [[4, 0, 0],
  #      [6, 5, 0],
  #      [3, 2, 1]]

  fill_triangular([1, 2, 3, 4, 5, 6], upper=True)
  # ==> [[1, 2, 3],
  #      [0, 5, 6],
  #      [0, 0, 4]]
  ```

  The key trick is to create an upper triangular matrix by concatenating `x`
  and a tail of itself, then reshaping.

  Suppose that we are filling the upper triangle of an `n`-by-`n` matrix `M`
  from a vector `x`. The matrix `M` contains n**2 entries total. The vector `x`
  contains `n * (n+1) / 2` entries. For concreteness, we'll consider `n = 5`
  (so `x` has `15` entries and `M` has `25`). We'll concatenate `x` and `x` with
  the first (`n = 5`) elements removed and reversed:

  ```python
  x = np.arange(15) + 1
  xc = np.concatenate([x, x[5:][::-1]])
  # ==> array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 14, 13,
  #            12, 11, 10, 9, 8, 7, 6])

  # (We add one to the arange result to disambiguate the zeros below the
  # diagonal of our upper-triangular matrix from the first entry in `x`.)

  # Now, lay this out as a matrix by reshaping:
  y = np.reshape(xc, [5, 5])
  # ==> array([[ 1,  2,  3,  4,  5],
  #            [ 6,  7,  8,  9, 10],
  #            [11, 12, 13, 14, 15],
  #            [15, 14, 13, 12, 11],
  #            [10,  9,  8,  7,  6]])

  # Finally, zero the elements below the diagonal:
  y = np.triu(y, k=0)
  # ==> array([[ 1,  2,  3,  4,  5],
  #            [ 0,  7,  8,  9, 10],
  #            [ 0,  0, 13, 14, 15],
  #            [ 0,  0,  0, 12, 11],
  #            [ 0,  0,  0,  0,  6]])
  ```

  From this example we see that the resulting matrix is upper-triangular, and
  contains all the entries of `x`, as desired. The rest is details:
  - If `n` is even, `x` doesn't exactly fill a whole number of rows (it fills
    `n / 2` rows and half of an additional row), but the whole scheme still
    works.
  - If we want a lower triangular matrix instead of an upper triangular,
    we remove the first `n` elements from `x` rather than from the reversed
    `x`.

  For additional comparisons, a pure numpy version of this function can be found
  in `distribution_util_test.py`, function `_fill_triangular`.

  Args:
    x: `Tensor` representing lower (or upper) triangular elements.
    upper: Python `bool` representing whether output matrix should be upper
      triangular (`True`) or lower triangular (`False`, default).
    name: Python `str`. The name to give this op.

  Returns:
    tril: `Tensor` with lower (or upper) triangular elements filled from `x`.

  Raises:
    ValueError: if `x` cannot be mapped to a triangular matrix.
  """

    with tf.name_scope(name or 'fill_triangular'):
        x = tf.convert_to_tensor(x, name='x')
        m = tf.compat.dimension_value(
            tensorshape_util.with_rank_at_least(x.shape, 1)[-1])
        if m is not None:
            # Formula derived by solving for n: m = n(n+1)/2.
            m = np.int32(m)
            n = np.sqrt(0.25 + 2. * m) - 0.5
            if n != np.floor(n):
                raise ValueError(
                    'Input right-most shape ({}) does not '
                    'correspond to a triangular matrix.'.format(m))
            n = np.int32(n)
            static_final_shape = tensorshape_util.concatenate(
                x.shape[:-1], [n, n])
        else:
            m = tf.shape(x)[-1]
            # For derivation, see above. Casting automatically lops off the 0.5, so we
            # omit it.  We don't validate n is an integer because this has
            # graph-execution cost; an error will be thrown from the reshape, below.
            n = tf.cast(tf.sqrt(0.25 + tf.cast(2 * m, dtype=tf.float32)),
                        dtype=tf.int32)
            static_final_shape = tensorshape_util.concatenate(
                tensorshape_util.with_rank_at_least(x.shape, 1)[:-1],
                [None, None])

        # Try it out in numpy:
        #  n = 3
        #  x = np.arange(n * (n + 1) / 2)
        #  m = x.shape[0]
        #  n = np.int32(np.sqrt(.25 + 2 * m) - .5)
        #  x_tail = x[(m - (n**2 - m)):]
        #  np.concatenate([x_tail, x[::-1]], 0).reshape(n, n)  # lower
        #  # ==> array([[3, 4, 5],
        #               [5, 4, 3],
        #               [2, 1, 0]])
        #  np.concatenate([x, x_tail[::-1]], 0).reshape(n, n)  # upper
        #  # ==> array([[0, 1, 2],
        #               [3, 4, 5],
        #               [5, 4, 3]])
        #
        # Note that we can't simply do `x[..., -(n**2 - m):]` because this doesn't
        # correctly handle `m == n == 1`. Hence, we do nonnegative indexing.
        # Furthermore observe that:
        #   m - (n**2 - m)
        #   = n**2 / 2 + n / 2 - (n**2 - n**2 / 2 + n / 2)
        #   = 2 (n**2 / 2 + n / 2) - n**2
        #   = n**2 + n - n**2
        #   = n
        ndims = prefer_static.rank(x)
        if upper:
            x_list = [x, tf.reverse(x[..., n:], axis=[ndims - 1])]
        else:
            x_list = [x[..., n:], tf.reverse(x, axis=[ndims - 1])]
        new_shape = (tensorshape_util.as_list(static_final_shape)
                     if tensorshape_util.is_fully_defined(static_final_shape)
                     else tf.concat([tf.shape(x)[:-1], [n, n]], axis=0))
        x = tf.reshape(tf.concat(x_list, axis=-1), new_shape)
        x = tf.linalg.band_part(x,
                                num_lower=(0 if upper else -1),
                                num_upper=(-1 if upper else 0))
        tensorshape_util.set_shape(x, static_final_shape)
        return x
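The docstring points at a pure NumPy version in distribution_util_test.py; a non-batched sketch along the same lines (not the exact test helper) reproduces the concatenate-reverse-reshape trick:

import numpy as np

def fill_triangular_np(x, upper=False):
    m = x.shape[0]
    n = np.int32(np.sqrt(0.25 + 2. * m) - 0.5)
    if upper:
        xc = np.concatenate([x, x[n:][::-1]])
    else:
        xc = np.concatenate([x[n:], x[::-1]])
    y = np.reshape(xc, (n, n))
    return np.triu(y) if upper else np.tril(y)

# fill_triangular_np(np.array([1, 2, 3, 4, 5, 6]))
# ==> [[4, 0, 0], [6, 5, 0], [3, 2, 1]]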
Example #17
    def _sample_n(self, n, seed=None):
        seed = SeedStream(seed, salt='vom_mises_fisher')
        # The sampling strategy relies on the fact that vMF variates are symmetric
        # about the mean direction. Accordingly, if we have a sampling strategy for
        # the away-from-mean angle, then we can uniformly sample the remaining
        # dimensions on the S^{dim-2} sphere for free, and rotate these samples from a
        # (1, 0, 0, ..., 0)-mode distribution into the target orientation.
        #
        # This is easy to imagine on the 1-sphere (S^1; in 2-D space): sample a
        # von-Mises distributed `x` value in [-1, 1], then uniformly select what
        # amounts to an "up" or "down" additional degree of freedom after unit
        # normalizing, followed by a final rotation to the desired mean direction
        # from a basis of (1, 0).
        #
        # On S^2 (in 3-D), selecting a vMF `x` identifies a circle in `yz` on the
        # unit sphere over which the distribution is uniform, in particular the
        # circle where x = \hat{x} intersects the unit sphere. We pick a point on
        # that circle, then rotate to the desired mean direction from a basis of
        # (1, 0, 0).
        event_dim = (tf.compat.dimension_value(self.event_shape[0])
                     or self._event_shape_tensor()[0])

        sample_batch_shape = tf.concat([[n], self._batch_shape_tensor()],
                                       axis=0)
        dim = tf.cast(event_dim - 1, self.dtype)
        if event_dim == 3:
            samples_dim0 = self._sample_3d(n, seed=seed)
        else:
            # Wood'94 provides a rejection algorithm to sample the x coordinate.
            # Wood'94 definition of b:
            # b = (-2 * kappa + tf.sqrt(4 * kappa**2 + dim**2)) / dim
            # https://stats.stackexchange.com/questions/156729 suggests:
            b = dim / (2 * self.concentration +
                       tf.sqrt(4 * self.concentration**2 + dim**2))
            # TODO(bjp): Integrate any useful numerical tricks from hyperspherical VAE
            #     https://github.com/nicola-decao/s-vae-tf/
            x = (1 - b) / (1 + b)
            c = self.concentration * x + dim * tf.math.log1p(-x**2)
            beta = beta_lib.Beta(dim / 2, dim / 2)

            def cond_fn(w, should_continue):
                del w
                return tf.reduce_any(should_continue)

            def body_fn(w, should_continue):
                z = beta.sample(sample_shape=sample_batch_shape, seed=seed())
                # set_shape needed here because of b/139013403
                z.set_shape(w.shape)
                w = tf.where(should_continue,
                             (1 - (1 + b) * z) / (1 - (1 - b) * z), w)
                w = tf.debugging.check_numerics(w, 'w')
                unif = tf.random.uniform(sample_batch_shape,
                                         seed=seed(),
                                         dtype=self.dtype)
                # set_shape needed here because of b/139013403
                unif.set_shape(w.shape)
                should_continue = tf.logical_and(
                    should_continue,
                    self.concentration * w + dim * tf.math.log1p(-x * w) - c <
                    tf.math.log(unif))
                return w, should_continue

            w = tf.zeros(sample_batch_shape, dtype=self.dtype)
            should_continue = tf.ones(sample_batch_shape, dtype=tf.bool)
            samples_dim0 = tf.while_loop(cond=cond_fn,
                                         body=body_fn,
                                         loop_vars=(w, should_continue))[0]
            samples_dim0 = samples_dim0[..., tf.newaxis]
        if not self._allow_nan_stats:
            # Verify samples are w/in -1, 1, with useful error output tensors (top
            # value rather than all values).
            with tf.control_dependencies([
                    assert_util.assert_less_equal(
                        samples_dim0,
                        dtype_util.as_numpy_dtype(self.dtype)(1.01),
                        data=[
                            tf.math.top_k(tf.reshape(samples_dim0, [-1]))[0]
                        ]),
                    assert_util.assert_greater_equal(
                        samples_dim0,
                        dtype_util.as_numpy_dtype(self.dtype)(-1.01),
                        data=[
                            -tf.math.top_k(tf.reshape(-samples_dim0, [-1]))[0]
                        ])
            ]):
                samples_dim0 = tf.identity(samples_dim0)
        samples_otherdims_shape = tf.concat(
            [sample_batch_shape, [event_dim - 1]], axis=0)
        unit_otherdims = tf.math.l2_normalize(tf.random.normal(
            samples_otherdims_shape, seed=seed(), dtype=self.dtype),
                                              axis=-1)
        samples = tf.concat(
            [
                samples_dim0,  # we must avoid sqrt(1 - (>1)**2)
                tf.sqrt(tf.maximum(1 - samples_dim0**2, 0.)) * unit_otherdims
            ],
            axis=-1)
        samples = tf.math.l2_normalize(samples, axis=-1)
        if not self._allow_nan_stats:
            samples = tf.debugging.check_numerics(samples, 'samples')

        # Runtime assert that samples are unit length.
        if not self._allow_nan_stats:
            worst, idx = tf.math.top_k(
                tf.reshape(tf.abs(1 - tf.linalg.norm(samples, axis=-1)), [-1]))
            with tf.control_dependencies([
                    assert_util.assert_near(
                        dtype_util.as_numpy_dtype(self.dtype)(0),
                        worst,
                        data=[
                            worst, idx,
                            tf.gather(tf.reshape(samples, [-1, event_dim]),
                                      idx)
                        ],
                        atol=1e-4,
                        summarize=100)
            ]):
                samples = tf.identity(samples)
        # The samples generated are symmetric around a mode at (1, 0, 0, ...., 0).
        # Now, we move the mode to `self.mean_direction` using a rotation matrix.
        if not self._allow_nan_stats:
            # Assert that the basis vector rotates to the mean direction, as expected.
            basis = tf.cast(
                tf.concat([[1.], tf.zeros([event_dim - 1])], axis=0),
                self.dtype)
            with tf.control_dependencies([
                    assert_util.assert_less(
                        tf.linalg.norm(self._rotate(basis) -
                                       self.mean_direction,
                                       axis=-1),
                        dtype_util.as_numpy_dtype(self.dtype)(1e-5))
            ]):
                return self._rotate(samples)
        return self._rotate(samples)
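The rejection loop above is Wood's (1994) sampler for the away-from-mean coordinate w. A non-batched NumPy sketch of just that step (the helper name and looping strategy are ours):

import numpy as np

def sample_vmf_w(kappa, event_dim, n, rng=None):
    rng = np.random.default_rng() if rng is None else rng
    dim = event_dim - 1.
    b = dim / (2. * kappa + np.sqrt(4. * kappa**2 + dim**2))
    x = (1. - b) / (1. + b)
    c = kappa * x + dim * np.log1p(-x**2)
    out = np.empty(n)
    filled = 0
    while filled < n:
        z = rng.beta(dim / 2., dim / 2., size=n - filled)
        w = (1. - (1. + b) * z) / (1. - (1. - b) * z)
        u = rng.uniform(size=n - filled)
        accept = kappa * w + dim * np.log1p(-x * w) - c >= np.log(u)
        num_accepted = int(accept.sum())
        out[filled:filled + num_accepted] = w[accept]
        filled += num_accepted
    return out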
Example #18
    def _sample_n(self, num_samples, seed=None, name=None):
        """Returns a Tensor of samples from an LKJ distribution.

    Args:
      num_samples: Python `int`. The number of samples to draw.
      seed: Python integer seed for RNG
      name: Python `str` name prefixed to Ops created by this function.

    Returns:
      samples: A Tensor of correlation matrices with shape `[n, B, D, D]`,
        where `B` is the shape of the `concentration` parameter, and `D`
        is the `dimension`.

    Raises:
      ValueError: If `dimension` is negative.
    """
        if self.dimension < 0:
            raise ValueError(
                'Cannot sample negative-dimension correlation matrices.')
        # Notation below: B is the batch shape, i.e., tf.shape(concentration)
        seed = SeedStream(seed, 'sample_lkj')
        with tf.name_scope(name or 'sample_lkj'):
            concentration = tf.convert_to_tensor(self.concentration)
            if not dtype_util.is_floating(concentration.dtype):
                raise TypeError(
                    'The concentration argument should have floating type, not '
                    '{}'.format(dtype_util.name(concentration.dtype)))

            concentration = _replicate(num_samples, concentration)
            concentration_shape = tf.shape(concentration)
            if self.dimension <= 1:
                # For any dimension <= 1, there is only one possible correlation matrix.
                shape = tf.concat(
                    [concentration_shape, [self.dimension, self.dimension]],
                    axis=0)
                return tf.ones(shape=shape, dtype=concentration.dtype)
            beta_conc = concentration + (self.dimension - 2.) / 2.
            beta_dist = beta.Beta(concentration1=beta_conc,
                                  concentration0=beta_conc)

            # Note that the sampler below deviates from [1], by doing the sampling in
            # cholesky space. This does not change the fundamental logic of the
            # sampler, but does speed up the sampling.

            # This is the correlation coefficient between the first two dimensions.
            # This is also `r` in reference [1].
            corr12 = 2. * beta_dist.sample(seed=seed()) - 1.

            # Below we construct the Cholesky of the initial 2x2 correlation matrix,
            # which is of the form:
            # [[1, 0], [r, sqrt(1 - r**2)]], where r is the correlation between the
            # first two dimensions.
            # This is the top-left corner of the cholesky of the final sample.
            first_row = tf.concat([
                tf.ones_like(corr12)[..., tf.newaxis],
                tf.zeros_like(corr12)[..., tf.newaxis]
            ],
                                  axis=-1)
            second_row = tf.concat([
                corr12[..., tf.newaxis],
                tf.sqrt(1 - corr12**2)[..., tf.newaxis]
            ],
                                   axis=-1)

            chol_result = tf.concat([
                first_row[..., tf.newaxis, :], second_row[..., tf.newaxis, :]
            ],
                                    axis=-2)

            for n in range(2, self.dimension):
                # Loop invariant: on entry, result has shape B + [n, n]
                beta_conc = beta_conc - 0.5
                # norm is y in reference [1].
                norm = beta.Beta(concentration1=n / 2.,
                                 concentration0=beta_conc).sample(seed=seed())
                # distance shape: B + [1] for broadcast
                distance = tf.sqrt(norm)[..., tf.newaxis]
                # direction is u in reference [1].
                # direction shape: B + [n]
                direction = _uniform_unit_norm(n, concentration_shape,
                                               concentration.dtype, seed)
                # raw_correlation is w in reference [1].
                raw_correlation = distance * direction  # shape: B + [n]

                # This is the next row in the cholesky of the result,
                # which differs from the construction in reference [1].
                # In the reference, the new row `z` = chol_result @ raw_correlation^T
                # = C @ raw_correlation^T (where as short hand we use C = chol_result).
                # We prove that the below equation is the right row to add to the
                # cholesky, by showing equality with reference [1].
                # Let S be the sample constructed so far, and let `z` be as in
                # reference [1]. Then at this iteration, the new sample S' will be
                # [[S z^T]
                #  [z 1]]
                # In our case we have the cholesky decomposition factor C, so
                # we want our new row x (same size as z) to satisfy:
                #  [[S z^T]    [[C 0]   [[C^T  x^T]    [[CC^T   Cx^T]
                #   [z 1]]  =   [x k]] @ [0    k]]  =   [xC^T   xx^T + k**2]]
                # Since C @ raw_correlation^T = z = C @ x^T, and C is invertible,
                # we have that x = raw_correlation. Also 1 = xx^T + k**2, so k
                # = sqrt(1 - xx^T) = sqrt(1 - |raw_correlation|**2) = sqrt(1 -
                # distance**2).
                new_row = tf.concat(
                    [raw_correlation,
                     tf.sqrt(1. - norm[..., tf.newaxis])],
                    axis=-1)

                # Finally add this new row, by growing the cholesky of the result.
                chol_result = tf.concat([
                    chol_result,
                    tf.zeros_like(chol_result[..., 0][..., tf.newaxis])
                ],
                                        axis=-1)

                chol_result = tf.concat(
                    [chol_result, new_row[..., tf.newaxis, :]], axis=-2)

            if self.input_output_cholesky:
                return chol_result

            result = tf.matmul(chol_result, chol_result, transpose_b=True)
            # The diagonal for a correlation matrix should always be ones. Due to
            # numerical instability the matmul might not achieve that, so manually set
            # these to ones.
            result = tf.linalg.set_diag(
                result, tf.ones(shape=tf.shape(result)[:-1],
                                dtype=result.dtype))
            # This sampling algorithm can produce near-PSD matrices on which standard
            # algorithms such as `tf.cholesky` or `tf.linalg.self_adjoint_eigvals`
            # fail. Specifically, as documented in b/116828694, around 2% of trials
            # of 900,000 5x5 matrices (distributed according to 9 different
            # concentration parameter values) contained at least one matrix on which
            # the Cholesky decomposition failed.
            return result
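For reference, the whole Cholesky-space "onion" construction above condenses to a short non-batched NumPy sketch (assumes dimension >= 2; the uniform direction is drawn by normalizing a Gaussian; the helper is ours, not the class API):

import numpy as np

def sample_lkj_np(dimension, concentration, rng=None):
    rng = np.random.default_rng() if rng is None else rng
    beta_conc = concentration + (dimension - 2.) / 2.
    corr12 = 2. * rng.beta(beta_conc, beta_conc) - 1.
    chol = np.array([[1., 0.],
                     [corr12, np.sqrt(1. - corr12**2)]])
    for n in range(2, dimension):
        beta_conc -= 0.5
        norm = rng.beta(n / 2., beta_conc)       # y in reference [1]
        distance = np.sqrt(norm)
        u = rng.normal(size=n)
        direction = u / np.linalg.norm(u)        # uniform on the unit sphere
        raw_correlation = distance * direction   # w in reference [1]
        new_row = np.concatenate([raw_correlation, [np.sqrt(1. - norm)]])
        chol = np.hstack([chol, np.zeros((n, 1))])
        chol = np.vstack([chol, new_row])
    corr = chol @ chol.T
    np.fill_diagonal(corr, 1.)
    return corr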