Example 1
  def testKLRaises(self):
    ind1 = independent_lib.Independent(
        distribution=normal_lib.Normal(
            loc=np.float32([-1., 1]),
            scale=np.float32([0.1, 0.5])),
        reinterpreted_batch_ndims=1)
    ind2 = independent_lib.Independent(
        distribution=normal_lib.Normal(
            loc=np.float32(-1),
            scale=np.float32(0.5)),
        reinterpreted_batch_ndims=0)

    with self.assertRaisesRegexp(
        ValueError, "Event shapes do not match"):
      kullback_leibler.kl_divergence(ind1, ind2)

    ind1 = independent_lib.Independent(
        distribution=normal_lib.Normal(
            loc=np.float32([-1., 1]),
            scale=np.float32([0.1, 0.5])),
        reinterpreted_batch_ndims=1)
    ind2 = independent_lib.Independent(
        distribution=mvn_diag_lib.MultivariateNormalDiag(
            loc=np.float32([-1., 1]),
            scale_diag=np.float32([0.1, 0.5])),
        reinterpreted_batch_ndims=0)

    with self.assertRaisesRegexp(
        NotImplementedError, "different event shapes"):
      kullback_leibler.kl_divergence(ind1, ind2)
Example 2
  def testBetaBetaKL(self):
    with self.test_session() as sess:
      for shape in [(10,), (4, 5)]:
        a1 = 6.0 * np.random.random(size=shape) + 1e-4
        b1 = 6.0 * np.random.random(size=shape) + 1e-4
        a2 = 6.0 * np.random.random(size=shape) + 1e-4
        b2 = 6.0 * np.random.random(size=shape) + 1e-4
        # Take inverse softplus of values to test BetaWithSoftplusConcentration
        a1_sp = np.log(np.exp(a1) - 1.0)
        b1_sp = np.log(np.exp(b1) - 1.0)
        a2_sp = np.log(np.exp(a2) - 1.0)
        b2_sp = np.log(np.exp(b2) - 1.0)

        d1 = beta_lib.Beta(concentration1=a1, concentration0=b1)
        d2 = beta_lib.Beta(concentration1=a2, concentration0=b2)
        d1_sp = beta_lib.BetaWithSoftplusConcentration(concentration1=a1_sp,
                                                       concentration0=b1_sp)
        d2_sp = beta_lib.BetaWithSoftplusConcentration(concentration1=a2_sp,
                                                       concentration0=b2_sp)

        kl_expected = (special.betaln(a2, b2) - special.betaln(a1, b1) +
                       (a1 - a2) * special.digamma(a1) +
                       (b1 - b2) * special.digamma(b1) +
                       (a2 - a1 + b2 - b1) * special.digamma(a1 + b1))

        for dist1 in [d1, d1_sp]:
          for dist2 in [d2, d2_sp]:
            kl = kullback_leibler.kl_divergence(dist1, dist2)
            kl_val = sess.run(kl)
            self.assertEqual(kl.get_shape(), shape)
            self.assertAllClose(kl_val, kl_expected)

        # Make sure KL(d1||d1) is 0
        kl_same = sess.run(kullback_leibler.kl_divergence(d1, d1))
        self.assertAllClose(kl_same, np.zeros_like(kl_expected))
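As a sanity check outside TensorFlow, the closed form used for kl_expected above can be compared against a Monte Carlo estimate. A minimal NumPy/SciPy sketch (the parameter values here are illustrative, not taken from the test):

import numpy as np
from scipy import special, stats

a1, b1 = 1.5, 2.0  # concentration1, concentration0 of the first Beta
a2, b2 = 2.5, 1.0  # parameters of the second Beta

# Analytic KL(Beta(a1, b1) || Beta(a2, b2)); same expression as kl_expected.
kl_closed = (special.betaln(a2, b2) - special.betaln(a1, b1) +
             (a1 - a2) * special.digamma(a1) +
             (b1 - b2) * special.digamma(b1) +
             (a2 - a1 + b2 - b1) * special.digamma(a1 + b1))

# Monte Carlo estimate: E_{x ~ Beta(a1, b1)}[log p1(x) - log p2(x)].
x = stats.beta(a1, b1).rvs(size=200000, random_state=0)
kl_mc = np.mean(stats.beta(a1, b1).logpdf(x) - stats.beta(a2, b2).logpdf(x))

print(kl_closed, kl_mc)  # the two agree to roughly two decimal places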
Example 3
  def testDirichletDirichletKL(self):
    conc1 = np.array([[1., 2., 3., 1.5, 2.5, 3.5],
                      [1.5, 2.5, 3.5, 4.5, 5.5, 6.5]])
    conc2 = np.array([[0.5, 1., 1.5, 2., 2.5, 3.]])

    d1 = dirichlet_lib.Dirichlet(conc1)
    d2 = dirichlet_lib.Dirichlet(conc2)
    x = d1.sample(int(1e4), seed=0)
    kl_sample = math_ops.reduce_mean(d1.log_prob(x) - d2.log_prob(x), 0)
    kl_actual = kullback_leibler.kl_divergence(d1, d2)

    kl_sample_val = self.evaluate(kl_sample)
    kl_actual_val = self.evaluate(kl_actual)

    self.assertEqual(conc1.shape[:-1], kl_actual.get_shape())

    if not special:
      return

    kl_expected = (
        special.gammaln(np.sum(conc1, -1))
        - special.gammaln(np.sum(conc2, -1))
        - np.sum(special.gammaln(conc1) - special.gammaln(conc2), -1)
        + np.sum((conc1 - conc2) * (special.digamma(conc1) - special.digamma(
            np.sum(conc1, -1, keepdims=True))), -1))

    self.assertAllClose(kl_expected, kl_actual_val, atol=0., rtol=1e-6)
    self.assertAllClose(kl_sample_val, kl_actual_val, atol=0., rtol=1e-1)

    # Make sure KL(d1||d1) is 0
    kl_same = self.evaluate(kullback_leibler.kl_divergence(d1, d1))
    self.assertAllClose(kl_same, np.zeros_like(kl_expected))
Example 4
  def testDomainErrorExceptions(self):

    class MyDistException(normal.Normal):
      pass

    # Register a KL implementation that returns NaN values
    @kullback_leibler.RegisterKL(MyDistException, MyDistException)
    # pylint: disable=unused-argument,unused-variable
    def _kl(a, b, name=None):
      return array_ops.identity([float("nan")])

    # pylint: enable=unused-argument,unused-variable

    with self.cached_session():
      a = MyDistException(loc=0.0, scale=1.0, allow_nan_stats=False)
      kl = kullback_leibler.kl_divergence(a, a, allow_nan_stats=False)
      with self.assertRaisesOpError(
          "KL calculation between .* and .* returned NaN values"):
        self.evaluate(kl)
      with self.assertRaisesOpError(
          "KL calculation between .* and .* returned NaN values"):
        a.kl_divergence(a).eval()
      a = MyDistException(loc=0.0, scale=1.0, allow_nan_stats=True)
      kl_ok = kullback_leibler.kl_divergence(a, a)
      self.assertAllEqual([float("nan")], self.evaluate(kl_ok))
      self_kl_ok = a.kl_divergence(a)
      self.assertAllEqual([float("nan")], self.evaluate(self_kl_ok))
      cross_ok = a.cross_entropy(a)
      self.assertAllEqual([float("nan")], self.evaluate(cross_ok))
Example 5
  def testCategoricalCategoricalKL(self):
    def np_softmax(logits):
      exp_logits = np.exp(logits)
      return exp_logits / exp_logits.sum(axis=-1, keepdims=True)

    with self.cached_session() as sess:
      for categories in [2, 10]:
        for batch_size in [1, 2]:
          p_logits = self._rng.random_sample((batch_size, categories))
          q_logits = self._rng.random_sample((batch_size, categories))
          p = onehot_categorical.OneHotCategorical(logits=p_logits)
          q = onehot_categorical.OneHotCategorical(logits=q_logits)
          prob_p = np_softmax(p_logits)
          prob_q = np_softmax(q_logits)
          kl_expected = np.sum(
              prob_p * (np.log(prob_p) - np.log(prob_q)), axis=-1)

          kl_actual = kullback_leibler.kl_divergence(p, q)
          kl_same = kullback_leibler.kl_divergence(p, p)
          x = p.sample(int(2e4), seed=0)
          x = math_ops.cast(x, dtype=dtypes.float32)
          # Compute empirical KL(p||q).
          kl_sample = math_ops.reduce_mean(p.log_prob(x) - q.log_prob(x), 0)

          [kl_sample_, kl_actual_, kl_same_] = sess.run([kl_sample, kl_actual,
                                                         kl_same])
          self.assertEqual(kl_actual.get_shape(), (batch_size,))
          self.assertAllClose(kl_same_, np.zeros_like(kl_expected))
          self.assertAllClose(kl_actual_, kl_expected, atol=0., rtol=1e-6)
          self.assertAllClose(kl_sample_, kl_expected, atol=1e-2, rtol=0.)
Example 6
  def testCategoricalCategoricalKL(self):

    def np_softmax(logits):
      exp_logits = np.exp(logits)
      return exp_logits / exp_logits.sum(axis=-1, keepdims=True)

    with self.cached_session() as sess:
      for categories in [2, 4]:
        for batch_size in [1, 10]:
          a_logits = np.random.randn(batch_size, categories)
          b_logits = np.random.randn(batch_size, categories)

          a = categorical.Categorical(logits=a_logits)
          b = categorical.Categorical(logits=b_logits)

          kl = kullback_leibler.kl_divergence(a, b)
          kl_val = sess.run(kl)
          # Make sure KL(a||a) is 0
          kl_same = sess.run(kullback_leibler.kl_divergence(a, a))

          prob_a = np_softmax(a_logits)
          prob_b = np_softmax(b_logits)
          kl_expected = np.sum(prob_a * (np.log(prob_a) - np.log(prob_b)),
                               axis=-1)

          self.assertEqual(kl.get_shape(), (batch_size,))
          self.assertAllClose(kl_val, kl_expected)
          self.assertAllClose(kl_same, np.zeros_like(kl_expected))
Example 7
def _kl_independent(a, b, name="kl_independent"):
  """Batched KL divergence `KL(a || b)` for Independent distributions.

  We can leverage the fact that
  ```
  KL(Independent(a) || Independent(b)) = sum(KL(a || b))
  ```
  where the sum is over the `reinterpreted_batch_ndims`.

  Args:
    a: Instance of `Independent`.
    b: Instance of `Independent`.
    name: (optional) name to use for created ops. Default "kl_independent".

  Returns:
    Batchwise `KL(a || b)`.

  Raises:
    ValueError: If the event spaces of `a` and `b`, or of their underlying
      distributions, do not match.
  """
  p = a.distribution
  q = b.distribution

  # The KL between any two (non)-batched distributions is a scalar.
  # Given that the KL between two factored distributions is the sum, i.e.
  # KL(p1(x)p2(y) || q1(x)q2(y)) = KL(p1 || q1) + KL(p2 || q2), we compute
  # KL(p || q) and do a `reduce_sum` on the reinterpreted batch dimensions.
  if a.event_shape.is_fully_defined() and b.event_shape.is_fully_defined():
    if a.event_shape == b.event_shape:
      if p.event_shape == q.event_shape:
        num_reduce_dims = a.event_shape.ndims - p.event_shape.ndims
        reduce_dims = [-i - 1 for i in range(0, num_reduce_dims)]

        return math_ops.reduce_sum(
            kullback_leibler.kl_divergence(p, q, name=name), axis=reduce_dims)
      else:
        raise NotImplementedError("KL between Independents with different "
                                  "event shapes not supported.")
    else:
      raise ValueError("Event shapes do not match.")
  else:
    with ops.control_dependencies([
        check_ops.assert_equal(a.event_shape_tensor(), b.event_shape_tensor()),
        check_ops.assert_equal(p.event_shape_tensor(), q.event_shape_tensor())
    ]):
      num_reduce_dims = (
          array_ops.shape(a.event_shape_tensor())[0] -
          array_ops.shape(p.event_shape_tensor())[0])
      reduce_dims = math_ops.range(-num_reduce_dims, 0, 1)
      return math_ops.reduce_sum(
          kullback_leibler.kl_divergence(p, q, name=name), axis=reduce_dims)
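As a standalone illustration of the identity in the docstring, here is a small NumPy sketch (toy parameter values, plain NumPy rather than the library code): the KL between two Independent-wrapped batches of scalar Normals with reinterpreted_batch_ndims=1 is the sum of the per-component Normal KLs.

import numpy as np

def normal_kl(mu_a, s_a, mu_b, s_b):
  # Closed-form KL(N(mu_a, s_a^2) || N(mu_b, s_b^2)) for scalar Normals.
  return np.log(s_b / s_a) + (s_a**2 + (mu_a - mu_b)**2) / (2. * s_b**2) - 0.5

mu1, s1 = np.array([-1., 1.]), np.array([0.1, 0.5])
mu2, s2 = np.array([-3., 3.]), np.array([0.3, 0.3])

per_component = normal_kl(mu1, s1, mu2, s2)  # batch of scalar KLs, shape (2,)
# Reinterpreting the batch dimension as an event dimension sums these KLs,
# which is exactly the reduce_sum performed by _kl_independent above.
print(per_component, per_component.sum())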
Example 8
    def test_kl_reverse(self):
        with self.test_session() as sess:

            q = normal_lib.Normal(loc=np.ones(6),
                                  scale=np.array(
                                      [0.5, 1.0, 1.5, 2.0, 2.5, 3.0]))

            p = normal_lib.Normal(loc=q.loc + 0.1, scale=q.scale - 0.2)

            approx_kl = cd.monte_carlo_csiszar_f_divergence(
                f=cd.kl_reverse,
                p_log_prob=p.log_prob,
                q=q,
                num_draws=int(1e5),
                seed=1)

            approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
                f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
                p_log_prob=p.log_prob,
                q=q,
                num_draws=int(1e5),
                seed=1)

            exact_kl = kullback_leibler.kl_divergence(q, p)

            [approx_kl_, approx_kl_self_normalized_, exact_kl_
             ] = sess.run([approx_kl, approx_kl_self_normalized, exact_kl])

            self.assertAllClose(approx_kl_, exact_kl_, rtol=0.07, atol=0.)

            self.assertAllClose(approx_kl_self_normalized_,
                                exact_kl_,
                                rtol=0.02,
                                atol=0.)
Example 9
  def test_kl_reverse(self):
    with self.test_session() as sess:

      q = normal_lib.Normal(
          loc=np.ones(6),
          scale=np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0]))

      p = normal_lib.Normal(loc=q.loc + 0.1, scale=q.scale - 0.2)

      approx_kl = cd.monte_carlo_csiszar_f_divergence(
          f=cd.kl_reverse,
          p=p,
          q=q,
          num_draws=int(1e5),
          seed=1)

      approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
          f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
          p=p,
          q=q,
          num_draws=int(1e5),
          seed=1)

      exact_kl = kullback_leibler.kl_divergence(q, p)

      [approx_kl_, approx_kl_self_normalized_, exact_kl_] = sess.run([
          approx_kl, approx_kl_self_normalized, exact_kl])

      self.assertAllClose(approx_kl_, exact_kl_,
                          rtol=0.07, atol=0.)

      self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
                          rtol=0.02, atol=0.)
Example 10
  def test_convergence_to_kl_using_sample_form_on_3dim_normal(self):
    # Test that the sample mean KL is the same as analytic when we use samples
    # to estimate every part of the KL divergence ratio.
    vector_shape = (2, 3)
    n_samples = 5000

    with self.test_session():
      q = mvn_diag_lib.MultivariateNormalDiag(
          loc=self._rng.rand(*vector_shape),
          scale_diag=self._rng.rand(*vector_shape))
      p = mvn_diag_lib.MultivariateNormalDiag(
          loc=self._rng.rand(*vector_shape),
          scale_diag=self._rng.rand(*vector_shape))

      # In this case, the log_ratio is the KL.
      sample_kl = -1 * entropy.elbo_ratio(
          log_p=p.log_prob,
          q=q,
          n=n_samples,
          form=entropy.ELBOForms.sample,
          seed=42)
      actual_kl = kullback_leibler_lib.kl_divergence(q, p)

      # Relative tolerance (rtol) chosen twice as large as the minimum needed
      # to pass.
      self.assertEqual((2,), sample_kl.get_shape())
      self.assertAllClose(actual_kl.eval(), sample_kl.eval(), rtol=0.05)
Example 11
  def testGammaGammaKL(self):
    alpha0 = np.array([3.])
    beta0 = np.array([1., 2., 3., 1.5, 2.5, 3.5])

    alpha1 = np.array([0.4])
    beta1 = np.array([0.5, 1., 1.5, 2., 2.5, 3.])

    # Build graph.
    with self.test_session() as sess:
      g0 = gamma_lib.Gamma(concentration=alpha0, rate=beta0)
      g1 = gamma_lib.Gamma(concentration=alpha1, rate=beta1)
      x = g0.sample(int(1e4), seed=0)
      kl_sample = math_ops.reduce_mean(g0.log_prob(x) - g1.log_prob(x), 0)
      kl_actual = kullback_leibler.kl_divergence(g0, g1)

    # Execute graph.
    [kl_sample_, kl_actual_] = sess.run([kl_sample, kl_actual])

    kl_expected = ((alpha0 - alpha1) * special.digamma(alpha0)
                   + special.gammaln(alpha1)
                   - special.gammaln(alpha0)
                   + alpha1 * np.log(beta0)
                   - alpha1 * np.log(beta1)
                   + alpha0 * (beta1 / beta0 - 1.))

    self.assertEqual(beta0.shape, kl_actual.get_shape())
    self.assertAllClose(kl_expected, kl_actual_, atol=0., rtol=1e-6)
    self.assertAllClose(kl_sample_, kl_actual_, atol=0., rtol=1e-2)
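The same kind of standalone check works for the Gamma KL above; one detail worth a sketch is that scipy.stats.gamma is parameterized by a scale, the reciprocal of the rate used in the test (values chosen here for illustration):

import numpy as np
from scipy import special, stats

alpha0, beta0 = 3.0, 2.0  # concentration, rate of the first Gamma
alpha1, beta1 = 0.4, 1.0  # concentration, rate of the second Gamma

# Analytic KL(Gamma(alpha0, beta0) || Gamma(alpha1, beta1)), as in kl_expected.
kl_closed = ((alpha0 - alpha1) * special.digamma(alpha0)
             + special.gammaln(alpha1) - special.gammaln(alpha0)
             + alpha1 * np.log(beta0) - alpha1 * np.log(beta1)
             + alpha0 * (beta1 / beta0 - 1.))

# Monte Carlo estimate; note scale = 1 / rate in scipy's parameterization.
g0 = stats.gamma(a=alpha0, scale=1. / beta0)
g1 = stats.gamma(a=alpha1, scale=1. / beta1)
x = g0.rvs(size=200000, random_state=0)
kl_mc = np.mean(g0.logpdf(x) - g1.logpdf(x))

print(kl_closed, kl_mc)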
Example 12
  def test_kl_reverse_multidim(self):

    with self.test_session() as sess:
      d = 5  # Dimension

      p = mvn_full_lib.MultivariateNormalFullCovariance(
          covariance_matrix=self._tridiag(d, diag_value=1, offdiag_value=0.5))

      q = mvn_diag_lib.MultivariateNormalDiag(scale_diag=[0.5]*d)

      approx_kl = cd.monte_carlo_csiszar_f_divergence(
          f=cd.kl_reverse,
          p=p,
          q=q,
          num_draws=int(1e5),
          seed=1)

      approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
          f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
          p=p,
          q=q,
          num_draws=int(1e5),
          seed=1)

      exact_kl = kullback_leibler.kl_divergence(q, p)

      [approx_kl_, approx_kl_self_normalized_, exact_kl_] = sess.run([
          approx_kl, approx_kl_self_normalized, exact_kl])

      self.assertAllClose(approx_kl_, exact_kl_,
                          rtol=0.02, atol=0.)

      self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
                          rtol=0.08, atol=0.)
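For intuition about what cd.kl_reverse estimates in the tests above, here is a minimal one-dimensional NumPy/SciPy sketch (scalar Gaussians assumed for simplicity, unlike the multivariate test): with u = p(x)/q(x) and x drawn from q, the sample mean of -log u is a Monte Carlo estimate of KL(q || p).

import numpy as np
from scipy import stats

p = stats.norm(loc=1.1, scale=0.8)  # "model" distribution p
q = stats.norm(loc=1.0, scale=1.0)  # sampling distribution q

x = q.rvs(size=100000, random_state=1)
log_u = p.logpdf(x) - q.logpdf(x)    # log u = log p(x) - log q(x)
approx_kl_reverse = np.mean(-log_u)  # Monte Carlo estimate of KL(q || p)

# Closed-form KL(q || p) for the two scalar Gaussians, for comparison.
exact_kl = np.log(0.8 / 1.0) + (1.0**2 + (1.0 - 1.1)**2) / (2 * 0.8**2) - 0.5
print(approx_kl_reverse, exact_kl)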
Example 13
 def testDefaultVariationalAndPrior(self):
   _, prior, variational, _, log_likelihood = mini_vae()
   elbo = vi.elbo(log_likelihood)
   expected_elbo = log_likelihood - kullback_leibler.kl_divergence(
       variational.distribution, prior)
   with self.test_session() as sess:
     sess.run(variables.global_variables_initializer())
     self.assertAllEqual(*sess.run([expected_elbo, elbo]))
Example 14
 def testDefaultVariationalAndPrior(self):
     _, prior, variational, _, log_likelihood = mini_vae()
     elbo = vi.elbo(log_likelihood)
     expected_elbo = log_likelihood - kullback_leibler.kl_divergence(
         variational.distribution, prior)
     with self.test_session() as sess:
         sess.run(variables.global_variables_initializer())
         self.assertAllEqual(*sess.run([expected_elbo, elbo]))
Example 15
    def testKLIdentity(self):
        normal1 = normal_lib.Normal(loc=np.float32([-1., 1]),
                                    scale=np.float32([0.1, 0.5]))
        # This is functionally just a wrapper around normal1,
        # and doesn't change any outputs.
        ind1 = independent_lib.Independent(distribution=normal1,
                                           reinterpreted_batch_ndims=0)

        normal2 = normal_lib.Normal(loc=np.float32([-3., 3]),
                                    scale=np.float32([0.3, 0.3]))
        # This is functionally just a wrapper around normal2,
        # and doesn't change any outputs.
        ind2 = independent_lib.Independent(distribution=normal2,
                                           reinterpreted_batch_ndims=0)

        normal_kl = kullback_leibler.kl_divergence(normal1, normal2)
        ind_kl = kullback_leibler.kl_divergence(ind1, ind2)
        self.assertAllClose(self.evaluate(normal_kl), self.evaluate(ind_kl))
Example 16
  def testKLScalarToMultivariate(self):
    normal1 = normal_lib.Normal(
        loc=np.float32([-1., 1]),
        scale=np.float32([0.1, 0.5]))
    ind1 = independent_lib.Independent(
        distribution=normal1, reinterpreted_batch_ndims=1)

    normal2 = normal_lib.Normal(
        loc=np.float32([-3., 3]),
        scale=np.float32([0.3, 0.3]))
    ind2 = independent_lib.Independent(
        distribution=normal2, reinterpreted_batch_ndims=1)

    normal_kl = kullback_leibler.kl_divergence(normal1, normal2)
    ind_kl = kullback_leibler.kl_divergence(ind1, ind2)
    self.assertAllClose(
        self.evaluate(math_ops.reduce_sum(normal_kl, axis=-1)),
        self.evaluate(ind_kl))
Example 17
    def __init__(
            self,
            units,
            activation=None,
            activity_regularizer=None,
            trainable=True,
            kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
            kernel_posterior_tensor_fn=lambda d: d.sample(),
            kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
                loc=dtype.as_numpy_dtype(0.),
                scale=dtype.as_numpy_dtype(1.)),
            kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(
                q, p),
            bias_posterior_fn=layers_util.default_mean_field_normal_fn(
                is_singular=True),
            bias_posterior_tensor_fn=lambda d: d.sample(),
            bias_prior_fn=None,
            bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
            seed=None,
            name=None,
            **kwargs):
        # pylint: disable=g-doc-args
        """Construct layer.

    Args:
      @{args}
    """
        # pylint: enable=g-doc-args
        super(DenseFlipout, self).__init__(
            units=units,
            activation=activation,
            activity_regularizer=activity_regularizer,
            trainable=trainable,
            kernel_posterior_fn=kernel_posterior_fn,
            kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
            kernel_prior_fn=kernel_prior_fn,
            kernel_divergence_fn=kernel_divergence_fn,
            bias_posterior_fn=bias_posterior_fn,
            bias_posterior_tensor_fn=bias_posterior_tensor_fn,
            bias_prior_fn=bias_prior_fn,
            bias_divergence_fn=bias_divergence_fn,
            name=name,
            **kwargs)
        self.seed = seed
Example 18
 def testExplicitVariationalAndPrior(self):
     with self.test_session() as sess:
         _, _, variational, _, log_likelihood = mini_vae()
         prior = normal.Normal(loc=3., scale=2.)
         elbo = vi.elbo(log_likelihood,
                        variational_with_prior={variational: prior})
         expected_elbo = log_likelihood - kullback_leibler.kl_divergence(
             variational.distribution, prior)
         sess.run(variables.global_variables_initializer())
         self.assertAllEqual(*sess.run([expected_elbo, elbo]))
Example 19
    def __init__(
            self,
            units,
            activation=None,
            activity_regularizer=None,
            trainable=True,
            kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
            kernel_posterior_tensor_fn=lambda d: d.sample(),
            kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
                loc=dtype.as_numpy_dtype(0.),
                scale=dtype.as_numpy_dtype(1.)),
            kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(
                q, p),
            bias_posterior_fn=layers_util.default_mean_field_normal_fn(
                is_singular=True),  # pylint: disable=line-too-long
            bias_posterior_tensor_fn=lambda d: d.sample(),
            bias_prior_fn=None,
            bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
            name=None,
            **kwargs):
        # pylint: disable=g-doc-args
        """Construct layer.

    Args:
      @{args}
    """
        # pylint: enable=g-doc-args
        super(_DenseVariational,
              self).__init__(trainable=trainable,
                             name=name,
                             activity_regularizer=activity_regularizer,
                             **kwargs)
        self.units = units
        self.activation = activation
        self.input_spec = layers_lib.InputSpec(min_ndim=2)
        self.kernel_posterior_fn = kernel_posterior_fn
        self.kernel_posterior_tensor_fn = kernel_posterior_tensor_fn
        self.kernel_prior_fn = kernel_prior_fn
        self.kernel_divergence_fn = kernel_divergence_fn
        self.bias_posterior_fn = bias_posterior_fn
        self.bias_posterior_tensor_fn = bias_posterior_tensor_fn
        self.bias_prior_fn = bias_prior_fn
        self.bias_divergence_fn = bias_divergence_fn
Example 20
 def testExplicitVariationalAndPrior(self):
   with self.test_session() as sess:
     _, _, variational, _, log_likelihood = mini_vae()
     prior = normal.Normal(loc=3., scale=2.)
     elbo = vi.elbo(
         log_likelihood, variational_with_prior={variational: prior})
     expected_elbo = log_likelihood - kullback_leibler.kl_divergence(
         variational.distribution, prior)
     sess.run(variables.global_variables_initializer())
     self.assertAllEqual(*sess.run([expected_elbo, elbo]))
Example 21
    def testDomainErrorExceptions(self):
        class MyDistException(normal.Normal):
            pass

        # Register a KL implementation that returns NaN values
        @kullback_leibler.RegisterKL(MyDistException, MyDistException)
        # pylint: disable=unused-argument,unused-variable
        def _kl(a, b, name=None):
            return array_ops.identity([float("nan")])

        # pylint: enable=unused-argument,unused-variable

        with self.test_session():
            a = MyDistException(loc=0.0, scale=1.0)
            kl = kullback_leibler.kl_divergence(a, a, allow_nan_stats=False)
            with self.assertRaisesOpError(
                    "KL calculation between .* and .* returned NaN values"):
                kl.eval()
            kl_ok = kullback_leibler.kl_divergence(a, a)
            self.assertAllEqual([float("nan")], kl_ok.eval())
Example 22
    def testRegistration(self):
        class MyDist(normal.Normal):
            pass

        # Register KL to a lambda that spits out the name parameter
        @kullback_leibler.RegisterKL(MyDist, MyDist)
        def _kl(a, b, name=None):  # pylint: disable=unused-argument,unused-variable
            return name

        a = MyDist(loc=0.0, scale=1.0)
        self.assertEqual("OK", kullback_leibler.kl_divergence(a, a, name="OK"))
Example 23
  def testKLMultivariateToMultivariate(self):
    # (1, 1, 2) batch of MVNDiag
    mvn1 = mvn_diag_lib.MultivariateNormalDiag(
        loc=np.float32([[[[-1., 1, 3.], [2., 4., 3.]]]]),
        scale_diag=np.float32([[[0.2, 0.1, 5.], [2., 3., 4.]]]))
    ind1 = independent_lib.Independent(
        distribution=mvn1, reinterpreted_batch_ndims=2)

    # (1, 1, 2) batch of MVNDiag
    mvn2 = mvn_diag_lib.MultivariateNormalDiag(
        loc=np.float32([[[[-2., 3, 2.], [1., 3., 2.]]]]),
        scale_diag=np.float32([[[0.1, 0.5, 3.], [1., 2., 1.]]]))

    ind2 = independent_lib.Independent(
        distribution=mvn2, reinterpreted_batch_ndims=2)

    mvn_kl = kullback_leibler.kl_divergence(mvn1, mvn2)
    ind_kl = kullback_leibler.kl_divergence(ind1, ind2)
    self.assertAllClose(
        self.evaluate(math_ops.reduce_sum(mvn_kl, axis=[-1, -2])),
        self.evaluate(ind_kl))
Example 24
    def testKLMultivariateToMultivariate(self):
        # (1, 1, 2) batch of MVNDiag
        mvn1 = mvn_diag_lib.MultivariateNormalDiag(
            loc=np.float32([[[[-1., 1, 3.], [2., 4., 3.]]]]),
            scale_diag=np.float32([[[0.2, 0.1, 5.], [2., 3., 4.]]]))
        ind1 = independent_lib.Independent(distribution=mvn1,
                                           reinterpreted_batch_ndims=2)

        # (1, 1, 2) batch of MVNDiag
        mvn2 = mvn_diag_lib.MultivariateNormalDiag(
            loc=np.float32([[[[-2., 3, 2.], [1., 3., 2.]]]]),
            scale_diag=np.float32([[[0.1, 0.5, 3.], [1., 2., 1.]]]))

        ind2 = independent_lib.Independent(distribution=mvn2,
                                           reinterpreted_batch_ndims=2)

        mvn_kl = kullback_leibler.kl_divergence(mvn1, mvn2)
        ind_kl = kullback_leibler.kl_divergence(ind1, ind2)
        self.assertAllClose(
            self.evaluate(math_ops.reduce_sum(mvn_kl, axis=[-1, -2])),
            self.evaluate(ind_kl))
Example 25
  def testRegistration(self):

    class MyDist(normal.Normal):
      pass

    # Register KL to a lambda that spits out the name parameter
    @kullback_leibler.RegisterKL(MyDist, MyDist)
    def _kl(a, b, name=None):  # pylint: disable=unused-argument,unused-variable
      return name

    a = MyDist(loc=0.0, scale=1.0)
    self.assertEqual("OK", kullback_leibler.kl_divergence(a, a, name="OK"))
Example 26
  def testKLIdentity(self):
    normal1 = normal_lib.Normal(
        loc=np.float32([-1., 1]),
        scale=np.float32([0.1, 0.5]))
    # This is functionally just a wrapper around normal1,
    # and doesn't change any outputs.
    ind1 = independent_lib.Independent(
        distribution=normal1, reinterpreted_batch_ndims=0)

    normal2 = normal_lib.Normal(
        loc=np.float32([-3., 3]),
        scale=np.float32([0.3, 0.3]))
    # This is functionally just a wrapper around normal2,
    # and doesn't change any outputs.
    ind2 = independent_lib.Independent(
        distribution=normal2, reinterpreted_batch_ndims=0)

    normal_kl = kullback_leibler.kl_divergence(normal1, normal2)
    ind_kl = kullback_leibler.kl_divergence(ind1, ind2)
    self.assertAllClose(
        self.evaluate(normal_kl), self.evaluate(ind_kl))
Example 27
    def testCategoricalCategoricalKL(self):
        def np_softmax(logits):
            exp_logits = np.exp(logits)
            return exp_logits / exp_logits.sum(axis=-1, keepdims=True)

        with self.test_session() as sess:
            for categories in [2, 10]:
                for batch_size in [1, 2]:
                    p_logits = self._rng.random_sample(
                        (batch_size, categories))
                    q_logits = self._rng.random_sample(
                        (batch_size, categories))
                    p = onehot_categorical.OneHotCategorical(logits=p_logits)
                    q = onehot_categorical.OneHotCategorical(logits=q_logits)
                    prob_p = np_softmax(p_logits)
                    prob_q = np_softmax(q_logits)
                    kl_expected = np.sum(prob_p *
                                         (np.log(prob_p) - np.log(prob_q)),
                                         axis=-1)

                    kl_actual = kullback_leibler.kl_divergence(p, q)
                    kl_same = kullback_leibler.kl_divergence(p, p)
                    x = p.sample(int(2e4), seed=0)
                    x = math_ops.cast(x, dtype=dtypes.float32)
                    # Compute empirical KL(p||q).
                    kl_sample = math_ops.reduce_mean(
                        p.log_prob(x) - q.log_prob(x), 0)

                    [kl_sample_, kl_actual_,
                     kl_same_] = sess.run([kl_sample, kl_actual, kl_same])
                    self.assertEqual(kl_actual.get_shape(), (batch_size, ))
                    self.assertAllClose(kl_same_, np.zeros_like(kl_expected))
                    self.assertAllClose(kl_actual_,
                                        kl_expected,
                                        atol=0.,
                                        rtol=1e-6)
                    self.assertAllClose(kl_sample_,
                                        kl_expected,
                                        atol=1e-2,
                                        rtol=0.)
Example 28
    def testIndirectRegistration(self):
        class Sub1(normal.Normal):
            pass

        class Sub2(normal.Normal):
            pass

        class Sub11(Sub1):
            pass

        # pylint: disable=unused-argument,unused-variable
        @kullback_leibler.RegisterKL(Sub1, Sub1)
        def _kl11(a, b, name=None):
            return "sub1-1"

        @kullback_leibler.RegisterKL(Sub1, Sub2)
        def _kl12(a, b, name=None):
            return "sub1-2"

        @kullback_leibler.RegisterKL(Sub2, Sub1)
        def _kl21(a, b, name=None):
            return "sub2-1"

        # pylint: enable=unused-argument,unused-variable

        sub1 = Sub1(loc=0.0, scale=1.0)
        sub2 = Sub2(loc=0.0, scale=1.0)
        sub11 = Sub11(loc=0.0, scale=1.0)

        self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub1, sub1))
        self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub1, sub2))
        self.assertEqual("sub2-1", kullback_leibler.kl_divergence(sub2, sub1))
        self.assertEqual("sub1-1",
                         kullback_leibler.kl_divergence(sub11, sub11))
        self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub1))
        self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub11, sub2))
        self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub1))
        self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub11, sub2))
        self.assertEqual("sub2-1", kullback_leibler.kl_divergence(sub2, sub11))
        self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub1, sub11))
Example 29
  def testIndirectRegistration(self):

    class Sub1(normal.Normal):
      pass

    class Sub2(normal.Normal):
      pass

    class Sub11(Sub1):
      pass

    # pylint: disable=unused-argument,unused-variable
    @kullback_leibler.RegisterKL(Sub1, Sub1)
    def _kl11(a, b, name=None):
      return "sub1-1"

    @kullback_leibler.RegisterKL(Sub1, Sub2)
    def _kl12(a, b, name=None):
      return "sub1-2"

    @kullback_leibler.RegisterKL(Sub2, Sub1)
    def _kl21(a, b, name=None):
      return "sub2-1"

    # pylint: enable=unused-argument,unused-variable

    sub1 = Sub1(loc=0.0, scale=1.0)
    sub2 = Sub2(loc=0.0, scale=1.0)
    sub11 = Sub11(loc=0.0, scale=1.0)

    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub1, sub1))
    self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub1, sub2))
    self.assertEqual("sub2-1", kullback_leibler.kl_divergence(sub2, sub1))
    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub11))
    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub1))
    self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub11, sub2))
    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub1))
    self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub11, sub2))
    self.assertEqual("sub2-1", kullback_leibler.kl_divergence(sub2, sub11))
    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub1, sub11))
Example 30
    def testKLRaises(self):
        ind1 = independent_lib.Independent(distribution=normal_lib.Normal(
            loc=np.float32([-1., 1]), scale=np.float32([0.1, 0.5])),
                                           reinterpreted_batch_ndims=1)
        ind2 = independent_lib.Independent(distribution=normal_lib.Normal(
            loc=np.float32(-1), scale=np.float32(0.5)),
                                           reinterpreted_batch_ndims=0)

        with self.assertRaisesRegexp(ValueError, "Event shapes do not match"):
            kullback_leibler.kl_divergence(ind1, ind2)

        ind1 = independent_lib.Independent(distribution=normal_lib.Normal(
            loc=np.float32([-1., 1]), scale=np.float32([0.1, 0.5])),
                                           reinterpreted_batch_ndims=1)
        ind2 = independent_lib.Independent(
            distribution=mvn_diag_lib.MultivariateNormalDiag(
                loc=np.float32([-1., 1]), scale_diag=np.float32([0.1, 0.5])),
            reinterpreted_batch_ndims=0)

        with self.assertRaisesRegexp(NotImplementedError,
                                     "different event shapes"):
            kullback_leibler.kl_divergence(ind1, ind2)
Example 31
 def __init__(
     self,
     units,
     activation=None,
     activity_regularizer=None,
     trainable=True,
     kernel_use_local_reparameterization=True,
     kernel_posterior_fn=default_mean_field_normal_fn(),
     kernel_posterior_tensor_fn=lambda d: d.sample(),
     kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
         loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
     kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
     bias_posterior_fn=default_mean_field_normal_fn(is_singular=True),
     bias_posterior_tensor_fn=lambda d: d.sample(),
     bias_prior_fn=None,
     bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
     name=None,
     **kwargs):
   super(DenseVariational, self).__init__(
       trainable=trainable,
       name=name,
       activity_regularizer=activity_regularizer,
       **kwargs)
   self._units = units
   self._activation = activation
   self._input_spec = layers_lib.InputSpec(min_ndim=2)
   self._kernel_use_local_reparameterization = (
       kernel_use_local_reparameterization)
   self._kernel = VariationalKernelParameter(
       kernel_posterior_fn,
       kernel_posterior_tensor_fn,
       kernel_prior_fn,
       kernel_divergence_fn)
   self._bias = VariationalParameter(
       bias_posterior_fn,
       bias_posterior_tensor_fn,
       bias_prior_fn,
       bias_divergence_fn)
Example 32
    def testBetaBetaKL(self):
        with self.test_session() as sess:
            for shape in [(10, ), (4, 5)]:
                a1 = 6.0 * np.random.random(size=shape) + 1e-4
                b1 = 6.0 * np.random.random(size=shape) + 1e-4
                a2 = 6.0 * np.random.random(size=shape) + 1e-4
                b2 = 6.0 * np.random.random(size=shape) + 1e-4
                # Take inverse softplus of values to test BetaWithSoftplusConcentration
                a1_sp = np.log(np.exp(a1) - 1.0)
                b1_sp = np.log(np.exp(b1) - 1.0)
                a2_sp = np.log(np.exp(a2) - 1.0)
                b2_sp = np.log(np.exp(b2) - 1.0)

                d1 = beta_lib.Beta(concentration1=a1, concentration0=b1)
                d2 = beta_lib.Beta(concentration1=a2, concentration0=b2)
                d1_sp = beta_lib.BetaWithSoftplusConcentration(
                    concentration1=a1_sp, concentration0=b1_sp)
                d2_sp = beta_lib.BetaWithSoftplusConcentration(
                    concentration1=a2_sp, concentration0=b2_sp)

                if not special:
                    return
                kl_expected = (special.betaln(a2, b2) -
                               special.betaln(a1, b1) +
                               (a1 - a2) * special.digamma(a1) +
                               (b1 - b2) * special.digamma(b1) +
                               (a2 - a1 + b2 - b1) * special.digamma(a1 + b1))

                for dist1 in [d1, d1_sp]:
                    for dist2 in [d2, d2_sp]:
                        kl = kullback_leibler.kl_divergence(dist1, dist2)
                        kl_val = sess.run(kl)
                        self.assertEqual(kl.get_shape(), shape)
                        self.assertAllClose(kl_val, kl_expected)

                # Make sure KL(d1||d1) is 0
                kl_same = sess.run(kullback_leibler.kl_divergence(d1, d1))
                self.assertAllClose(kl_same, np.zeros_like(kl_expected))
Example 33
 def __init__(
         self,
         units,
         activation=None,
         activity_regularizer=None,
         trainable=True,
         kernel_use_local_reparameterization=True,
         kernel_posterior_fn=default_mean_field_normal_fn(),
         kernel_posterior_tensor_fn=lambda d: d.sample(),
         kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
             loc=dtype.as_numpy_dtype(0.),
             scale=dtype.as_numpy_dtype(1.)),
         kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(
             q, p),
         bias_posterior_fn=default_mean_field_normal_fn(is_singular=True),
         bias_posterior_tensor_fn=lambda d: d.sample(),
         bias_prior_fn=None,
         bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
         name=None,
         **kwargs):
     super(DenseVariational,
           self).__init__(trainable=trainable,
                          name=name,
                          activity_regularizer=activity_regularizer,
                          **kwargs)
     self._units = units
     self._activation = activation
     self._input_spec = layers_lib.InputSpec(min_ndim=2)
     self._kernel_use_local_reparameterization = (
         kernel_use_local_reparameterization)
     self._kernel = VariationalKernelParameter(kernel_posterior_fn,
                                               kernel_posterior_tensor_fn,
                                               kernel_prior_fn,
                                               kernel_divergence_fn)
     self._bias = VariationalParameter(bias_posterior_fn,
                                       bias_posterior_tensor_fn,
                                       bias_prior_fn, bias_divergence_fn)
Example 34
  def testBernoulliBernoulliKL(self):
    batch_size = 6
    a_p = np.array([0.5] * batch_size, dtype=np.float32)
    b_p = np.array([0.4] * batch_size, dtype=np.float32)

    a = bernoulli.Bernoulli(probs=a_p)
    b = bernoulli.Bernoulli(probs=b_p)

    kl = kullback_leibler.kl_divergence(a, b)
    kl_val = self.evaluate(kl)

    kl_expected = (a_p * np.log(a_p / b_p) + (1. - a_p) * np.log(
        (1. - a_p) / (1. - b_p)))

    self.assertEqual(kl.get_shape(), (batch_size,))
    self.assertAllClose(kl_val, kl_expected)
Example 35
    def testBernoulliBernoulliKL(self):
        batch_size = 6
        a_p = np.array([0.5] * batch_size, dtype=np.float32)
        b_p = np.array([0.4] * batch_size, dtype=np.float32)

        a = bernoulli.Bernoulli(probs=a_p)
        b = bernoulli.Bernoulli(probs=b_p)

        kl = kullback_leibler.kl_divergence(a, b)
        kl_val = self.evaluate(kl)

        kl_expected = (a_p * np.log(a_p / b_p) + (1. - a_p) * np.log(
            (1. - a_p) / (1. - b_p)))

        self.assertEqual(kl.get_shape(), (batch_size, ))
        self.assertAllClose(kl_val, kl_expected)
Example 36
    def test_docstring_example_normal(self):
        with self.cached_session() as sess:
            num_draws = int(1e5)
            mu_p = constant_op.constant(0.)
            mu_q = constant_op.constant(1.)
            p = normal_lib.Normal(loc=mu_p, scale=1.)
            q = normal_lib.Normal(loc=mu_q, scale=2.)
            exact_kl_normal_normal = kullback_leibler.kl_divergence(p, q)
            approx_kl_normal_normal = monte_carlo_lib.expectation(
                f=lambda x: p.log_prob(x) - q.log_prob(x),
                samples=p.sample(num_draws, seed=42),
                log_prob=p.log_prob,
                use_reparametrization=(p.reparameterization_type ==
                                       distribution_lib.FULLY_REPARAMETERIZED))
            [exact_kl_normal_normal_, approx_kl_normal_normal_
             ] = sess.run([exact_kl_normal_normal, approx_kl_normal_normal])
            self.assertEqual(
                True, p.reparameterization_type ==
                distribution_lib.FULLY_REPARAMETERIZED)
            self.assertAllClose(exact_kl_normal_normal_,
                                approx_kl_normal_normal_,
                                rtol=0.01,
                                atol=0.)

            # Compare gradients. (Not present in `docstring`.)
            gradp = lambda fp: gradients_impl.gradients(fp, mu_p)[0]
            gradq = lambda fq: gradients_impl.gradients(fq, mu_q)[0]
            [
                gradp_exact_kl_normal_normal_,
                gradq_exact_kl_normal_normal_,
                gradp_approx_kl_normal_normal_,
                gradq_approx_kl_normal_normal_,
            ] = sess.run([
                gradp(exact_kl_normal_normal),
                gradq(exact_kl_normal_normal),
                gradp(approx_kl_normal_normal),
                gradq(approx_kl_normal_normal),
            ])
            self.assertAllClose(gradp_exact_kl_normal_normal_,
                                gradp_approx_kl_normal_normal_,
                                rtol=0.01,
                                atol=0.)
            self.assertAllClose(gradq_exact_kl_normal_normal_,
                                gradq_approx_kl_normal_normal_,
                                rtol=0.01,
                                atol=0.)
Example 37
  def testNormalNormalKL(self):
    batch_size = 6
    mu_a = np.array([3.0] * batch_size)
    sigma_a = np.array([1.0, 2.0, 3.0, 1.5, 2.5, 3.5])
    mu_b = np.array([-3.0] * batch_size)
    sigma_b = np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0])

    n_a = normal_lib.Normal(loc=mu_a, scale=sigma_a)
    n_b = normal_lib.Normal(loc=mu_b, scale=sigma_b)

    kl = kullback_leibler.kl_divergence(n_a, n_b)
    kl_val = self.evaluate(kl)

    kl_expected = ((mu_a - mu_b)**2 / (2 * sigma_b**2) + 0.5 * (
        (sigma_a**2 / sigma_b**2) - 1 - 2 * np.log(sigma_a / sigma_b)))

    self.assertEqual(kl.get_shape(), (batch_size,))
    self.assertAllClose(kl_val, kl_expected)
Example 38
    def testNormalNormalKL(self):
        batch_size = 6
        mu_a = np.array([3.0] * batch_size)
        sigma_a = np.array([1.0, 2.0, 3.0, 1.5, 2.5, 3.5])
        mu_b = np.array([-3.0] * batch_size)
        sigma_b = np.array([0.5, 1.0, 1.5, 2.0, 2.5, 3.0])

        n_a = normal_lib.Normal(loc=mu_a, scale=sigma_a)
        n_b = normal_lib.Normal(loc=mu_b, scale=sigma_b)

        kl = kullback_leibler.kl_divergence(n_a, n_b)
        kl_val = self.evaluate(kl)

        kl_expected = ((mu_a - mu_b)**2 / (2 * sigma_b**2) + 0.5 * (
            (sigma_a**2 / sigma_b**2) - 1 - 2 * np.log(sigma_a / sigma_b)))

        self.assertEqual(kl.get_shape(), (batch_size, ))
        self.assertAllClose(kl_val, kl_expected)
Example 39
  def test_docstring_example_gamma(self):
    with self.test_session() as sess:
      num_draws = int(1e5)
      concentration_p = constant_op.constant(1.)
      concentration_q = constant_op.constant(2.)
      p = gamma_lib.Gamma(concentration=concentration_p, rate=1.)
      q = gamma_lib.Gamma(concentration=concentration_q, rate=3.)
      approx_kl_gamma_gamma = monte_carlo_lib.expectation(
          f=lambda x: p.log_prob(x) - q.log_prob(x),
          samples=p.sample(num_draws, seed=42),
          log_prob=p.log_prob,
          use_reparametrization=(p.reparameterization_type
                                 == distribution_lib.FULLY_REPARAMETERIZED))
      exact_kl_gamma_gamma = kullback_leibler.kl_divergence(p, q)
      [exact_kl_gamma_gamma_, approx_kl_gamma_gamma_] = sess.run([
          exact_kl_gamma_gamma, approx_kl_gamma_gamma])
      self.assertEqual(
          False,
          p.reparameterization_type == distribution_lib.FULLY_REPARAMETERIZED)
      self.assertAllClose(exact_kl_gamma_gamma_, approx_kl_gamma_gamma_,
                          rtol=0.01, atol=0.)

      # Compare gradients. (Not present in `docstring`.)
      gradp = lambda fp: gradients_impl.gradients(fp, concentration_p)[0]
      gradq = lambda fq: gradients_impl.gradients(fq, concentration_q)[0]
      [
          gradp_exact_kl_gamma_gamma_,
          gradq_exact_kl_gamma_gamma_,
          gradp_approx_kl_gamma_gamma_,
          gradq_approx_kl_gamma_gamma_,
      ] = sess.run([
          gradp(exact_kl_gamma_gamma),
          gradq(exact_kl_gamma_gamma),
          gradp(approx_kl_gamma_gamma),
          gradq(approx_kl_gamma_gamma),
      ])
      # Notice that variance (i.e., `rtol`) is higher when using score-trick.
      self.assertAllClose(gradp_exact_kl_gamma_gamma_,
                          gradp_approx_kl_gamma_gamma_,
                          rtol=0.05, atol=0.)
      self.assertAllClose(gradq_exact_kl_gamma_gamma_,
                          gradq_approx_kl_gamma_gamma_,
                          rtol=0.03, atol=0.)
Example 40
  def test_docstring_example_normal(self):
    with self.test_session() as sess:
      num_draws = int(1e5)
      mu_p = constant_op.constant(0.)
      mu_q = constant_op.constant(1.)
      p = normal_lib.Normal(loc=mu_p, scale=1.)
      q = normal_lib.Normal(loc=mu_q, scale=2.)
      exact_kl_normal_normal = kullback_leibler.kl_divergence(p, q)
      approx_kl_normal_normal = monte_carlo_lib.expectation(
          f=lambda x: p.log_prob(x) - q.log_prob(x),
          samples=p.sample(num_draws, seed=42),
          log_prob=p.log_prob,
          use_reparametrization=(p.reparameterization_type
                                 == distribution_lib.FULLY_REPARAMETERIZED))
      [exact_kl_normal_normal_, approx_kl_normal_normal_] = sess.run([
          exact_kl_normal_normal, approx_kl_normal_normal])
      self.assertEqual(
          True,
          p.reparameterization_type == distribution_lib.FULLY_REPARAMETERIZED)
      self.assertAllClose(exact_kl_normal_normal_, approx_kl_normal_normal_,
                          rtol=0.01, atol=0.)

      # Compare gradients. (Not present in `docstring`.)
      gradp = lambda fp: gradients_impl.gradients(fp, mu_p)[0]
      gradq = lambda fq: gradients_impl.gradients(fq, mu_q)[0]
      [
          gradp_exact_kl_normal_normal_,
          gradq_exact_kl_normal_normal_,
          gradp_approx_kl_normal_normal_,
          gradq_approx_kl_normal_normal_,
      ] = sess.run([
          gradp(exact_kl_normal_normal),
          gradq(exact_kl_normal_normal),
          gradp(approx_kl_normal_normal),
          gradq(approx_kl_normal_normal),
      ])
      self.assertAllClose(gradp_exact_kl_normal_normal_,
                          gradp_approx_kl_normal_normal_,
                          rtol=0.01, atol=0.)
      self.assertAllClose(gradq_exact_kl_normal_normal_,
                          gradq_approx_kl_normal_normal_,
                          rtol=0.01, atol=0.)
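The use_reparametrization flag above matters because fully reparameterized samples let gradients flow through the sampling step. A tiny NumPy sketch of the idea (toy integrand; the setup is illustrative): writing x = mu + eps with eps ~ N(0, 1), the gradient of E[f(x)] with respect to mu can be estimated by differentiating through the samples.

import numpy as np

rng = np.random.RandomState(42)
eps = rng.randn(200000)  # base noise, independent of the parameter mu

mu = 1.0
f_prime = lambda x: 2.0 * x  # derivative of the toy integrand f(x) = x**2

# Reparameterization: x = mu + eps, so d/dmu E[f(x)] = E[f'(mu + eps)].
grad_reparam = np.mean(f_prime(mu + eps))

# Exact answer: E_{N(mu, 1)}[x^2] = mu^2 + 1, so the gradient is 2 * mu.
print(grad_reparam, 2.0 * mu)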
Example 41
    def test_kl_forward_multidim(self):

        with self.test_session() as sess:
            d = 5  # Dimension

            p = mvn_full_lib.MultivariateNormalFullCovariance(
                covariance_matrix=self._tridiag(
                    d, diag_value=1, offdiag_value=0.5))

            # Variance is very high when approximating Forward KL, so we make
            # scale_diag larger than in test_kl_reverse_multidim. This ensures q
            # "covers" p and thus Var_q[p/q] is smaller.
            q = mvn_diag_lib.MultivariateNormalDiag(scale_diag=[1.] * d)

            approx_kl = cd.monte_carlo_csiszar_f_divergence(f=cd.kl_forward,
                                                            p=p,
                                                            q=q,
                                                            num_draws=int(1e5),
                                                            seed=1)

            approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
                f=lambda logu: cd.kl_forward(logu, self_normalized=True),
                p=p,
                q=q,
                num_draws=int(1e5),
                seed=1)

            exact_kl = kullback_leibler.kl_divergence(p, q)

            [approx_kl_, approx_kl_self_normalized_, exact_kl_
             ] = sess.run([approx_kl, approx_kl_self_normalized, exact_kl])

            self.assertAllClose(approx_kl_, exact_kl_, rtol=0.06, atol=0.)

            self.assertAllClose(approx_kl_self_normalized_,
                                exact_kl_,
                                rtol=0.05,
                                atol=0.)
Example 42
  def test_kl_forward_multidim(self):

    with self.test_session() as sess:
      d = 5  # Dimension

      p = mvn_full_lib.MultivariateNormalFullCovariance(
          covariance_matrix=self._tridiag(d, diag_value=1, offdiag_value=0.5))

      # Variance is very high when approximating Forward KL, so we make
      # scale_diag larger than in test_kl_reverse_multidim. This ensures q
      # "covers" p and thus Var_q[p/q] is smaller.
      q = mvn_diag_lib.MultivariateNormalDiag(scale_diag=[1.]*d)

      approx_kl = cd.monte_carlo_csiszar_f_divergence(
          f=cd.kl_forward,
          p=p,
          q=q,
          num_draws=int(1e5),
          seed=1)

      approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
          f=lambda logu: cd.kl_forward(logu, self_normalized=True),
          p=p,
          q=q,
          num_draws=int(1e5),
          seed=1)

      exact_kl = kullback_leibler.kl_divergence(p, q)

      [approx_kl_, approx_kl_self_normalized_, exact_kl_] = sess.run([
          approx_kl, approx_kl_self_normalized, exact_kl])

      self.assertAllClose(approx_kl_, exact_kl_,
                          rtol=0.06, atol=0.)

      self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
                          rtol=0.05, atol=0.)
Example 43
def _elbo(form, log_likelihood, log_joint, variational_with_prior,
          keep_batch_dim):
  """Internal implementation of ELBO. Users should use `elbo`.

  Args:
    form: ELBOForms constant. Controls how the ELBO is computed.
    log_likelihood: `Tensor` log p(x|Z).
    log_joint: `Tensor` log p(x, Z).
    variational_with_prior: `dict<StochasticTensor, Distribution>`, mapping
      variational distributions to prior distributions.
    keep_batch_dim: bool. Whether to keep the batch dimension when reducing
      the entropy/KL.

  Returns:
    ELBO `Tensor` with same shape and dtype as `log_likelihood`/`log_joint`.
  """
  ELBOForms.check_form(form)

  # Order of preference
  # 1. Analytic KL: log_likelihood - KL(q||p)
  # 2. Analytic entropy: log_likelihood + log p(Z) + H[q], or log_joint + H[q]
  # 3. Sample: log_likelihood - (log q(Z) - log p(Z)) =
  #            log_likelihood + log p(Z) - log q(Z), or log_joint - log q(Z)

  def _reduce(val):
    if keep_batch_dim:
      return val
    else:
      return math_ops.reduce_sum(val)

  kl_terms = []
  entropy_terms = []
  prior_terms = []
  for q, z, p in [(qz.distribution, qz.value(), pz)
                  for qz, pz in variational_with_prior.items()]:
    # Analytic KL
    kl = None
    if log_joint is None and form in {ELBOForms.default, ELBOForms.analytic_kl}:
      try:
        kl = kullback_leibler.kl_divergence(q, p)
        logging.info("Using analytic KL between q:%s, p:%s", q, p)
      except NotImplementedError as e:
        if form == ELBOForms.analytic_kl:
          raise e
    if kl is not None:
      kl_terms.append(-1. * _reduce(kl))
      continue

    # Analytic entropy
    entropy = None
    if form in {ELBOForms.default, ELBOForms.analytic_entropy}:
      try:
        entropy = q.entropy()
        logging.info("Using analytic entropy for q:%s", q)
      except NotImplementedError as e:
        if form == ELBOForms.analytic_entropy:
          raise e
    if entropy is not None:
      entropy_terms.append(_reduce(entropy))
      if log_likelihood is not None:
        prior = p.log_prob(z)
        prior_terms.append(_reduce(prior))
      continue

    # Sample
    if form in {ELBOForms.default, ELBOForms.sample}:
      entropy = -q.log_prob(z)
      entropy_terms.append(_reduce(entropy))
      if log_likelihood is not None:
        prior = p.log_prob(z)
        prior_terms.append(_reduce(prior))

  first_term = log_joint if log_joint is not None else log_likelihood
  return sum([first_term] + kl_terms + entropy_terms + prior_terms)
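The three forms in the preference list agree in expectation. A small standalone NumPy/SciPy sketch (a toy one-dimensional Gaussian model; all names here are illustrative) comparing the analytic-KL form against the pure sample form:

import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
q = stats.norm(0.5, 0.7)      # variational posterior q(z)
prior = stats.norm(0.0, 1.0)  # prior p(z)
log_lik = lambda z: stats.norm(z, 1.0).logpdf(2.0)  # log p(x=2 | z)

z = q.rvs(size=200000, random_state=rng)

# Form 1: analytic KL.  KL(q || prior) in closed form for Gaussians.
kl = np.log(1.0 / 0.7) + (0.7**2 + 0.5**2) / 2.0 - 0.5
elbo_analytic_kl = np.mean(log_lik(z)) - kl

# Form 3: sample form, E_q[log p(x|z) + log p(z) - log q(z)].
elbo_sample = np.mean(log_lik(z) + prior.logpdf(z) - q.logpdf(z))

print(elbo_analytic_kl, elbo_sample)  # agree up to Monte Carlo error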
Example 44
def dense_flipout(
        inputs,
        units,
        activation=None,
        activity_regularizer=None,
        trainable=True,
        kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
        kernel_posterior_tensor_fn=lambda d: d.sample(),
        kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
            loc=dtype.as_numpy_dtype(0.),
            scale=dtype.as_numpy_dtype(1.)),
        kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
        bias_posterior_fn=layers_util.default_mean_field_normal_fn(
            is_singular=True),
        bias_posterior_tensor_fn=lambda d: d.sample(),
        bias_prior_fn=None,
        bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
        seed=None,
        name=None,
        reuse=None):
    # pylint: disable=g-doc-args
    """Densely-connected layer with Flipout estimator.

  This layer implements the Bayesian variational inference analogue to
  a dense layer by assuming the `kernel` and/or the `bias` are drawn
  from distributions. By default, the layer implements a stochastic
  forward pass via sampling from the kernel and bias posteriors,

  ```none
  kernel, bias ~ posterior
  outputs = activation(matmul(inputs, kernel) + bias)
  ```

  It uses the Flipout estimator [1], which performs a Monte Carlo
  approximation of the distribution integrating over the `kernel` and
  `bias`. Flipout uses roughly twice as many floating point operations
  as the reparameterization estimator but has the advantage of
  significantly lower variance.

  The arguments permit separate specification of the surrogate posterior
  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
  distributions.

  Args:
    inputs: Tensor input.
    @{args}

  Returns:
    output: `Tensor` representing the affine-transformed input under a random
      draw from the surrogate posterior distribution.

  #### Examples

  We illustrate a Bayesian neural network with [variational inference](
  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
  assuming a dataset of `features` and `labels`.

  ```python
  tfp = tf.contrib.bayesflow

  net = tfp.layers.dense_flipout(
      features, 512, activation=tf.nn.relu)
  logits = tfp.layers.dense_flipout(net, 10)
  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
      labels=labels, logits=logits)
  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
  loss = neg_log_likelihood + kl
  train_op = tf.train.AdamOptimizer().minimize(loss)
  ```

  It uses the Flipout gradient estimator to minimize the
  Kullback-Leibler divergence up to a constant, also known as the
  negative Evidence Lower Bound. It consists of the sum of two terms:
  the expected negative log-likelihood, which we approximate via
  Monte Carlo; and the KL divergence, which is added via regularizer
  terms which are arguments to the layer.

  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
        Mini-Batches."
        Anonymous. OpenReview, 2017.
        https://openreview.net/forum?id=rJnpifWAb
  """
    # pylint: enable=g-doc-args
    layer = DenseFlipout(units,
                         activation=activation,
                         activity_regularizer=activity_regularizer,
                         trainable=trainable,
                         kernel_posterior_fn=kernel_posterior_fn,
                         kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
                         kernel_prior_fn=kernel_prior_fn,
                         kernel_divergence_fn=kernel_divergence_fn,
                         bias_posterior_fn=bias_posterior_fn,
                         bias_posterior_tensor_fn=bias_posterior_tensor_fn,
                         bias_prior_fn=bias_prior_fn,
                         bias_divergence_fn=bias_divergence_fn,
                         seed=seed,
                         name=name,
                         dtype=inputs.dtype.base_dtype,
                         _scope=name,
                         _reuse=reuse)
    return layer.apply(inputs)
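
The function above only wires the estimator into a layer; the estimator itself rests on a simple algebraic identity from [1]. A NumPy sketch (all names here are illustrative, not from the library) verifying that a shared weight perturbation, decorrelated across examples by per-example sign vectors, never requires materializing a per-example kernel:

```python
# Hypothetical check of the Flipout identity: for per-example signs s_n
# (input side) and r_n (output side),
#   x_n @ (W_mean + delta_W * outer(s_n, r_n))
# equals
#   x_n @ W_mean + ((x_n * s_n) @ delta_W) * r_n.
import numpy as np

rng = np.random.RandomState(0)
batch, n_in, n_out = 8, 5, 3
x = rng.randn(batch, n_in)
w_mean = rng.randn(n_in, n_out)
delta_w = rng.randn(n_in, n_out)  # One shared posterior perturbation.
s = rng.choice([-1., 1.], size=(batch, n_in))
r = rng.choice([-1., 1.], size=(batch, n_out))

# Naive: build an explicit per-example kernel (what Flipout avoids).
naive = np.stack([x[n] @ (w_mean + delta_w * np.outer(s[n], r[n]))
                  for n in range(batch)])

# Flipout: two matmuls total, no per-example kernel.
flipout = x @ w_mean + ((x * s) @ delta_w) * r

assert np.allclose(naive, flipout)
```
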
Exemplo n.º 47
0
def _kl_divergence(self, other):
    return kullback_leibler.kl_divergence(
        self, other, allow_nan_stats=self.allow_nan_stats)
Exemplo n.º 48
0
    def _build_loss(self, results, features, labels):
        """Creates the loss operation

        Returns:
             tuple `(losses, loss)`:
                `losses` are the per-batch losses.
                `loss` is a single scalar tensor to minimize.
        """
        action = labels['action']
        discount_reward = labels['discount_reward']
        dist_values = labels['dist_values']
        tangents = labels.get('tangents')
        theta = labels.get('theta')

        old_distribution = self._build_distribution(values=dist_values)
        log_probs = self._graph_results.distribution.log_prob(action)
        old_log_probs = old_distribution.log_prob(action)

        self._losses = tf.multiply(x=tf.exp(log_probs - old_log_probs),
                                   y=discount_reward)
        self._surrogate_loss = -tf.reduce_mean(  # pylint: disable=invalid-unary-operand-type
            self._losses,
            axis=0,
            name='surrogate_loss')
        entropy = self._graph_results.distribution.entropy()
        self._entropy_loss = tf.reduce_mean(entropy, name='entropy_loss')
        kl_divergence_value = kl_divergence(self._graph_results.distribution,
                                            old_distribution)
        self._kl_loss = tf.reduce_mean(kl_divergence_value, name='kl_loss')

        if self.is_continuous:
            dist_values_fixed = tf.stop_gradient(
                tf.concat(values=[
                    self._graph_results.distribution.loc,
                    self._graph_results.distribution.scale
                ],
                          axis=0))
        else:
            dist_values_fixed = tf.stop_gradient(
                self._graph_results.distribution.logits)
        distribution_1_fixed = self._build_distribution(
            values=dist_values_fixed)
        kl_divergence_1_fixed = kl_divergence(distribution_1_fixed,
                                              self._graph_results.distribution)
        self._kl_loss_1_fixed = tf.reduce_mean(kl_divergence_1_fixed,
                                               name='kl_loss_1_fixed')

        variables = list(tf.trainable_variables())
        self._loss = self._surrogate_loss
        self._grads_and_vars, self._policy_gradient = self.get_vars_grads(
            [self._surrogate_loss], variables)

        offset = 0
        list_tangents = []
        list_assigns = []
        for variable in variables:
            shape = get_shape(variable)
            size = np.prod(shape)
            list_tangents.append(
                tf.reshape(tangents[offset:offset + size], shape))
            list_assigns.append(
                tf.assign(variable,
                          tf.reshape(theta[offset:offset + size], shape)))
            offset += size

        gradients = tf.gradients(self._kl_loss_1_fixed, variables)
        gradient_vector_product = [
            tf.reduce_sum(g * t) for (g, t) in zip(gradients, list_tangents)
        ]
        _, self._fisher_vector_product = self.get_vars_grads(
            gradient_vector_product, variables)

        self._set_theta = tf.group(*list_assigns)
        self._get_theta = tf.concat(
            axis=0,
            values=[tf.reshape(variable, (-1, )) for variable in variables])
        return self._losses, self._loss
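
The `gradient_vector_product`/`get_vars_grads` pair above is the standard double-backprop trick: differentiating the scalar `(grad KL) . v` a second time yields the Fisher matrix (the Hessian of the KL at the fixed distribution) times `v` without ever forming the matrix. A standalone sketch of that trick in the same TF1 style; the function name and arguments are illustrative, not from this codebase:

```python
import tensorflow as tf

def fisher_vector_product(kl, variables, vector):
  """Computes (d^2 KL / d theta^2) @ vector with two backward passes.

  Assumes `kl` is a scalar depending on every variable in `variables`,
  and `vector` is a flat tensor with one entry per trainable parameter.
  """
  grads = tf.gradients(kl, variables)
  flat_grad = tf.concat([tf.reshape(g, [-1]) for g in grads], axis=0)
  # Gradient-vector product; stop_gradient keeps `vector` out of the
  # graph of the second differentiation.
  gvp = tf.reduce_sum(flat_grad * tf.stop_gradient(vector))
  hessian_grads = tf.gradients(gvp, variables)
  return tf.concat([tf.reshape(h, [-1]) for h in hessian_grads], axis=0)
```
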
Exemplo n.º 49
0
def _kl_divergence(self, other):
  return kullback_leibler.kl_divergence(
      self, other, allow_nan_stats=self.allow_nan_stats)
    def test_score_trick(self):

        with self.test_session() as sess:
            d = 5  # Dimension
            num_draws = int(1e5)
            seed = 1

            p = mvn_full_lib.MultivariateNormalFullCovariance(
                covariance_matrix=self._tridiag(
                    d, diag_value=1, offdiag_value=0.5))

            # Variance is very high when approximating Forward KL, so we make
            # scale_diag larger than in test_kl_reverse_multidim. This ensures q
            # "covers" p and thus Var_q[p/q] is smaller.
            s = array_ops.constant(1.)
            q = mvn_diag_lib.MultivariateNormalDiag(
                scale_diag=array_ops.tile([s], [d]))

            approx_kl = cd.monte_carlo_csiszar_f_divergence(
                f=cd.kl_reverse, p=p, q=q, num_draws=num_draws, seed=seed)

            approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
                f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
                p=p,
                q=q,
                num_draws=num_draws,
                seed=seed)

            approx_kl_score_trick = cd.monte_carlo_csiszar_f_divergence(
                f=cd.kl_reverse,
                p=p,
                q=q,
                num_draws=num_draws,
                use_reparametrization=False,
                seed=seed)

            approx_kl_self_normalized_score_trick = (
                cd.monte_carlo_csiszar_f_divergence(
                    f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
                    p=p,
                    q=q,
                    num_draws=num_draws,
                    use_reparametrization=False,
                    seed=seed))

            exact_kl = kullback_leibler.kl_divergence(q, p)

            grad = lambda fs: gradients_impl.gradients(fs, s)[0]

            [
                approx_kl_,
                approx_kl_self_normalized_,
                approx_kl_score_trick_,
                approx_kl_self_normalized_score_trick_,
                exact_kl_,
            ] = sess.run([
                grad(approx_kl),
                grad(approx_kl_self_normalized),
                grad(approx_kl_score_trick),
                grad(approx_kl_self_normalized_score_trick),
                grad(exact_kl),
            ])

            self.assertAllClose(approx_kl_, exact_kl_, rtol=0.06, atol=0.)

            self.assertAllClose(approx_kl_self_normalized_,
                                exact_kl_,
                                rtol=0.05,
                                atol=0.)

            self.assertAllClose(approx_kl_score_trick_,
                                exact_kl_,
                                rtol=0.06,
                                atol=0.)

            self.assertAllClose(approx_kl_self_normalized_score_trick_,
                                exact_kl_,
                                rtol=0.05,
                                atol=0.)
def dense_variational(
    inputs,
    units,
    activation=None,
    activity_regularizer=None,
    trainable=True,
    kernel_use_local_reparameterization=True,
    kernel_posterior_fn=default_mean_field_normal_fn(),
    kernel_posterior_tensor_fn=lambda d: d.sample(),
    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
    bias_posterior_fn=default_mean_field_normal_fn(is_singular=True),
    bias_posterior_tensor_fn=lambda d: d.sample(),
    bias_prior_fn=None,
    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
    name=None,
    reuse=None):
  """Densely-connected variational layer.

  This layer implements the Bayesian variational inference analogue to:
  `outputs = activation(matmul(inputs, kernel) + bias)`
  by assuming the `kernel` and/or the `bias` are random variables.

  The layer implements a stochastic dense calculation by making a Monte Carlo
  approximation of a [variational Bayesian method based on KL divergence](
  https://en.wikipedia.org/wiki/Variational_Bayesian_methods), i.e.,

  ```none
  -log p(y|x) = -log int_{R**d} p(y|x,w) p(w) dw
              = -log int_{R**d} p(y,w|x) q(w|x) / q(w|x) dw
             <= E_q(W|x)[-log p(y,W|x) + log q(W|x)]       # Jensen's
              = E_q(W|x)[-log p(y|x,W)] + KL[q(W|x), p(W)]
             ~= m**-1 sum{ -log p(y|x,w[j]) : w[j] ~ q(W|x), j=1..m }
                 + KL[q(W|x), p(W)]
  ```

  where `W` denotes the (independent) `kernel` and `bias` random variables, `w`
  is a random variate or outcome of `W`, `y` is the label, `x` is the evidence,
  and `~=` denotes an approximation which becomes exact as `m->inf`. The above
  bound is sometimes referred to as the negative Evidence Lower BOund or
  negative [ELBO](https://arxiv.org/abs/1601.00670). In context of a DNN, this
  layer is appropriate to use when the final loss is a negative log-likelihood.

  The Monte-Carlo sum portion is used for the feed-forward calculation of the
  DNN. The KL divergence portion can be added to the final loss via:
  `loss += sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))`.

  The arguments permit separate specification of the surrogate posterior
  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
  random variables (which together comprise `W`).

  Args:
    inputs: Tensor input.
    units: Integer or Long, dimensionality of the output space.
    activation: Activation function (`callable`). Set it to None to maintain a
      linear activation.
    activity_regularizer: Regularizer function for the output.
    trainable: Boolean, if `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    kernel_use_local_reparameterization: Python `bool` indicating whether
      `kernel` calculation should employ the Local Reparameterization Trick.
      When `True`, `kernel_posterior_fn` must create an instance of
      `tf.distributions.Normal`.
    kernel_posterior_fn: Python `callable` which creates
      `tf.distributions.Distribution` instance representing the surrogate
      posterior of the `kernel` parameter. Default value:
      `default_mean_field_normal_fn()`.
    kernel_posterior_tensor_fn: Python `callable` which takes a
      `tf.distributions.Distribution` instance and returns a representative
      value. Default value: `lambda d: d.sample()`.
    kernel_prior_fn: Python `callable` which creates `tf.distributions`
      instance. See `default_mean_field_normal_fn` docstring for required
      parameter signature.
      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
      distribution, prior distribution and random variate sample(s) from the
      surrogate posterior and computes or approximates the KL divergence. The
      distributions are `tf.distributions.Distribution`-like instances and the
      sample is a `Tensor`.
    bias_posterior_fn: Python `callable` which creates
      `tf.distributions.Distribution` instance representing the surrogate
      posterior of the `bias` parameter. Default value:
      `default_mean_field_normal_fn(is_singular=True)` (which creates an
      instance of `tf.distributions.Deterministic`).
    bias_posterior_tensor_fn: Python `callable` which takes a
      `tf.distributions.Distribution` instance and returns a representative
      value. Default value: `lambda d: d.sample()`.
    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
      See `default_mean_field_normal_fn` docstring for required parameter
      signature. Default value: `None` (no prior, no variational inference).
    bias_divergence_fn: Python `callable` which takes the surrogate posterior
      distribution, prior distribution and random variate sample(s) from the
      surrogate posterior and computes or approximates the KL divergence. The
      distributions are `tf.distributions.Distribution`-like instances and the
      sample is a `Tensor`.
    name: Python `str`, the name of the layer. Layers with the same name will
      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
      such cases.
    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
      layer by the same name.

  Returns:
    output: `Tensor` representing the affine-transformed input under a random
      draw from the surrogate posterior distribution.
  """
  layer = DenseVariational(
      units,
      activation=activation,
      activity_regularizer=activity_regularizer,
      trainable=trainable,
      kernel_use_local_reparameterization=(
          kernel_use_local_reparameterization),
      kernel_posterior_fn=kernel_posterior_fn,
      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
      kernel_prior_fn=kernel_prior_fn,
      kernel_divergence_fn=kernel_divergence_fn,
      bias_posterior_fn=bias_posterior_fn,
      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
      bias_prior_fn=bias_prior_fn,
      bias_divergence_fn=bias_divergence_fn,
      name=name,
      dtype=inputs.dtype.base_dtype,
      _scope=name,
      _reuse=reuse)
  return layer.apply(inputs)
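
Unlike `dense_flipout` above, this docstring carries no Examples section. Assuming the function is exposed under the same `tf.contrib.bayesflow` namespace as the Flipout example, and that `features`/`labels` tensors exist, usage would mirror that example:

```python
# Hypothetical usage sketch, mirroring the dense_flipout example above.
tfp = tf.contrib.bayesflow

net = tfp.layers.dense_variational(
    features, 512, activation=tf.nn.relu)
logits = tfp.layers.dense_variational(net, 10)
neg_log_likelihood = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits))
kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
loss = neg_log_likelihood + kl  # Negative ELBO, up to a constant.
train_op = tf.train.AdamOptimizer().minimize(loss)
```
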
def _elbo(form, log_likelihood, log_joint, variational_with_prior,
          keep_batch_dim):
    """Internal implementation of ELBO. Users should use `elbo`.

  Args:
    form: ELBOForms constant. Controls how the ELBO is computed.
    log_likelihood: `Tensor` log p(x|Z).
    log_joint: `Tensor` log p(x, Z).
    variational_with_prior: `dict<StochasticTensor, Distribution>` mapping
      variational distributions to prior distributions.
    keep_batch_dim: bool. Whether to keep the batch dimension when reducing
      the entropy/KL.

  Returns:
    ELBO `Tensor` with same shape and dtype as `log_likelihood`/`log_joint`.
  """
    ELBOForms.check_form(form)

    # Order of preference
    # 1. Analytic KL: log_likelihood - KL(q||p)
    # 2. Analytic entropy: log_likelihood + log p(Z) + H[q], or log_joint + H[q]
    # 3. Sample: log_likelihood - (log q(Z) - log p(Z)) =
    #            log_likelihood + log p(Z) - log q(Z), or log_joint - log q(Z)

    def _reduce(val):
        if keep_batch_dim:
            return val
        else:
            return math_ops.reduce_sum(val)

    kl_terms = []
    entropy_terms = []
    prior_terms = []
    for q, z, p in [(qz.distribution, qz.value(), pz)
                    for qz, pz in variational_with_prior.items()]:
        # Analytic KL
        kl = None
        if log_joint is None and form in {
                ELBOForms.default, ELBOForms.analytic_kl
        }:
            try:
                kl = kullback_leibler.kl_divergence(q, p)
                logging.info("Using analytic KL between q:%s, p:%s", q, p)
            except NotImplementedError as e:
                if form == ELBOForms.analytic_kl:
                    raise e
        if kl is not None:
            kl_terms.append(-1. * _reduce(kl))
            continue

        # Analytic entropy
        entropy = None
        if form in {ELBOForms.default, ELBOForms.analytic_entropy}:
            try:
                entropy = q.entropy()
                logging.info("Using analytic entropy for q:%s", q)
            except NotImplementedError as e:
                if form == ELBOForms.analytic_entropy:
                    raise e
        if entropy is not None:
            entropy_terms.append(_reduce(entropy))
            if log_likelihood is not None:
                prior = p.log_prob(z)
                prior_terms.append(_reduce(prior))
            continue

        # Sample
        if form in {ELBOForms.default, ELBOForms.sample}:
            entropy = -q.log_prob(z)
            entropy_terms.append(_reduce(entropy))
            if log_likelihood is not None:
                prior = p.log_prob(z)
                prior_terms.append(_reduce(prior))

    first_term = log_joint if log_joint is not None else log_likelihood
    return sum([first_term] + kl_terms + entropy_terms + prior_terms)
Exemplo n.º 53
0
  def test_score_trick(self):

    with self.test_session() as sess:
      d = 5  # Dimension
      num_draws = int(1e5)
      seed = 1

      p = mvn_full_lib.MultivariateNormalFullCovariance(
          covariance_matrix=self._tridiag(d, diag_value=1, offdiag_value=0.5))

      # Variance is very high when approximating Forward KL, so we make
      # scale_diag larger than in test_kl_reverse_multidim. This ensures q
      # "covers" p and thus Var_q[p/q] is smaller.
      s = array_ops.constant(1.)
      q = mvn_diag_lib.MultivariateNormalDiag(
          scale_diag=array_ops.tile([s], [d]))

      approx_kl = cd.monte_carlo_csiszar_f_divergence(
          f=cd.kl_reverse,
          p=p,
          q=q,
          num_draws=num_draws,
          seed=seed)

      approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
          f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
          p=p,
          q=q,
          num_draws=num_draws,
          seed=seed)

      approx_kl_score_trick = cd.monte_carlo_csiszar_f_divergence(
          f=cd.kl_reverse,
          p=p,
          q=q,
          num_draws=num_draws,
          use_reparametrization=False,
          seed=seed)

      approx_kl_self_normalized_score_trick = (
          cd.monte_carlo_csiszar_f_divergence(
              f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
              p=p,
              q=q,
              num_draws=num_draws,
              use_reparametrization=False,
              seed=seed))

      exact_kl = kullback_leibler.kl_divergence(q, p)

      grad = lambda fs: gradients_impl.gradients(fs, s)[0]

      [
          approx_kl_grad_,
          approx_kl_self_normalized_grad_,
          approx_kl_score_trick_grad_,
          approx_kl_self_normalized_score_trick_grad_,
          exact_kl_grad_,
          approx_kl_,
          approx_kl_self_normalized_,
          approx_kl_score_trick_,
          approx_kl_self_normalized_score_trick_,
          exact_kl_,
      ] = sess.run([
          grad(approx_kl),
          grad(approx_kl_self_normalized),
          grad(approx_kl_score_trick),
          grad(approx_kl_self_normalized_score_trick),
          grad(exact_kl),
          approx_kl,
          approx_kl_self_normalized,
          approx_kl_score_trick,
          approx_kl_self_normalized_score_trick,
          exact_kl,
      ])

      # Test average divergence.
      self.assertAllClose(approx_kl_, exact_kl_,
                          rtol=0.02, atol=0.)

      self.assertAllClose(approx_kl_self_normalized_, exact_kl_,
                          rtol=0.08, atol=0.)

      self.assertAllClose(approx_kl_score_trick_, exact_kl_,
                          rtol=0.02, atol=0.)

      self.assertAllClose(approx_kl_self_normalized_score_trick_, exact_kl_,
                          rtol=0.08, atol=0.)

      # Test average gradient-divergence.
      self.assertAllClose(approx_kl_grad_, exact_kl_grad_,
                          rtol=0.007, atol=0.)

      self.assertAllClose(approx_kl_self_normalized_grad_, exact_kl_grad_,
                          rtol=0.011, atol=0.)

      self.assertAllClose(approx_kl_score_trick_grad_, exact_kl_grad_,
                          rtol=0.018, atol=0.)

      self.assertAllClose(
          approx_kl_self_normalized_score_trick_grad_, exact_kl_grad_,
          rtol=0.017, atol=0.)
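
`use_reparametrization=False` above selects the score-function ("score trick") estimator, which trades the pathwise derivative for `E_q[f(z) * d/ds log q_s(z)]`. A self-contained NumPy sketch, using a toy objective rather than the library's estimator, comparing both gradient estimators against the exact value:

```python
# Toy comparison (not library code): gradient of E_{z ~ N(0, s)}[z**2] = s**2
# with respect to the scale s; the exact gradient is 2*s.
import numpy as np

rng = np.random.RandomState(1)
s, n = 1.5, 1000000
eps = rng.randn(n)
z = s * eps  # Draws from N(0, s).

# Reparameterization (pathwise): differentiate f(s * eps) w.r.t. s.
grad_reparam = np.mean(2. * s * eps ** 2)

# Score trick: f(z) * d/ds log q_s(z), where
# log q_s(z) = -log s - z**2 / (2 s**2) + const.
score = -1. / s + z ** 2 / s ** 3
grad_score = np.mean(z ** 2 * score)

exact = 2. * s
assert np.allclose(grad_reparam, exact, rtol=0.01)
assert np.allclose(grad_score, exact, rtol=0.05)  # Higher variance.
```
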
Exemplo n.º 54
0
def dense_local_reparameterization(
        inputs,
        units,
        activation=None,
        activity_regularizer=None,
        trainable=True,
        kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
        kernel_posterior_tensor_fn=lambda d: d.sample(),
        kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
            loc=dtype.as_numpy_dtype(0.),
            scale=dtype.as_numpy_dtype(1.)),
        kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
        bias_posterior_fn=layers_util.default_mean_field_normal_fn(
            is_singular=True),
        bias_posterior_tensor_fn=lambda d: d.sample(),
        bias_prior_fn=None,
        bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
        name=None,
        reuse=None):
    # pylint: disable=g-doc-args
    """Densely-connected layer with local reparameterization estimator.

  This layer implements the Bayesian variational inference analogue to
  a dense layer by assuming the `kernel` and/or the `bias` are drawn
  from distributions. By default, the layer implements a stochastic
  forward pass via sampling from the kernel and bias posteriors,

  ```none
  kernel, bias ~ posterior
  outputs = activation(matmul(inputs, kernel) + bias)
  ```

  It uses the local reparameterization estimator [1], which performs a
  Monte Carlo approximation of the distribution on the hidden units
  induced by the `kernel` and `bias`.

  The arguments permit separate specification of the surrogate posterior
  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
  distributions.

  Args:
    inputs: Tensor input.
    @{args}

  Returns:
    output: `Tensor` representing the affine-transformed input under a random
      draw from the surrogate posterior distribution.

  #### Examples

  We illustrate a Bayesian neural network with [variational inference](
  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
  assuming a dataset of `features` and `labels`.

  ```python
  tfp = tf.contrib.bayesflow

  net = tfp.layers.dense_local_reparameterization(
      features, 512, activation=tf.nn.relu)
  logits = tfp.layers.dense_local_reparameterization(net, 10)
  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
      labels=labels, logits=logits)
  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
  loss = neg_log_likelihood + kl
  train_op = tf.train.AdamOptimizer().minimize(loss)
  ```

  It uses local reparameterization gradients to minimize the
  Kullback-Leibler divergence up to a constant, also known as the
  negative Evidence Lower Bound. It consists of the sum of two terms:
  the expected negative log-likelihood, which we approximate via
  Monte Carlo; and the KL divergence, which is added via regularizer
  terms which are arguments to the layer.

  [1]: "Variational Dropout and the Local Reparameterization Trick."
        Diederik P. Kingma, Tim Salimans, Max Welling.
        Neural Information Processing Systems, 2015.
  """
    # pylint: enable=g-doc-args
    layer = DenseLocalReparameterization(
        units,
        activation=activation,
        activity_regularizer=activity_regularizer,
        trainable=trainable,
        kernel_posterior_fn=kernel_posterior_fn,
        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
        kernel_prior_fn=kernel_prior_fn,
        kernel_divergence_fn=kernel_divergence_fn,
        bias_posterior_fn=bias_posterior_fn,
        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
        bias_prior_fn=bias_prior_fn,
        bias_divergence_fn=bias_divergence_fn,
        name=name,
        dtype=inputs.dtype.base_dtype,
        _scope=name,
        _reuse=reuse)
    return layer.apply(inputs)
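
The estimator in [1] exploits the fact that a linear map of independent Gaussian weights is itself Gaussian, so the pre-activations can be sampled directly with mean `x @ mu` and variance `(x**2) @ (sigma**2)` instead of sampling a kernel. An illustrative NumPy check of that identity (not library code):

```python
# For W_ij ~ N(mu_ij, sigma_ij**2) independent, the pre-activation x @ W is
# Gaussian with mean x @ mu and variance (x**2) @ (sigma**2), so it can be
# sampled without ever sampling W.
import numpy as np

rng = np.random.RandomState(0)
n_in, n_out, n_draws = 4, 3, 200000
x = rng.randn(n_in)
mu = rng.randn(n_in, n_out)
sigma = rng.rand(n_in, n_out) + 0.1

# Weight-space sampling (what the plain reparameterization estimator does).
w = mu + sigma * rng.randn(n_draws, n_in, n_out)
pre_act_w = np.einsum('i,dij->dj', x, w)

# Local reparameterization: sample the pre-activation directly.
m = x @ mu
v = (x ** 2) @ (sigma ** 2)
pre_act_local = m + np.sqrt(v) * rng.randn(n_draws, n_out)

assert np.allclose(pre_act_w.mean(0), pre_act_local.mean(0), atol=0.05)
assert np.allclose(pre_act_w.var(0), pre_act_local.var(0), rtol=0.05)
```
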
Exemplo n.º 55
0
def dense_variational(
        inputs,
        units,
        activation=None,
        activity_regularizer=None,
        trainable=True,
        kernel_use_local_reparameterization=True,
        kernel_posterior_fn=default_mean_field_normal_fn(),
        kernel_posterior_tensor_fn=lambda d: d.sample(),
        kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
            loc=dtype.as_numpy_dtype(0.),
            scale=dtype.as_numpy_dtype(1.)),
        kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
        bias_posterior_fn=default_mean_field_normal_fn(is_singular=True),
        bias_posterior_tensor_fn=lambda d: d.sample(),
        bias_prior_fn=None,
        bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
        name=None,
        reuse=None):
    """Densely-connected variational layer.

  This layer implements the Bayesian variational inference analogue to:
  `outputs = activation(matmul(inputs, kernel) + bias)`
  by assuming the `kernel` and/or the `bias` are random variables.

  The layer implements a stochastic dense calculation by making a Monte Carlo
  approximation of a [variational Bayesian method based on KL divergence](
  https://en.wikipedia.org/wiki/Variational_Bayesian_methods), i.e.,

  ```none
  -log p(y|x) = -log int_{R**d} p(y|x,w) p(w) dw
              = -log int_{R**d} p(y,w|x) q(w|x) / q(w|x) dw
             <= E_q(W|x)[-log p(y,W|x) + log q(W|x)]       # Jensen's
              = E_q(W|x)[-log p(y|x,W)] + KL[q(W|x), p(W)]
             ~= m**-1 sum{ -log p(y|x,w[j]) : w[j] ~ q(W|x), j=1..m }
                 + KL[q(W|x), p(W)]
  ```

  where `W` denotes the (independent) `kernel` and `bias` random variables, `w`
  is a random variate or outcome of `W`, `y` is the label, `x` is the evidence,
  and `~=` denotes an approximation which becomes exact as `m->inf`. The above
  bound is sometimes referred to as the negative Evidence Lower BOund or
  negative [ELBO](https://arxiv.org/abs/1601.00670). In context of a DNN, this
  layer is appropriate to use when the final loss is a negative log-likelihood.

  The Monte-Carlo sum portion is used for the feed-forward calculation of the
  DNN. The KL divergence portion can be added to the final loss via:
  `loss += sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))`.

  The arguments permit separate specification of the surrogate posterior
  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
  random variables (which together comprise `W`).

  Args:
    inputs: Tensor input.
    units: Integer or Long, dimensionality of the output space.
    activation: Activation function (`callable`). Set it to None to maintain a
      linear activation.
    activity_regularizer: Regularizer function for the output.
    trainable: Boolean, if `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    kernel_use_local_reparameterization: Python `bool` indicating whether
      `kernel` calculation should employ the Local Reparameterization Trick.
      When `True`, `kernel_posterior_fn` must create an instance of
      `tf.distributions.Normal`.
    kernel_posterior_fn: Python `callable` which creates
      `tf.distributions.Distribution` instance representing the surrogate
      posterior of the `kernel` parameter. Default value:
      `default_mean_field_normal_fn()`.
    kernel_posterior_tensor_fn: Python `callable` which takes a
      `tf.distributions.Distribution` instance and returns a representative
      value. Default value: `lambda d: d.sample()`.
    kernel_prior_fn: Python `callable` which creates `tf.distributions`
      instance. See `default_mean_field_normal_fn` docstring for required
      parameter signature.
      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
      distribution, prior distribution and random variate sample(s) from the
      surrogate posterior and computes or approximates the KL divergence. The
      distributions are `tf.distributions.Distribution`-like instances and the
      sample is a `Tensor`.
    bias_posterior_fn: Python `callable` which creates
      `tf.distributions.Distribution` instance representing the surrogate
      posterior of the `bias` parameter. Default value:
      `default_mean_field_normal_fn(is_singular=True)` (which creates an
      instance of `tf.distributions.Deterministic`).
    bias_posterior_tensor_fn: Python `callable` which takes a
      `tf.distributions.Distribution` instance and returns a representative
      value. Default value: `lambda d: d.sample()`.
    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
      See `default_mean_field_normal_fn` docstring for required parameter
      signature. Default value: `None` (no prior, no variational inference).
    bias_divergence_fn: Python `callable` which takes the surrogate posterior
      distribution, prior distribution and random variate sample(s) from the
      surrogate posterior and computes or approximates the KL divergence. The
      distributions are `tf.distributions.Distribution`-like instances and the
      sample is a `Tensor`.
    name: Python `str`, the name of the layer. Layers with the same name will
      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
      such cases.
    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
      layer by the same name.

  Returns:
    output: `Tensor` representing the affine-transformed input under a random
      draw from the surrogate posterior distribution.
  """
    layer = DenseVariational(
        units,
        activation=activation,
        activity_regularizer=activity_regularizer,
        trainable=trainable,
        kernel_use_local_reparameterization=(
            kernel_use_local_reparameterization),
        kernel_posterior_fn=kernel_posterior_fn,
        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
        kernel_prior_fn=kernel_prior_fn,
        kernel_divergence_fn=kernel_divergence_fn,
        bias_posterior_fn=bias_posterior_fn,
        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
        bias_prior_fn=bias_prior_fn,
        bias_divergence_fn=bias_divergence_fn,
        name=name,
        dtype=inputs.dtype.base_dtype,
        _scope=name,
        _reuse=reuse)
    return layer.apply(inputs)