Пример #1
0
 def test_all_finite_raises(self):
     """log_sum_exp must raise an op error when the input has Inf or NaN."""
     with self.test_session():
         cases = [
             (np.inf * tf.constant([-1.0, -2.0, -3.0, -4.0]), 'Inf'),
             (tf.constant([-1.0, np.nan, -3.0, -4.0]), 'NaN'),
         ]
         for bad_input, expected_message in cases:
             with self.assertRaisesOpError(expected_message):
                 log_sum_exp(bad_input).eval()
Пример #2
0
 def test_log_sum_exp_2d(self):
     """Check log_sum_exp on 2-D inputs, both overall and per axis."""
     with self.test_session():
         column = tf.constant([[-1.0], [-2.0], [-3.0], [-4.0]])
         self.assertAllClose(log_sum_exp(column).eval(), -0.5598103014388045)
         square = tf.constant([[-1.0, -2.0], [-3.0, -4.0]])
         self.assertAllClose(log_sum_exp(square).eval(), -0.5598103014388045)
         expected_axis0 = np.array([-0.87307198895702742, -1.8730719889570275])
         self.assertAllClose(log_sum_exp(square, 0).eval(), expected_axis0)
         expected_axis1 = np.array([-0.68673831248177708, -2.6867383124817774])
         self.assertAllClose(log_sum_exp(square, 1).eval(), expected_axis1)
Пример #3
0
 def test_log_sum_exp_2d(self):
     """log_sum_exp reduces a 2-D tensor over all entries or along one axis."""
     with self.test_session():
         total = -0.5598103014388045
         x_col = tf.constant([[-1.0], [-2.0], [-3.0], [-4.0]])
         self.assertAllClose(log_sum_exp(x_col).eval(), total)
         x_mat = tf.constant([[-1.0, -2.0], [-3.0, -4.0]])
         self.assertAllClose(log_sum_exp(x_mat).eval(), total)
         per_axis = {
             0: np.array([-0.87307198895702742, -1.8730719889570275]),
             1: np.array([-0.68673831248177708, -2.6867383124817774]),
         }
         for axis in (0, 1):
             self.assertAllClose(log_sum_exp(x_mat, axis).eval(),
                                 per_axis[axis])
Пример #4
0
    def build_loss(self):
        """
        Build the adaptive-importance-sampling loss.

        loss = E_{q(z; lambda)} [ w_norm(z; lambda) *
                                  ( log p(x, z) - log q(z; lambda) ) ]
        where w_norm(z; lambda) = w(z; lambda) / sum_z w(z; lambda)
        and   w(z; lambda)      = p(x, z) / q(z; lambda).

        The returned surrogate has gradient
        - E_{q(z; lambda)} [ w_norm(z; lambda) *
                             grad_{lambda} log q(z; lambda) ].
        """
        data_batch = self.data.sample(self.n_data)
        z, self.samples = self.variational.sample(data_batch, self.n_minibatch)

        # Accumulate log q(z; lambda) one factor at a time.
        log_q = tf.zeros([self.n_minibatch], dtype=tf.float32)
        for factor in range(self.variational.num_factors):
            log_q += self.variational.log_prob_zi(factor, z)

        # Unnormalized log importance weights: log p(x, z) - log q(z).
        log_weights = self.model.log_prob(data_batch, z) - log_q

        # Self-normalize the weights in log space for numerical stability.
        log_weights_norm = log_weights - log_sum_exp(log_weights)
        weights = tf.exp(log_weights_norm)

        self.loss = tf.reduce_mean(weights * log_weights)
        # tf.stop_gradient holds the weights constant so only log q's
        # parameters receive gradients.
        return -tf.reduce_mean(log_q * tf.stop_gradient(weights))
Пример #5
0
    def log_prob(self, xs, zs):
        """Returns a vector [log p(xs, zs[1,:]), ..., log p(xs, zs[S,:])].

        Log joint density of a mixture model: Dirichlet prior on the
        mixture weights, normal prior on the means, inverse-gamma prior
        on the variances, plus a mixture-of-Gaussians likelihood,
        evaluated once per posterior sample in `zs`.
        """
        x = xs['x']
        # NOTE(review): unpack_params is defined elsewhere; presumably it
        # splits each flat sample row into (pi, mus, sigmas) — confirm.
        pi, mus, sigmas = self.unpack_params(zs)
        log_prior = dirichlet.logpdf(pi, self.alpha)
        log_prior += tf.reduce_sum(norm.logpdf(mus, 0, np.sqrt(self.c)))
        log_prior += tf.reduce_sum(invgamma.logpdf(sigmas, self.a, self.b))

        # Loop over each sample zs[s, :].
        log_lik = []
        N = get_dims(x)[0]  # number of data points
        n_samples = get_dims(pi)[0]
        for s in range(n_samples):
            # log-likelihood is
            # sum_{n=1}^N log sum_{k=1}^K exp( log pi_k + log N(x_n; mu_k, sigma_k) )
            # Create a K x N matrix, whose entry (k, n) is
            # log pi_k + log N(x_n; mu_k, sigma_k).
            matrix = []
            for k in range(self.K):
                # Component k's parameters occupy the flat slice
                # [k*D, (k+1)*D) of mus and sigmas.
                matrix += [
                    tf.ones(N) * tf.log(pi[s, k]) + multivariate_normal.logpdf(
                        x, mus[s, (k * self.D):((k + 1) * self.D)],
                        sigmas[s, (k * self.D):((k + 1) * self.D)])
                ]

            matrix = tf.pack(matrix)
            # log_sum_exp() along the rows is a vector, whose nth
            # element is the log-likelihood of data point x_n.
            vector = log_sum_exp(matrix, 0)
            # Sum over data points to get the full log-likelihood.
            log_lik_z = tf.reduce_sum(vector)
            log_lik += [log_lik_z]

        return log_prior + tf.pack(log_lik)
Пример #6
0
    def log_prob(self, xs, zs):
        """Return a vector [log p(xs, zs[1,:]), ..., log p(xs, zs[S,:])]."""
        x = xs['x']
        pi, mus, sigmas = zs
        log_prior = dirichlet.logpdf(pi, self.alpha)
        log_prior += tf.reduce_sum(norm.logpdf(mus, 0, np.sqrt(self.c)), 1)
        log_prior += tf.reduce_sum(invgamma.logpdf(sigmas, self.a, self.b), 1)

        num_data = get_dims(x)[0]
        num_samples = get_dims(pi)[0]
        log_lik = []
        # One pass per posterior sample zs[s, :].
        for s in range(num_samples):
            # Build a K x N matrix whose (k, n) entry is
            # log pi_k + log N(x_n; mu_k, sigma_k); a log-sum-exp over k
            # then yields each data point's mixture log-likelihood.
            rows = [tf.ones(num_data) * tf.log(pi[s, k]) +
                    multivariate_normal.logpdf(
                        x,
                        mus[s, (k * self.D):((k + 1) * self.D)],
                        sigmas[s, (k * self.D):((k + 1) * self.D)])
                    for k in range(self.K)]
            per_point = log_sum_exp(tf.pack(rows), 0)
            # Sum over data points for this sample's total log-likelihood.
            log_lik.append(tf.reduce_sum(per_point))

        return log_prior + tf.pack(log_lik)
    def log_prob(self, xs, zs):
        """Return scalar, the log joint density log p(xs, zs)."""
        x = xs["x"]
        pi, mus, sigmas = zs["pi"], zs["mu"], zs["sigma"]

        # Prior: Dirichlet on weights, normal on means, inverse-gamma on
        # variances (summed in the same order as the original terms).
        log_prior = (dirichlet.logpdf(pi, self.alpha) +
                     tf.reduce_sum(norm.logpdf(mus, 0.0, self.c)) +
                     tf.reduce_sum(invgamma.logpdf(sigmas, self.a, self.b)))

        # Build a K x N matrix whose (k, n) entry is
        # log pi_k + log N(x_n; mu_k, sigma_k).
        num_data = get_dims(x)[0]
        component_rows = []
        for k in range(self.K):
            start, stop = k * self.D, (k + 1) * self.D
            component_rows.append(
                tf.ones(num_data) * tf.log(pi[k]) +
                multivariate_normal_diag.logpdf(
                    x, mus[start:stop], sigmas[start:stop]))

        # log-sum-exp over components gives per-point log-likelihoods;
        # summing over data points gives the full log-likelihood.
        log_lik = tf.reduce_sum(log_sum_exp(tf.pack(component_rows), 0))

        return log_prior + log_lik
Пример #8
0
    def build_loss(self):
        """
        Loss function to minimize, whose gradient is a stochastic
        gradient inspired by adaptive importance sampling.
        """
        # loss = E_{q(z; lambda)} [ w_norm(z; lambda) *
        #                           ( log p(x, z) - log q(z; lambda) ) ]
        # where
        # w_norm(z; lambda) = w(z; lambda) / sum_z( w(z; lambda) )
        # w(z; lambda) = p(x, z) / q(z; lambda)
        #
        # gradient = - E_{q(z; lambda)} [ w_norm(z; lambda) *
        #                                 grad_{lambda} log q(z; lambda) ]
        x = self.data.sample(self.n_data)
        # Variational parameters come from a mapping (e.g. an inference
        # network) applied to the sampled data batch.
        self.variational.set_params(self.variational.mapping(x))

        # NOTE(review): self.samples is read below but never assigned in
        # this method — presumably it is set elsewhere before build_loss
        # runs; confirm against the enclosing class.
        q_log_prob = tf.zeros([self.n_minibatch], dtype=tf.float32)
        for i in range(self.variational.num_factors):
            q_log_prob += self.variational.log_prob_zi(i, self.samples)

        # 1/B sum_{b=1}^B grad_log_q * w_norm
        # = 1/B sum_{b=1}^B grad_log_q * exp{ log(w_norm) }
        log_w = self.model.log_prob(x, self.samples) - q_log_prob

        # normalized log importance weights
        log_w_norm = log_w - log_sum_exp(log_w)
        w_norm = tf.exp(log_w_norm)

        # Keep the per-sample losses; the returned surrogate holds the
        # weights constant via tf.stop_gradient.
        self.losses = w_norm * log_w
        return -tf.reduce_mean(q_log_prob * tf.stop_gradient(w_norm))
Пример #9
0
  def log_prob(self, xs, zs):
    """Return scalar, the log joint density log p(xs, zs)."""
    x = xs['x']
    pi, mus, sigmas = zs['pi'], zs['mu'], zs['sigma']
    # Prior terms: Dirichlet(pi), Normal(mu), Inverse-Gamma(sigma).
    log_prior = dirichlet.logpdf(pi, self.alpha)
    log_prior += tf.reduce_sum(norm.logpdf(mus, 0.0, self.c))
    log_prior += tf.reduce_sum(invgamma.logpdf(sigmas, self.a, self.b))

    # Stack K rows; row k holds log pi_k + log N(x_n; mu_k, sigma_k)
    # for every data point n.
    num_points = get_dims(x)[0]
    rows = []
    for k in range(self.K):
      start, stop = k * self.D, (k + 1) * self.D
      rows.append(tf.ones(num_points) * tf.log(pi[k]) +
                  multivariate_normal_diag.logpdf(
                      x, mus[start:stop], sigmas[start:stop]))

    # log-sum-exp over components, then sum over data points.
    log_lik = tf.reduce_sum(log_sum_exp(tf.pack(rows), 0))

    return log_prior + log_lik
Пример #10
0
    def build_loss(self):
        """
        Loss function to minimize, whose gradient is a stochastic
        gradient inspired by adaptive importance sampling.

        loss = E_{p(z | x)} [ log p(z | x) - log q(z; lambda) ]

        is equivalent to minimizing

        E_{p(z | x)} [ log p(x, z) - log q(z; lambda) ]
        \approx 1/B sum_{b=1}^B
            w_norm(z^b; lambda) (log p(x, z^b) - log q(z^b; lambda))

        with gradient
        \approx - 1/B sum_{b=1}^B
            w_norm(z^b; lambda) grad_{lambda} log q(z^b; lambda)

        where + z^b ~ q(z^b; lambda)
              + w_norm(z^b; lambda) = w(z^b; lambda) / sum_{b=1}^B w(z^b; lambda)
              + w(z^b; lambda) = p(x, z^b) / q(z^b; lambda)
        """
        x = self.data.sample(self.n_data)
        z, self.samples = self.variational.sample(self.n_minibatch)

        # tf.stop_gradient(z): gradients flow only through log q's
        # parameters, not through the sampled values themselves.
        q_log_prob = tf.zeros([self.n_minibatch], dtype=tf.float32)
        for i in range(self.variational.num_factors):
            q_log_prob += self.variational.log_prob_i(i, tf.stop_gradient(z))

        # normalized importance weights
        log_w = self.model.log_prob(x, z) - q_log_prob
        log_w_norm = log_w - log_sum_exp(log_w)
        w_norm = tf.exp(log_w_norm)

        self.loss = tf.reduce_mean(w_norm * log_w)
        # Surrogate whose gradient matches the docstring expression;
        # w_norm is held constant via tf.stop_gradient.
        return -tf.reduce_mean(q_log_prob * tf.stop_gradient(w_norm))
Пример #11
0
    def test_accuracy(self):
        """log_sum_exp of a column vector matches a hand-derived value."""
        # InteractiveSession makes .eval() work without an explicit session.
        sess = tf.InteractiveSession()

        values = tf.constant([[-1.0], [-2.0], [-3.0], [-4.0]])
        expected = -0.5598103014388045
        self.assertAlmostEqual(log_sum_exp(values).eval(), expected)
Пример #12
0
    def test_accuracy(self):
        """Compare log_sum_exp against a value computed by hand."""
        # InteractiveSession installs itself as default so .eval() works.
        sess = tf.InteractiveSession()

        hand_derived_result = -0.5598103014388045
        computed = log_sum_exp(tf.constant([[-1.0], [-2.0], [-3.0], [-4.0]]))
        self.assertAlmostEqual(computed.eval(), hand_derived_result)
Пример #13
0
    def log_prob(self, xs, zs):
        """Return scalar, the log joint density log p(xs, zs).

        The likelihood factorizes over the n_minibatch data points, so
        their per-point log-likelihoods are summed; the log-prior over
        the latent variables (pi, mus, sigmas) is added once.
        """
        x = xs['x']
        pi, mus, sigmas = zs['pi'], zs['mu'], zs['sigma']
        # Shapes (per the original author's debug prints):
        # x is [n_minibatch, D]; pi is [K]; mus and sigmas are flat
        # [K * D] vectors with component k in slice [k*D, (k+1)*D).

        log_prior = dirichlet.logpdf(pi, self.alpha)
        log_prior += tf.reduce_sum(norm.logpdf(mus, 0.0, self.c))
        log_prior += tf.reduce_sum(invgamma.logpdf(sigmas, self.a, self.b))

        # log-likelihood is
        # sum_{n=1}^N log sum_{k=1}^K exp( log pi_k + log N(x_n; mu_k, sigma_k) )
        # Build a K x n_minibatch matrix whose entry (k, n) is
        # log pi_k + log N(x_n; mu_k, sigma_k), so the sum over
        # components can be done stably with log_sum_exp.
        n_minibatch = get_dims(x)[0]
        matrix = []
        for k in range(self.K):
            matrix += [
                tf.ones(n_minibatch) * tf.log(pi[k]) +
                multivariate_normal_diag.logpdf(
                    x, mus[(k * self.D):((k + 1) * self.D)],
                    sigmas[(k * self.D):((k + 1) * self.D)])
            ]

        matrix = tf.pack(matrix)
        # log_sum_exp() along the rows is a vector, whose nth
        # element is the log-likelihood of data point x_n.
        vector = log_sum_exp(matrix, 0)
        # Sum over data points to get the full log-likelihood.
        log_lik = tf.reduce_sum(vector)

        return log_prior + log_lik
Пример #14
0
    def build_loss(self):
        """Loss function to minimize.

        Defines a stochastic gradient of

        .. math::
            KL( p(z |x) || q(z) )
            =
            E_{p(z | x)} [ \log p(z | x) - \log q(z; \lambda) ]

        based on importance sampling.

        Computed as

        .. math::
            1/B \sum_{b=1}^B [ w_{norm}(z^b; \lambda) *
                                (\log p(x, z^b) - \log q(z^b; \lambda) ]

        where

        .. math::
            z^b \sim q(z^b; \lambda)

            w_{norm}(z^b; \lambda) = w(z^b; \lambda) / \sum_{b=1}^B ( w(z^b; \lambda) )

            w(z^b; \lambda) = p(x, z^b) / q(z^b; \lambda)

        which gives a gradient

        .. math::
            - 1/B \sum_{b=1}^B
            w_{norm}(z^b; \lambda) \partial_{\lambda} \log q(z^b; \lambda)

        """
        x = self.data.sample(self.n_data)
        self.zs = self.variational.sample(self.n_minibatch)
        z = self.zs

        # normalized importance weights
        # tf.stop_gradient(z) (previously a bare `stop_gradient`, now
        # module-qualified for consistency with the call below) keeps the
        # samples out of the differentiated path.
        q_log_prob = self.variational.log_prob(tf.stop_gradient(z))
        log_w = self.model.log_prob(x, z) - q_log_prob
        log_w_norm = log_w - log_sum_exp(log_w)
        w_norm = tf.exp(log_w_norm)

        self.loss = tf.reduce_mean(w_norm * log_w)
        # Surrogate objective; w_norm is treated as a constant.
        return -tf.reduce_mean(q_log_prob * tf.stop_gradient(w_norm))
Пример #15
0
    def build_loss(self):
        """Build loss function. Its automatic differentiation
        is a stochastic gradient of

        .. math::
            KL( p(z |x) || q(z) )
            =
            E_{p(z | x)} [ \log p(z | x) - \log q(z; \lambda) ]

        based on importance sampling.

        Computed as

        .. math::
            1/B \sum_{b=1}^B [ w_{norm}(z^b; \lambda) *
                                (\log p(x, z^b) - \log q(z^b; \lambda) ]

        where

        .. math::
            z^b \sim q(z^b; \lambda)

            w_{norm}(z^b; \lambda) = w(z^b; \lambda) / \sum_{b=1}^B ( w(z^b; \lambda) )

            w(z^b; \lambda) = p(x, z^b) / q(z^b; \lambda)

        which gives a gradient

        .. math::
            - 1/B \sum_{b=1}^B
            w_{norm}(z^b; \lambda) \partial_{\lambda} \log q(z^b; \lambda)

        """
        x = self.data
        z = self.variational.sample(self.n_samples)

        # normalized importance weights
        # NOTE(review): the unqualified `stop_gradient` used here is
        # presumably tf.stop_gradient imported at module level — confirm.
        q_log_prob = self.variational.log_prob(stop_gradient(z))
        log_w = self.model.log_prob(x, z) - q_log_prob
        # Self-normalize in log space for numerical stability.
        log_w_norm = log_w - log_sum_exp(log_w)
        w_norm = tf.exp(log_w_norm)

        self.loss = tf.reduce_mean(w_norm * log_w)
        # Surrogate whose gradient matches the docstring expression;
        # the weights are held constant via stop_gradient.
        return -tf.reduce_mean(q_log_prob * stop_gradient(w_norm))
Пример #16
0
    def build_loss_and_gradients(self, var_list):
        """Build loss function

    .. math::
      KL( p(z |x) || q(z) )
      =
      E_{p(z | x)} [ \log p(z | x) - \log q(z; \lambda) ]

    and stochastic gradients based on importance sampling.

    The loss function can be estimated as

    .. math::
      1/B \sum_{b=1}^B [ w_{norm}(z^b; \lambda) *
                         (\log p(x, z^b) - \log q(z^b; \lambda) ],

    where

    .. math::
      z^b \sim q(z^b; \lambda),

      w_{norm}(z^b; \lambda) = w(z^b; \lambda) / \sum_{b=1}^B (w(z^b; \lambda)),

      w(z^b; \lambda) = p(x, z^b) / q(z^b; \lambda).

    This provides a gradient,

    .. math::
      - 1/B \sum_{b=1}^B [ w_{norm}(z^b; \lambda) *
                           \partial_{\lambda} \log q(z^b; \lambda) ].
    """
        # Accumulate log p(x, z^s) and log q(z^s) per sample in Python
        # lists, then stack into tensors below.
        p_log_prob = [0.0] * self.n_samples
        q_log_prob = [0.0] * self.n_samples
        for s in range(self.n_samples):
            z_sample = {}
            for z, qz in six.iteritems(self.latent_vars):
                # Copy q(z) to obtain new set of posterior samples.
                qz_copy = copy(qz, scope='inference_' + str(s))
                z_sample[z] = qz_copy.value()
                # Score the sample under the ORIGINAL qz; stop_gradient
                # keeps the sample itself out of the differentiated path.
                q_log_prob[s] += tf.reduce_sum(
                    qz.log_prob(tf.stop_gradient(z_sample[z])))

            if self.model_wrapper is None:
                # Form dictionary in order to replace conditioning on prior or
                # observed variable with conditioning on a specific value.
                dict_swap = z_sample
                for x, qx in six.iteritems(self.data):
                    if isinstance(x, RandomVariable):
                        if isinstance(qx, RandomVariable):
                            qx_copy = copy(qx, scope='inference_' + str(s))
                            dict_swap[x] = qx_copy.value()
                        else:
                            dict_swap[x] = qx

                # Prior terms log p(z), with prior nodes rewired to the
                # posterior samples.
                for z in six.iterkeys(self.latent_vars):
                    z_copy = copy(z, dict_swap, scope='inference_' + str(s))
                    p_log_prob[s] += tf.reduce_sum(
                        z_copy.log_prob(dict_swap[z]))

                # Likelihood terms log p(x | z) under the same rewiring.
                for x in six.iterkeys(self.data):
                    if isinstance(x, RandomVariable):
                        x_copy = copy(x,
                                      dict_swap,
                                      scope='inference_' + str(s))
                        p_log_prob[s] += tf.reduce_sum(
                            x_copy.log_prob(dict_swap[x]))
            else:
                # Legacy model-wrapper path: delegate the joint density.
                x = self.data
                p_log_prob[s] = self.model_wrapper.log_prob(x, z_sample)

        p_log_prob = tf.pack(p_log_prob)
        q_log_prob = tf.pack(q_log_prob)

        # Self-normalized importance weights, computed in log space.
        log_w = p_log_prob - q_log_prob
        log_w_norm = log_w - log_sum_exp(log_w)
        w_norm = tf.exp(log_w_norm)

        loss = tf.reduce_mean(w_norm * log_w)
        # Differentiate the surrogate -E[log q * w_norm] (weights held
        # constant) rather than the loss itself.
        grads = tf.gradients(
            -tf.reduce_mean(q_log_prob * tf.stop_gradient(w_norm)),
            [v.ref() for v in var_list])
        grads_and_vars = list(zip(grads, var_list))
        return loss, grads_and_vars
Пример #17
0
  def build_loss_and_gradients(self, var_list):
    """Build loss function

    .. math::
      \\text{KL}( p(z \mid x) || q(z) )
      = \mathbb{E}_{p(z \mid x)} [ \log p(z \mid x) - \log q(z; \lambda) ]

    and stochastic gradients based on importance sampling.

    The loss function can be estimated as

    .. math::
      \\frac{1}{B} \sum_{b=1}^B [
        w_{norm}(z^b; \lambda) (\log p(x, z^b) - \log q(z^b; \lambda) ],

    where for :math:`z^b \sim q(z^b; \lambda)`,

    .. math::

      w_{norm}(z^b; \lambda) = w(z^b; \lambda) / \sum_{b=1}^B w(z^b; \lambda)

    normalizes the importance weights, :math:`w(z^b; \lambda) = p(x,
    z^b) / q(z^b; \lambda)`.

    This provides a gradient,

    .. math::
      - \\frac{1}{B} \sum_{b=1}^B [
        w_{norm}(z^b; \lambda) \\nabla_{\lambda} \log q(z^b; \lambda) ].
    """
    p_log_prob = [0.0] * self.n_samples
    q_log_prob = [0.0] * self.n_samples
    for s in range(self.n_samples):
      # Per-(instance, sample) scope keeps the copied graph nodes unique.
      scope = 'inference_' + str(id(self)) + '/' + str(s)
      z_sample = {}
      for z, qz in six.iteritems(self.latent_vars):
        # Copy q(z) to obtain new set of posterior samples.
        qz_copy = copy(qz, scope=scope)
        z_sample[z] = qz_copy.value()
        # Score under the original qz; the sample is stop-gradiented so
        # only q's parameters appear in the differentiated path.
        q_log_prob[s] += tf.reduce_sum(
            qz.log_prob(tf.stop_gradient(z_sample[z])))

      if self.model_wrapper is None:
        # Form dictionary in order to replace conditioning on prior or
        # observed variable with conditioning on a specific value.
        dict_swap = z_sample
        for x, qx in six.iteritems(self.data):
          if isinstance(x, RandomVariable):
            if isinstance(qx, RandomVariable):
              qx_copy = copy(qx, scope=scope)
              dict_swap[x] = qx_copy.value()
            else:
              dict_swap[x] = qx

        # Prior terms log p(z) evaluated at the posterior samples.
        for z in six.iterkeys(self.latent_vars):
          z_copy = copy(z, dict_swap, scope=scope)
          p_log_prob[s] += tf.reduce_sum(z_copy.log_prob(dict_swap[z]))

        # Likelihood terms log p(x | z) under the same rewiring.
        for x in six.iterkeys(self.data):
          if isinstance(x, RandomVariable):
            x_copy = copy(x, dict_swap, scope=scope)
            p_log_prob[s] += tf.reduce_sum(x_copy.log_prob(dict_swap[x]))
      else:
        # Legacy model-wrapper path: delegate the joint density.
        x = self.data
        p_log_prob[s] = self.model_wrapper.log_prob(x, z_sample)

    p_log_prob = tf.pack(p_log_prob)
    q_log_prob = tf.pack(q_log_prob)

    # Self-normalized importance weights, computed in log space.
    log_w = p_log_prob - q_log_prob
    log_w_norm = log_w - log_sum_exp(log_w)
    w_norm = tf.exp(log_w_norm)

    # Default to all trainable variables when none were specified.
    if var_list is None:
      var_list = tf.trainable_variables()

    loss = tf.reduce_mean(w_norm * log_w)
    # Differentiate the surrogate -E[log q * w_norm] (weights held
    # constant) rather than the loss itself.
    grads = tf.gradients(
        -tf.reduce_mean(q_log_prob * tf.stop_gradient(w_norm)),
        [v.ref() for v in var_list])
    grads_and_vars = list(zip(grads, var_list))
    return loss, grads_and_vars
Пример #18
0
 def test_log_sum_exp_2d(self):
     """log_sum_exp over a 2-D column vector matches the known value."""
     with self.test_session():
         values = tf.constant([[-1.0], [-2.0], [-3.0], [-4.0]])
         self.assertAllClose(log_sum_exp(values).eval(),
                             -0.5598103014388045)
Пример #19
0
    def build_loss(self):
        """Build loss function. Its automatic differentiation
    is a stochastic gradient of

    .. math::
      KL( p(z |x) || q(z) )
      =
      E_{p(z | x)} [ \log p(z | x) - \log q(z; \lambda) ]

    based on importance sampling.

    Computed as

    .. math::
      1/B \sum_{b=1}^B [ w_{norm}(z^b; \lambda) *
                (\log p(x, z^b) - \log q(z^b; \lambda) ]

    where

    .. math::
      z^b \sim q(z^b; \lambda)

      w_{norm}(z^b; \lambda) = w(z^b; \lambda) / \sum_{b=1}^B (w(z^b; \lambda))

      w(z^b; \lambda) = p(x, z^b) / q(z^b; \lambda)

    which gives a gradient

    .. math::
      - 1/B \sum_{b=1}^B
      w_{norm}(z^b; \lambda) \partial_{\lambda} \log q(z^b; \lambda)

    """
        # Accumulate log p(x, z^s) and log q(z^s) per sample in Python
        # lists, then stack into tensors below.
        p_log_prob = [0.0] * self.n_samples
        q_log_prob = [0.0] * self.n_samples
        for s in range(self.n_samples):
            z_sample = {}
            for z, qz in six.iteritems(self.latent_vars):
                # Copy q(z) to obtain new set of posterior samples.
                qz_copy = copy(qz, scope='inference_' + str(s))
                z_sample[z] = qz_copy.value()
                # Score under the original qz; the sample is
                # stop-gradiented so only q's parameters appear in the
                # differentiated path.
                q_log_prob[s] += tf.reduce_sum(
                    qz.log_prob(tf.stop_gradient(z_sample[z])))

            if self.model_wrapper is None:
                for z in six.iterkeys(self.latent_vars):
                    # Copy p(z), replacing any conditioning on prior with
                    # conditioning on posterior sample.
                    z_copy = copy(z,
                                  dict_swap=z_sample,
                                  scope='inference_' + str(s))
                    p_log_prob[s] += tf.reduce_sum(z_copy.log_prob(
                        z_sample[z]))

                for x, obs in six.iteritems(self.data):
                    if isinstance(x, RandomVariable):
                        # Copy p(x | z), replacing any conditioning on prior with
                        # conditioning on posterior sample.
                        x_copy = copy(x,
                                      dict_swap=z_sample,
                                      scope='inference_' + str(s))
                        p_log_prob[s] += tf.reduce_sum(x_copy.log_prob(obs))
            else:
                # Legacy model-wrapper path: delegate the joint density.
                x = self.data
                p_log_prob[s] = self.model_wrapper.log_prob(x, z_sample)

        p_log_prob = tf.pack(p_log_prob)
        q_log_prob = tf.pack(q_log_prob)

        # Self-normalized importance weights, computed in log space.
        log_w = p_log_prob - q_log_prob
        log_w_norm = log_w - log_sum_exp(log_w)
        w_norm = tf.exp(log_w_norm)

        self.loss = tf.reduce_mean(w_norm * log_w)
        # Surrogate whose gradient matches the docstring expression;
        # the weights are held constant via tf.stop_gradient.
        return -tf.reduce_mean(q_log_prob * tf.stop_gradient(w_norm))
Пример #20
0
def test_2d():
    """log_sum_exp of a 4x1 constant equals the hand-computed value."""
    expected = -0.5598103014388045
    actual = log_sum_exp(tf.constant([[-1.0], [-2.0], [-3.0], [-4.0]]))
    assert np.allclose(actual.eval(), expected)