def test_all_finite_raises(self):
    """log_sum_exp rejects non-finite inputs with an op error."""
    with self.test_session():
        inf_input = np.inf * tf.constant([-1.0, -2.0, -3.0, -4.0])
        with self.assertRaisesOpError('Inf'):
            log_sum_exp(inf_input).eval()

        nan_input = tf.constant([-1.0, np.nan, -3.0, -4.0])
        with self.assertRaisesOpError('NaN'):
            log_sum_exp(nan_input).eval()
def test_log_sum_exp_2d(self):
    """log_sum_exp over 2-D tensors: full reduction and per-axis."""
    with self.test_session():
        column = tf.constant([[-1.0], [-2.0], [-3.0], [-4.0]])
        self.assertAllClose(log_sum_exp(column).eval(),
                            -0.5598103014388045)

        square = tf.constant([[-1.0, -2.0], [-3.0, -4.0]])
        # Same elements, so the full reduction gives the same value.
        self.assertAllClose(log_sum_exp(square).eval(),
                            -0.5598103014388045)
        self.assertAllClose(
            log_sum_exp(square, 0).eval(),
            np.array([-0.87307198895702742, -1.8730719889570275]))
        self.assertAllClose(
            log_sum_exp(square, 1).eval(),
            np.array([-0.68673831248177708, -2.6867383124817774]))
def test_log_sum_exp_2d(self):
    """Check log_sum_exp on 2-D inputs, reduced fully and per axis."""
    with self.test_session():
        total = -0.5598103014388045
        x = tf.constant([[-1.0], [-2.0], [-3.0], [-4.0]])
        self.assertAllClose(log_sum_exp(x).eval(), total)

        x = tf.constant([[-1.0, -2.0], [-3.0, -4.0]])
        self.assertAllClose(log_sum_exp(x).eval(), total)
        expected_cols = np.array([-0.87307198895702742, -1.8730719889570275])
        expected_rows = np.array([-0.68673831248177708, -2.6867383124817774])
        self.assertAllClose(log_sum_exp(x, 0).eval(), expected_cols)
        self.assertAllClose(log_sum_exp(x, 1).eval(), expected_rows)
def build_loss(self): """ Loss function to minimize, whose gradient is a stochastic gradient inspired by adaptive importance sampling. """ # loss = E_{q(z; lambda)} [ w_norm(z; lambda) * # ( log p(x, z) - log q(z; lambda) ) ] # where # w_norm(z; lambda) = w(z; lambda) / sum_z( w(z; lambda) ) # w(z; lambda) = p(x, z) / q(z; lambda) # # gradient = - E_{q(z; lambda)} [ w_norm(z; lambda) * # grad_{lambda} log q(z; lambda) ] x = self.data.sample(self.n_data) z, self.samples = self.variational.sample(x, self.n_minibatch) q_log_prob = tf.zeros([self.n_minibatch], dtype=tf.float32) for i in range(self.variational.num_factors): q_log_prob += self.variational.log_prob_zi(i, z) # 1/B sum_{b=1}^B grad_log_q * w_norm # = 1/B sum_{b=1}^B grad_log_q * exp{ log(w_norm) } log_w = self.model.log_prob(x, z) - q_log_prob # normalized log importance weights log_w_norm = log_w - log_sum_exp(log_w) w_norm = tf.exp(log_w_norm) self.loss = tf.reduce_mean(w_norm * log_w) return -tf.reduce_mean(q_log_prob * tf.stop_gradient(w_norm))
def log_prob(self, xs, zs):
    """Returns a vector [log p(xs, zs[1,:]), ..., log p(xs, zs[S,:])]."""
    x = xs['x']
    # pi: mixing weights; mus/sigmas: flattened per-component parameters,
    # indexed per sample s (see the slices of length self.D below).
    pi, mus, sigmas = self.unpack_params(zs)
    # Log prior: Dirichlet(pi; alpha), Normal(mu; 0, sqrt(c)),
    # inverse-Gamma(sigma; a, b), summed over all components.
    log_prior = dirichlet.logpdf(pi, self.alpha)
    log_prior += tf.reduce_sum(norm.logpdf(mus, 0, np.sqrt(self.c)))
    log_prior += tf.reduce_sum(invgamma.logpdf(sigmas, self.a, self.b))

    # Loop over each sample zs[s, :].
    log_lik = []
    N = get_dims(x)[0]
    n_samples = get_dims(pi)[0]
    for s in range(n_samples):
        # log-likelihood is
        # sum_{n=1}^N log sum_{k=1}^K exp( log pi_k + log N(x_n; mu_k, sigma_k) )
        # Create a K x N matrix, whose entry (k, n) is
        # log pi_k + log N(x_n; mu_k, sigma_k).
        matrix = []
        for k in range(self.K):
            matrix += [tf.ones(N) * tf.log(pi[s, k]) +
                       multivariate_normal.logpdf(
                           x,
                           mus[s, (k * self.D):((k + 1) * self.D)],
                           sigmas[s, (k * self.D):((k + 1) * self.D)])]

        matrix = tf.pack(matrix)
        # log_sum_exp() along the rows is a vector, whose nth
        # element is the log-likelihood of data point x_n.
        vector = log_sum_exp(matrix, 0)
        # Sum over data points to get the full log-likelihood.
        log_lik_z = tf.reduce_sum(vector)
        log_lik += [log_lik_z]

    # One log joint density per posterior sample.
    return log_prior + tf.pack(log_lik)
def log_prob(self, xs, zs):
    """Return a vector [log p(xs, zs[1,:]), ..., log p(xs, zs[S,:])]."""
    x = xs['x']
    # zs unpacks to per-sample parameters; mus/sigmas are reduced along
    # axis 1 below, so each row holds one sample's flattened parameters.
    pi, mus, sigmas = zs
    # Log prior, computed per sample (hence the axis-1 reductions).
    log_prior = dirichlet.logpdf(pi, self.alpha)
    log_prior += tf.reduce_sum(norm.logpdf(mus, 0, np.sqrt(self.c)), 1)
    log_prior += tf.reduce_sum(invgamma.logpdf(sigmas, self.a, self.b), 1)

    # Loop over each sample zs[s, :].
    log_lik = []
    N = get_dims(x)[0]
    n_samples = get_dims(pi)[0]
    for s in range(n_samples):
        # log-likelihood is
        # sum_{n=1}^N log sum_{k=1}^K exp( log pi_k + log N(x_n; mu_k, sigma_k) )
        # Create a K x N matrix, whose entry (k, n) is
        # log pi_k + log N(x_n; mu_k, sigma_k).
        matrix = []
        for k in range(self.K):
            matrix += [tf.ones(N)*tf.log(pi[s, k]) +
                       multivariate_normal.logpdf(x,
                           mus[s, (k*self.D):((k+1)*self.D)],
                           sigmas[s, (k*self.D):((k+1)*self.D)])]

        matrix = tf.pack(matrix)
        # log_sum_exp() along the rows is a vector, whose nth
        # element is the log-likelihood of data point x_n.
        vector = log_sum_exp(matrix, 0)
        # Sum over data points to get the full log-likelihood.
        log_lik_z = tf.reduce_sum(vector)
        log_lik += [log_lik_z]

    # One log joint density per posterior sample.
    return log_prior + tf.pack(log_lik)
def log_prob(self, xs, zs):
    """Return scalar, the log joint density log p(xs, zs)."""
    data = xs["x"]
    pi, mus, sigmas = zs["pi"], zs["mu"], zs["sigma"]

    # Log prior: Dirichlet on the mixing weights, Normal on the
    # component means, inverse-Gamma on the component scales.
    log_prior = dirichlet.logpdf(pi, self.alpha)
    log_prior += tf.reduce_sum(norm.logpdf(mus, 0.0, self.c))
    log_prior += tf.reduce_sum(invgamma.logpdf(sigmas, self.a, self.b))

    # Log-likelihood:
    # sum_{n=1}^N log sum_{k=1}^K exp(log pi_k + log N(x_n; mu_k, sigma_k)).
    # Build one row per component; row k holds
    # log pi_k + log N(x_n; mu_k, sigma_k) for every data point n.
    n_data = get_dims(data)[0]
    rows = [tf.ones(n_data) * tf.log(pi[k]) +
            multivariate_normal_diag.logpdf(
                data,
                mus[(k * self.D):((k + 1) * self.D)],
                sigmas[(k * self.D):((k + 1) * self.D)])
            for k in range(self.K)]

    # log_sum_exp over components gives each point's log-likelihood;
    # summing those gives the full data log-likelihood.
    log_lik = tf.reduce_sum(log_sum_exp(tf.pack(rows), 0))
    return log_prior + log_lik
def build_loss(self): """ Loss function to minimize, whose gradient is a stochastic gradient inspired by adaptive importance sampling. """ # loss = E_{q(z; lambda)} [ w_norm(z; lambda) * # ( log p(x, z) - log q(z; lambda) ) ] # where # w_norm(z; lambda) = w(z; lambda) / sum_z( w(z; lambda) ) # w(z; lambda) = p(x, z) / q(z; lambda) # # gradient = - E_{q(z; lambda)} [ w_norm(z; lambda) * # grad_{lambda} log q(z; lambda) ] x = self.data.sample(self.n_data) self.variational.set_params(self.variational.mapping(x)) q_log_prob = tf.zeros([self.n_minibatch], dtype=tf.float32) for i in range(self.variational.num_factors): q_log_prob += self.variational.log_prob_zi(i, self.samples) # 1/B sum_{b=1}^B grad_log_q * w_norm # = 1/B sum_{b=1}^B grad_log_q * exp{ log(w_norm) } log_w = self.model.log_prob(x, self.samples) - q_log_prob # normalized log importance weights log_w_norm = log_w - log_sum_exp(log_w) w_norm = tf.exp(log_w_norm) self.losses = w_norm * log_w return -tf.reduce_mean(q_log_prob * tf.stop_gradient(w_norm))
def log_prob(self, xs, zs):
    """Return scalar, the log joint density log p(xs, zs)."""
    x = xs['x']
    pi, mus, sigmas = zs['pi'], zs['mu'], zs['sigma']

    # Prior: Dirichlet(pi; alpha) * prod_k N(mu_k; 0, c) * IG(sigma_k; a, b).
    log_joint = dirichlet.logpdf(pi, self.alpha)
    log_joint += tf.reduce_sum(norm.logpdf(mus, 0.0, self.c))
    log_joint += tf.reduce_sum(invgamma.logpdf(sigmas, self.a, self.b))

    # Likelihood: sum_n log sum_k exp(log pi_k + log N(x_n; mu_k, sigma_k)).
    # Stack one row per mixture component; entry (k, n) is the joint
    # log-density of component k and data point x_n.
    num_points = get_dims(x)[0]
    component_rows = []
    for k in range(self.K):
        lo, hi = k * self.D, (k + 1) * self.D
        component_rows.append(tf.ones(num_points) * tf.log(pi[k]) +
                              multivariate_normal_diag.logpdf(
                                  x, mus[lo:hi], sigmas[lo:hi]))

    # log_sum_exp over components yields each point's log-likelihood;
    # summing over points gives the full data log-likelihood.
    per_point = log_sum_exp(tf.pack(component_rows), 0)
    return log_joint + tf.reduce_sum(per_point)
def build_loss(self):
    """
    Loss function to minimize, whose gradient is a stochastic
    gradient inspired by adaptive importance sampling.

    loss = E_{p(z | x)} [ log p(z | x) - log q(z; lambda) ]

    is equivalent to minimizing

    E_{p(z | x)} [ log p(x, z) - log q(z; lambda) ]
    \approx 1/B sum_{b=1}^B
        w_norm(z^b; lambda) (log p(x, z^b) - log q(z^b; lambda))

    with gradient
    \approx - 1/B sum_{b=1}^B
        w_norm(z^b; lambda) grad_{lambda} log q(z^b; lambda)

    where
    + z^b ~ q(z^b; lambda)
    + w_norm(z^b; lambda) = w(z^b; lambda) / sum_{b=1}^B w(z^b; lambda)
    + w(z^b; lambda) = p(x, z^b) / q(z^b; lambda)
    """
    x = self.data.sample(self.n_data)
    # Draw B = n_minibatch samples z^b ~ q(z; lambda).
    z, self.samples = self.variational.sample(self.n_minibatch)
    q_log_prob = tf.zeros([self.n_minibatch], dtype=tf.float32)
    for i in range(self.variational.num_factors):
        # stop_gradient: samples are treated as constants, so gradients
        # flow only through the parameters of log q.
        q_log_prob += self.variational.log_prob_i(i, tf.stop_gradient(z))

    # normalized importance weights
    log_w = self.model.log_prob(x, z) - q_log_prob
    log_w_norm = log_w - log_sum_exp(log_w)
    w_norm = tf.exp(log_w_norm)
    # Importance-weighted objective, kept on the instance for reporting.
    self.loss = tf.reduce_mean(w_norm * log_w)
    # stop_gradient holds w_norm constant, so differentiating the return
    # value yields the gradient documented above.
    return -tf.reduce_mean(q_log_prob * tf.stop_gradient(w_norm))
def test_accuracy(self):
    """log_sum_exp of a 4x1 column matches the hand-derived value."""
    # Fix: the original created tf.InteractiveSession(), which installs
    # itself as the global default session and was never closed — a
    # resource leak that can bleed into other tests. A context-managed
    # Session is installed as default inside the block and closed on exit.
    with tf.Session():
        x = tf.constant([[-1.0], [-2.0], [-3.0], [-4.0]])
        result = log_sum_exp(x)
        hand_derived_result = -0.5598103014388045
        self.assertAlmostEqual(result.eval(), hand_derived_result)
def log_prob(self, xs, zs):
    """Return scalar, the log joint density log p(xs, zs).

    Shapes (per the original author's notes; verify against callers):
      x:      [n_minibatch, D]
      pi:     [K]
      mus:    [K * D]
      sigmas: [K * D]

    The log joint factorizes over data points, so the likelihood is
    summed over the minibatch; the latent variables are evaluated for
    a single sample at a time.
    """
    x = xs['x']
    pi, mus, sigmas = zs['pi'], zs['mu'], zs['sigma']

    # Log prior: Dirichlet on the mixing weights, Normal on the means,
    # inverse-Gamma on the scales.
    log_prior = dirichlet.logpdf(pi, self.alpha)
    log_prior += tf.reduce_sum(norm.logpdf(mus, 0.0, self.c))
    log_prior += tf.reduce_sum(invgamma.logpdf(sigmas, self.a, self.b))

    # log-likelihood is
    # sum_{n=1}^N log sum_{k=1}^K exp( log pi_k + log N(x_n; mu_k, sigma_k) )
    # Create a K x N matrix, whose entry (k, n) is
    # log pi_k + log N(x_n; mu_k, sigma_k). Collecting the per-component
    # terms lets log_sum_exp() subtract the row-wise max for stability.
    n_minibatch = get_dims(x)[0]
    matrix = []
    for k in range(self.K):
        matrix += [tf.ones(n_minibatch) * tf.log(pi[k]) +
                   multivariate_normal_diag.logpdf(
                       x,
                       mus[(k * self.D):((k + 1) * self.D)],
                       sigmas[(k * self.D):((k + 1) * self.D)])]

    matrix = tf.pack(matrix)
    # log_sum_exp() along the rows is a vector, whose nth element is the
    # log-likelihood of data point x_n.
    vector = log_sum_exp(matrix, 0)
    # Sum over data points to get the full log-likelihood.
    log_lik = tf.reduce_sum(vector)
    return log_prior + log_lik
def build_loss(self):
    """Loss function to minimize.

    Defines a stochastic gradient of

    .. math::
        KL( p(z |x) || q(z) )
        = E_{p(z | x)} [ \log p(z | x) - \log q(z; \lambda) ]

    based on importance sampling.

    Computed as

    .. math::
        1/B \sum_{b=1}^B [ w_{norm}(z^b; \lambda) *
                           (\log p(x, z^b) - \log q(z^b; \lambda) ]

    where

    .. math::
        z^b \sim q(z^b; \lambda)
        w_{norm}(z^b; \lambda) = w(z^b; \lambda) / \sum_{b=1}^B ( w(z^b; \lambda) )
        w(z^b; \lambda) = p(x, z^b) / q(z^b; \lambda)

    which gives a gradient

    .. math::
        - 1/B \sum_{b=1}^B
          w_{norm}(z^b; \lambda) \partial_{\lambda} \log q(z^b; \lambda)
    """
    x = self.data.sample(self.n_data)
    # Draw B = n_minibatch posterior samples and keep them on the instance.
    self.zs = self.variational.sample(self.n_minibatch)
    z = self.zs
    # normalized importance weights
    # stop_gradient: samples are constants w.r.t. lambda, so gradients
    # flow only through the parameters of log q.
    q_log_prob = self.variational.log_prob(stop_gradient(z))
    log_w = self.model.log_prob(x, z) - q_log_prob
    log_w_norm = log_w - log_sum_exp(log_w)
    w_norm = tf.exp(log_w_norm)
    # Importance-weighted objective, kept for reporting.
    self.loss = tf.reduce_mean(w_norm * log_w)
    # stop_gradient holds w_norm constant so the gradient of the return
    # value matches the formula documented above.
    return -tf.reduce_mean(q_log_prob * tf.stop_gradient(w_norm))
def build_loss(self):
    """Build loss function. Its automatic differentiation is a
    stochastic gradient of

    .. math::
        KL( p(z |x) || q(z) )
        = E_{p(z | x)} [ \log p(z | x) - \log q(z; \lambda) ]

    based on importance sampling.

    Computed as

    .. math::
        1/B \sum_{b=1}^B [ w_{norm}(z^b; \lambda) *
                           (\log p(x, z^b) - \log q(z^b; \lambda) ]

    where

    .. math::
        z^b \sim q(z^b; \lambda)
        w_{norm}(z^b; \lambda) = w(z^b; \lambda) / \sum_{b=1}^B ( w(z^b; \lambda) )
        w(z^b; \lambda) = p(x, z^b) / q(z^b; \lambda)

    which gives a gradient

    .. math::
        - 1/B \sum_{b=1}^B
          w_{norm}(z^b; \lambda) \partial_{\lambda} \log q(z^b; \lambda)
    """
    # Here the data is used directly rather than subsampled.
    x = self.data
    z = self.variational.sample(self.n_samples)
    # normalized importance weights
    # stop_gradient: samples are constants w.r.t. lambda, so gradients
    # flow only through the parameters of log q.
    q_log_prob = self.variational.log_prob(stop_gradient(z))
    log_w = self.model.log_prob(x, z) - q_log_prob
    log_w_norm = log_w - log_sum_exp(log_w)
    w_norm = tf.exp(log_w_norm)
    # Importance-weighted objective, kept for reporting.
    self.loss = tf.reduce_mean(w_norm * log_w)
    # w_norm is held constant so the gradient of the return value
    # matches the formula documented above.
    return -tf.reduce_mean(q_log_prob * stop_gradient(w_norm))
def build_loss_and_gradients(self, var_list):
    """Build loss function

    .. math::
        KL( p(z |x) || q(z) )
        = E_{p(z | x)} [ \log p(z | x) - \log q(z; \lambda) ]

    and stochastic gradients based on importance sampling.

    The loss function can be estimated as

    .. math::
        1/B \sum_{b=1}^B [ w_{norm}(z^b; \lambda) *
                           (\log p(x, z^b) - \log q(z^b; \lambda) ],

    where

    .. math::
        z^b \sim q(z^b; \lambda),
        w_{norm}(z^b; \lambda) = w(z^b; \lambda) / \sum_{b=1}^B (w(z^b; \lambda)),
        w(z^b; \lambda) = p(x, z^b) / q(z^b; \lambda).

    This provides a gradient,

    .. math::
        - 1/B \sum_{b=1}^B [ w_{norm}(z^b; \lambda) *
                             \partial_{\lambda} \log q(z^b; \lambda) ].
    """
    # One log p and log q accumulator per posterior sample.
    p_log_prob = [0.0] * self.n_samples
    q_log_prob = [0.0] * self.n_samples
    for s in range(self.n_samples):
        z_sample = {}
        for z, qz in six.iteritems(self.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            # NOTE(review): scope 'inference_<s>' is not unique per
            # inference object — the later variant of this method uses
            # id(self) in the scope; verify no collision here.
            qz_copy = copy(qz, scope='inference_' + str(s))
            z_sample[z] = qz_copy.value()
            # stop_gradient: samples are constants, so gradients flow
            # only through the parameters of log q.
            q_log_prob[s] += tf.reduce_sum(
                qz.log_prob(tf.stop_gradient(z_sample[z])))

        if self.model_wrapper is None:
            # Form dictionary in order to replace conditioning on prior or
            # observed variable with conditioning on a specific value.
            dict_swap = z_sample
            for x, qx in six.iteritems(self.data):
                if isinstance(x, RandomVariable):
                    if isinstance(qx, RandomVariable):
                        qx_copy = copy(qx, scope='inference_' + str(s))
                        dict_swap[x] = qx_copy.value()
                    else:
                        dict_swap[x] = qx

            # Accumulate log p(z) under the swapped conditioning.
            for z in six.iterkeys(self.latent_vars):
                z_copy = copy(z, dict_swap, scope='inference_' + str(s))
                p_log_prob[s] += tf.reduce_sum(
                    z_copy.log_prob(dict_swap[z]))

            # Accumulate log p(x | z) for observed random variables.
            for x in six.iterkeys(self.data):
                if isinstance(x, RandomVariable):
                    x_copy = copy(x, dict_swap, scope='inference_' + str(s))
                    p_log_prob[s] += tf.reduce_sum(
                        x_copy.log_prob(dict_swap[x]))
        else:
            # Legacy model-wrapper path: delegate the full log joint.
            x = self.data
            p_log_prob[s] = self.model_wrapper.log_prob(x, z_sample)

    p_log_prob = tf.pack(p_log_prob)
    q_log_prob = tf.pack(q_log_prob)
    # Self-normalized importance weights, computed in log space.
    log_w = p_log_prob - q_log_prob
    log_w_norm = log_w - log_sum_exp(log_w)
    w_norm = tf.exp(log_w_norm)
    loss = tf.reduce_mean(w_norm * log_w)
    # NOTE(review): unlike the later variant, var_list is not defaulted
    # to tf.trainable_variables() when None — confirm callers pass it.
    grads = tf.gradients(
        -tf.reduce_mean(q_log_prob * tf.stop_gradient(w_norm)),
        [v.ref() for v in var_list])
    grads_and_vars = list(zip(grads, var_list))
    return loss, grads_and_vars
def build_loss_and_gradients(self, var_list):
    """Build loss function

    .. math::
        \\text{KL}( p(z \mid x) || q(z) )
        = \mathbb{E}_{p(z \mid x)} [ \log p(z \mid x) - \log q(z; \lambda) ]

    and stochastic gradients based on importance sampling.

    The loss function can be estimated as

    .. math::
        \\frac{1}{B} \sum_{b=1}^B [ w_{norm}(z^b; \lambda)
                                    (\log p(x, z^b) - \log q(z^b; \lambda) ],

    where for :math:`z^b \sim q(z^b; \lambda)`,

    .. math::
        w_{norm}(z^b; \lambda) = w(z^b; \lambda) / \sum_{b=1}^B w(z^b; \lambda)

    normalizes the importance weights, :math:`w(z^b; \lambda) = p(x,
    z^b) / q(z^b; \lambda)`.

    This provides a gradient,

    .. math::
        - \\frac{1}{B} \sum_{b=1}^B [ w_{norm}(z^b; \lambda)
                                      \\nabla_{\lambda} \log q(z^b; \lambda) ].
    """
    # One log p and log q accumulator per posterior sample.
    p_log_prob = [0.0] * self.n_samples
    q_log_prob = [0.0] * self.n_samples
    for s in range(self.n_samples):
        # id(self) makes the variable scope unique to this inference
        # object as well as to the sample index.
        scope = 'inference_' + str(id(self)) + '/' + str(s)
        z_sample = {}
        for z, qz in six.iteritems(self.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            qz_copy = copy(qz, scope=scope)
            z_sample[z] = qz_copy.value()
            # stop_gradient: samples are constants, so gradients flow
            # only through the parameters of log q.
            q_log_prob[s] += tf.reduce_sum(
                qz.log_prob(tf.stop_gradient(z_sample[z])))

        if self.model_wrapper is None:
            # Form dictionary in order to replace conditioning on prior or
            # observed variable with conditioning on a specific value.
            dict_swap = z_sample
            for x, qx in six.iteritems(self.data):
                if isinstance(x, RandomVariable):
                    if isinstance(qx, RandomVariable):
                        qx_copy = copy(qx, scope=scope)
                        dict_swap[x] = qx_copy.value()
                    else:
                        dict_swap[x] = qx

            # Accumulate log p(z) under the swapped conditioning.
            for z in six.iterkeys(self.latent_vars):
                z_copy = copy(z, dict_swap, scope=scope)
                p_log_prob[s] += tf.reduce_sum(z_copy.log_prob(dict_swap[z]))

            # Accumulate log p(x | z) for observed random variables.
            for x in six.iterkeys(self.data):
                if isinstance(x, RandomVariable):
                    x_copy = copy(x, dict_swap, scope=scope)
                    p_log_prob[s] += tf.reduce_sum(x_copy.log_prob(dict_swap[x]))
        else:
            # Legacy model-wrapper path: delegate the full log joint.
            x = self.data
            p_log_prob[s] = self.model_wrapper.log_prob(x, z_sample)

    p_log_prob = tf.pack(p_log_prob)
    q_log_prob = tf.pack(q_log_prob)
    # Self-normalized importance weights, computed in log space.
    log_w = p_log_prob - q_log_prob
    log_w_norm = log_w - log_sum_exp(log_w)
    w_norm = tf.exp(log_w_norm)
    # Default to all trainable variables when none are specified.
    if var_list is None:
        var_list = tf.trainable_variables()

    loss = tf.reduce_mean(w_norm * log_w)
    # stop_gradient holds w_norm constant, yielding the score-style
    # gradient documented above.
    grads = tf.gradients(
        -tf.reduce_mean(q_log_prob * tf.stop_gradient(w_norm)),
        [v.ref() for v in var_list])
    grads_and_vars = list(zip(grads, var_list))
    return loss, grads_and_vars
def test_log_sum_exp_2d(self):
    """A 4x1 column reduces to the same value as the flat vector."""
    with self.test_session():
        column = tf.constant([[-1.0], [-2.0], [-3.0], [-4.0]])
        self.assertAllClose(log_sum_exp(column).eval(),
                            -0.5598103014388045)
def build_loss(self):
    """Build loss function. Its automatic differentiation is a
    stochastic gradient of

    .. math::
        KL( p(z |x) || q(z) )
        = E_{p(z | x)} [ \log p(z | x) - \log q(z; \lambda) ]

    based on importance sampling.

    Computed as

    .. math::
        1/B \sum_{b=1}^B [ w_{norm}(z^b; \lambda) *
                           (\log p(x, z^b) - \log q(z^b; \lambda) ]

    where

    .. math::
        z^b \sim q(z^b; \lambda)
        w_{norm}(z^b; \lambda) = w(z^b; \lambda) / \sum_{b=1}^B (w(z^b; \lambda))
        w(z^b; \lambda) = p(x, z^b) / q(z^b; \lambda)

    which gives a gradient

    .. math::
        - 1/B \sum_{b=1}^B
          w_{norm}(z^b; \lambda) \partial_{\lambda} \log q(z^b; \lambda)
    """
    # One log p and log q accumulator per posterior sample.
    p_log_prob = [0.0] * self.n_samples
    q_log_prob = [0.0] * self.n_samples
    for s in range(self.n_samples):
        z_sample = {}
        for z, qz in six.iteritems(self.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            qz_copy = copy(qz, scope='inference_' + str(s))
            z_sample[z] = qz_copy.value()
            # stop_gradient: samples are constants, so gradients flow
            # only through the parameters of log q.
            q_log_prob[s] += tf.reduce_sum(
                qz.log_prob(tf.stop_gradient(z_sample[z])))

        if self.model_wrapper is None:
            for z in six.iterkeys(self.latent_vars):
                # Copy p(z), replacing any conditioning on prior with
                # conditioning on posterior sample.
                z_copy = copy(z, dict_swap=z_sample,
                              scope='inference_' + str(s))
                p_log_prob[s] += tf.reduce_sum(z_copy.log_prob(z_sample[z]))

            for x, obs in six.iteritems(self.data):
                if isinstance(x, RandomVariable):
                    # Copy p(x | z), replacing any conditioning on prior with
                    # conditioning on posterior sample.
                    x_copy = copy(x, dict_swap=z_sample,
                                  scope='inference_' + str(s))
                    p_log_prob[s] += tf.reduce_sum(x_copy.log_prob(obs))
        else:
            # Legacy model-wrapper path: delegate the full log joint.
            x = self.data
            p_log_prob[s] = self.model_wrapper.log_prob(x, z_sample)

    p_log_prob = tf.pack(p_log_prob)
    q_log_prob = tf.pack(q_log_prob)
    # Self-normalized importance weights, computed in log space.
    log_w = p_log_prob - q_log_prob
    log_w_norm = log_w - log_sum_exp(log_w)
    w_norm = tf.exp(log_w_norm)
    # Importance-weighted objective, kept for reporting.
    self.loss = tf.reduce_mean(w_norm * log_w)
    # stop_gradient holds w_norm constant so the gradient of the return
    # value matches the formula documented above.
    return -tf.reduce_mean(q_log_prob * tf.stop_gradient(w_norm))
def test_2d():
    """log_sum_exp of a 4x1 column matches the hand-computed value."""
    # NOTE(review): .eval() requires a default TF session to be installed
    # by the surrounding test harness — confirm a session fixture exists.
    expected = -0.5598103014388045
    column = tf.constant([[-1.0], [-2.0], [-3.0], [-4.0]])
    assert np.allclose(log_sum_exp(column).eval(), expected)