def _log_joint(self, z_sample):
    """Evaluate the model's log joint density, log p(x, z).

    The latent variables are pinned to the values in `z_sample`, while the
    data come from `self.data`.

    Args:
      z_sample: dict. Latent variable keys to samples.

    Returns:
      The accumulated sum of the reduced log-probabilities of every key of
      `self.latent_vars` and every `RandomVariable` key of `self.data`.
    """
    # Each call gets a fresh scope so repeated copies do not collide.
    self.scope_iter += 1
    scope = '/'.join(['inference_' + str(id(self)), str(self.scope_iter)])

    # Values substituted for random variables when model nodes are copied.
    # Work on a copy so the caller's dictionary is left untouched.
    swaps = z_sample.copy()
    for x, qx in six.iteritems(self.data):
        if not isinstance(x, RandomVariable):
            continue
        if isinstance(qx, RandomVariable):
            swaps[x] = copy(qx, scope=scope).value()
        else:
            swaps[x] = qx

    total = 0.0
    for z in six.iterkeys(self.latent_vars):
        prior = copy(z, swaps, scope=scope)
        total += tf.reduce_sum(prior.log_prob(swaps[z]))

    for x in six.iterkeys(self.data):
        if isinstance(x, RandomVariable):
            likelihood = copy(x, swaps, scope=scope)
            total += tf.reduce_sum(likelihood.log_prob(swaps[x]))

    return total
def build_loss_and_gradients(self, var_list):
    r"""Build the loss and its gradients.

    Automatic differentiation of the loss is the gradient of
    $- \log p(x,z).$

    Args:
      var_list: variables the loss is differentiated with respect to.

    Returns:
      Tuple `(loss, grads_and_vars)`, where `grads_and_vars` pairs each
      output of `tf.gradients` with the matching entry of `var_list`.
    """
    scope = 'inference_' + str(id(self))

    # Values substituted for random variables when model nodes are copied:
    # latent variables take the value of their approximation, observed
    # variables take their data (or the value of the random variable they
    # are bound to).
    swaps = {}
    for z, qz in six.iteritems(self.latent_vars):
        swaps[z] = qz.value()
    for x, qx in six.iteritems(self.data):
        if not isinstance(x, RandomVariable):
            continue
        swaps[x] = qx.value() if isinstance(qx, RandomVariable) else qx

    log_joint = 0.0
    for z in six.iterkeys(self.latent_vars):
        prior = copy(z, swaps, scope=scope)
        weighted = self.scale.get(z, 1.0) * prior.log_prob(swaps[z])
        log_joint += tf.reduce_sum(weighted)

    for x in six.iterkeys(self.data):
        if isinstance(x, RandomVariable):
            likelihood = copy(x, swaps, scope=scope)
            weighted = self.scale.get(x, 1.0) * likelihood.log_prob(swaps[x])
            log_joint += tf.reduce_sum(weighted)

    loss = -log_joint
    gradients = tf.gradients(loss, var_list)
    return loss, list(zip(gradients, var_list))
def build_reparam_loss_and_gradients(inference, var_list):
    r"""Build the reparameterization-gradient loss and its gradients.

    Automatic differentiation of the loss is a stochastic gradient of

    $-\text{ELBO} =
      -\mathbb{E}_{q(z; \lambda)} [ \log p(x, z) - \log q(z; \lambda) ]$

    based on the reparameterization trick (Kingma and Welling, 2014).

    The expectation is estimated by Monte Carlo, drawing
    `inference.n_samples` samples from $q(z;\lambda)$.

    Args:
      inference: inference object; its `n_samples`, `data`, `latent_vars`,
        `scale`, `logging` and `_summary_key` attributes are read.
      var_list: variables the loss is differentiated with respect to.

    Returns:
      Tuple `(loss, grads_and_vars)`, where `grads_and_vars` pairs each
      output of `tf.gradients` with the matching entry of `var_list`.
    """
    p_terms = []
    q_terms = []
    base_scope = tf.get_default_graph().unique_name("inference") + '/'
    for _ in range(inference.n_samples):
        scope = base_scope + tf.get_default_graph().unique_name("sample")

        # Values substituted for random variables when nodes are copied.
        swaps = {}
        for x, qx in six.iteritems(inference.data):
            if not isinstance(x, RandomVariable):
                continue
            if isinstance(qx, RandomVariable):
                swaps[x] = copy(qx, scope=scope).value()
            else:
                swaps[x] = qx

        q_term = 0.0
        for z, qz in six.iteritems(inference.latent_vars):
            # A fresh copy of q(z) yields a new posterior sample.
            posterior = copy(qz, scope=scope)
            swaps[z] = posterior.value()
            q_term += tf.reduce_sum(
                inference.scale.get(z, 1.0) * posterior.log_prob(swaps[z]))

        p_term = 0.0
        for z in six.iterkeys(inference.latent_vars):
            prior = copy(z, swaps, scope=scope)
            p_term += tf.reduce_sum(
                inference.scale.get(z, 1.0) * prior.log_prob(swaps[z]))

        for x in six.iterkeys(inference.data):
            if isinstance(x, RandomVariable):
                likelihood = copy(x, swaps, scope=scope)
                p_term += tf.reduce_sum(
                    inference.scale.get(x, 1.0) * likelihood.log_prob(swaps[x]))

        p_terms.append(p_term)
        q_terms.append(q_term)

    p_log_prob = tf.reduce_mean(p_terms)
    q_log_prob = tf.reduce_mean(q_terms)

    if inference.logging:
        collections = [inference._summary_key]
        tf.summary.scalar("loss/p_log_prob", p_log_prob,
                          collections=collections)
        tf.summary.scalar("loss/q_log_prob", q_log_prob,
                          collections=[inference._summary_key])

    loss = -(p_log_prob - q_log_prob)
    gradients = tf.gradients(loss, var_list)
    return loss, list(zip(gradients, var_list))
def build_reparam_entropy_loss_and_gradients(inference, var_list):
    r"""Build loss function.

    Its automatic differentiation is a stochastic gradient of

    $-\text{ELBO} = -( \mathbb{E}_{q(z; \lambda)} [ \log p(x , z) ]
        + \mathbb{H}(q(z; \lambda)) )$

    based on the reparameterization trick (Kingma and Welling, 2014).

    It assumes the entropy is analytic.

    Computed by sampling from $q(z;\lambda)$ and evaluating the
    expectation using Monte Carlo sampling.

    Args:
      inference: inference object; its `n_samples`, `data`, `latent_vars`,
        `scale` and `logging` attributes are read.
      var_list: variables the loss is differentiated with respect to.

    Returns:
      Tuple `(loss, grads_and_vars)`, where `grads_and_vars` pairs each
      output of `tf.gradients` with the matching entry of `var_list`.
    """
    p_log_prob = [0.0] * inference.n_samples
    for s in range(inference.n_samples):
        # Form dictionary in order to replace conditioning on prior or
        # observed variable with conditioning on a specific value.
        scope = 'inference_' + str(id(inference)) + '/' + str(s)
        dict_swap = {}
        for x, qx in six.iteritems(inference.data):
            if isinstance(x, RandomVariable):
                if isinstance(qx, RandomVariable):
                    qx_copy = copy(qx, scope=scope)
                    dict_swap[x] = qx_copy.value()
                else:
                    dict_swap[x] = qx

        for z, qz in six.iteritems(inference.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            qz_copy = copy(qz, scope=scope)
            dict_swap[z] = qz_copy.value()

        for z in six.iterkeys(inference.latent_vars):
            z_copy = copy(z, dict_swap, scope=scope)
            p_log_prob[s] += tf.reduce_sum(
                inference.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))

        for x in six.iterkeys(inference.data):
            if isinstance(x, RandomVariable):
                x_copy = copy(x, dict_swap, scope=scope)
                p_log_prob[s] += tf.reduce_sum(
                    inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))

    p_log_prob = tf.reduce_mean(p_log_prob)

    # Reduce each entropy to a scalar before combining them: summing the raw
    # list would first pack the entropies into one tensor, which fails when
    # latent variables have differently shaped entropies.
    q_entropy = tf.reduce_sum([
        tf.reduce_sum(qz.entropy())
        for qz in six.itervalues(inference.latent_vars)])

    if inference.logging:
        summary_key = 'summaries_' + str(id(inference))
        tf.summary.scalar("loss/p_log_prob", p_log_prob,
                          collections=[summary_key])
        tf.summary.scalar("loss/q_entropy", q_entropy,
                          collections=[summary_key])

    loss = -(p_log_prob + q_entropy)
    grads = tf.gradients(loss, var_list)
    grads_and_vars = list(zip(grads, var_list))
    return loss, grads_and_vars
def build_score_loss_and_gradients(inference, var_list):
    r"""Build the loss and score-function gradients.

    Gradients follow the score function estimator (Paisley et al., 2012).

    The expectation is estimated by Monte Carlo, drawing
    `inference.n_samples` samples from $q(z;\lambda)$.

    Args:
      inference: inference object; its `n_samples`, `data`, `latent_vars`,
        `scale` and `logging` attributes are read.
      var_list: variables the surrogate objective is differentiated with
        respect to.

    Returns:
      Tuple `(loss, grads_and_vars)`. `loss` is the negated mean of
      `p_log_prob - q_log_prob`; the gradients are taken of a separate
      surrogate in which that difference is treated as a constant.
    """
    p_terms = []
    q_terms = []
    for s in range(inference.n_samples):
        scope = 'inference_' + str(id(inference)) + '/' + str(s)

        # Values substituted for random variables when nodes are copied.
        swaps = {}
        for x, qx in six.iteritems(inference.data):
            if not isinstance(x, RandomVariable):
                continue
            if isinstance(qx, RandomVariable):
                swaps[x] = copy(qx, scope=scope).value()
            else:
                swaps[x] = qx

        q_term = 0.0
        for z, qz in six.iteritems(inference.latent_vars):
            # A fresh copy of q(z) yields a new posterior sample.
            posterior = copy(qz, scope=scope)
            swaps[z] = posterior.value()
            # The sample is held fixed so only the density's parameters
            # receive gradient.
            q_term += tf.reduce_sum(
                inference.scale.get(z, 1.0) *
                posterior.log_prob(tf.stop_gradient(swaps[z])))

        p_term = 0.0
        for z in six.iterkeys(inference.latent_vars):
            prior = copy(z, swaps, scope=scope)
            p_term += tf.reduce_sum(
                inference.scale.get(z, 1.0) * prior.log_prob(swaps[z]))

        for x in six.iterkeys(inference.data):
            if isinstance(x, RandomVariable):
                likelihood = copy(x, swaps, scope=scope)
                p_term += tf.reduce_sum(
                    inference.scale.get(x, 1.0) * likelihood.log_prob(swaps[x]))

        p_terms.append(p_term)
        q_terms.append(q_term)

    p_log_prob = tf.stack(p_terms)
    q_log_prob = tf.stack(q_terms)

    if inference.logging:
        summary_key = 'summaries_' + str(id(inference))
        tf.summary.scalar("loss/p_log_prob", tf.reduce_mean(p_log_prob),
                          collections=[summary_key])
        tf.summary.scalar("loss/q_log_prob", tf.reduce_mean(q_log_prob),
                          collections=[summary_key])

    losses = p_log_prob - q_log_prob
    loss = -tf.reduce_mean(losses)

    surrogate = -tf.reduce_mean(q_log_prob * tf.stop_gradient(losses))
    gradients = tf.gradients(surrogate, var_list)
    return loss, list(zip(gradients, var_list))
def build_score_kl_loss_and_gradients(inference, var_list):
    r"""Build the loss and score-function gradients with an analytic KL.

    Gradients follow the score function estimator (Paisley et al., 2012).

    It assumes the KL is analytic.

    The expectation is estimated by Monte Carlo, drawing
    `inference.n_samples` samples from $q(z;\lambda)$.

    Args:
      inference: inference object; its `n_samples`, `data`, `latent_vars`,
        `scale`, `kl_scaling`, `logging` and `_summary_key` attributes are
        read.
      var_list: variables the surrogate objective is differentiated with
        respect to.

    Returns:
      Tuple `(loss, grads_and_vars)`. `loss` is the negated difference of
      the mean log-likelihood and the KL penalty; the gradients are taken
      of a separate surrogate in which the log-likelihood is treated as a
      constant.
    """
    lik_terms = []
    q_terms = []
    base_scope = tf.get_default_graph().unique_name("inference") + '/'
    for _ in range(inference.n_samples):
        scope = base_scope + tf.get_default_graph().unique_name("sample")

        # Values substituted for random variables when nodes are copied.
        swaps = {}
        for x, qx in six.iteritems(inference.data):
            if not isinstance(x, RandomVariable):
                continue
            if isinstance(qx, RandomVariable):
                swaps[x] = copy(qx, scope=scope).value()
            else:
                swaps[x] = qx

        q_term = 0.0
        for z, qz in six.iteritems(inference.latent_vars):
            # A fresh copy of q(z) yields a new posterior sample.
            posterior = copy(qz, scope=scope)
            swaps[z] = posterior.value()
            # The sample is held fixed so only the density's parameters
            # receive gradient.
            q_term += tf.reduce_sum(
                inference.scale.get(z, 1.0) *
                posterior.log_prob(tf.stop_gradient(swaps[z])))

        lik_term = 0.0
        for x in six.iterkeys(inference.data):
            if isinstance(x, RandomVariable):
                likelihood = copy(x, swaps, scope=scope)
                lik_term += tf.reduce_sum(
                    inference.scale.get(x, 1.0) * likelihood.log_prob(swaps[x]))

        lik_terms.append(lik_term)
        q_terms.append(q_term)

    p_log_lik = tf.stack(lik_terms)
    q_log_prob = tf.stack(q_terms)

    kl_terms = [
        inference.kl_scaling.get(z, 1.0) *
        tf.reduce_sum(kl_divergence(qz, z))
        for z, qz in six.iteritems(inference.latent_vars)]
    kl_penalty = tf.reduce_sum(kl_terms)

    if inference.logging:
        tf.summary.scalar("loss/p_log_lik", tf.reduce_mean(p_log_lik),
                          collections=[inference._summary_key])
        tf.summary.scalar("loss/kl_penalty", kl_penalty,
                          collections=[inference._summary_key])

    loss = -(tf.reduce_mean(p_log_lik) - kl_penalty)

    surrogate = -(
        tf.reduce_mean(q_log_prob * tf.stop_gradient(p_log_lik)) -
        kl_penalty)
    gradients = tf.gradients(surrogate, var_list)
    return loss, list(zip(gradients, var_list))
def build_score_loss_entropy(self):
    r"""Build loss function. Its automatic differentiation
    is a stochastic gradient of

    .. math::

      -ELBO = - ( E_{q(z; \lambda)} [ \log p(x, z) ] + H(q(z; \lambda)) )

    based on the score function estimator. (Paisley et al., 2012)

    It assumes the entropy is analytic.

    Computed by sampling from :math:`q(z;\lambda)` and evaluating the
    expectation using Monte Carlo sampling.

    Side effect: `self.loss` is set to the mean sampled log joint plus the
    entropy.

    Returns:
      The negated surrogate objective, in which the sampled log joint is
      treated as a constant.
    """
    p_log_prob = [0.0] * self.n_samples
    q_log_prob = [0.0] * self.n_samples
    for s in range(self.n_samples):
        z_sample = {}
        for z, qz in six.iteritems(self.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            qz_copy = copy(qz, scope='inference_' + str(s))
            z_sample[z] = qz_copy.value()
            q_log_prob[s] += tf.reduce_sum(
                qz.log_prob(tf.stop_gradient(z_sample[z])))

        # Form dictionary in order to replace conditioning on prior or
        # observed variable with conditioning on posterior sample or
        # observed data. Note that this is the same dict object as
        # `z_sample`, so the observed entries are visible through both.
        dict_swap = z_sample
        for x, obs in six.iteritems(self.data):
            if isinstance(x, RandomVariable):
                dict_swap[x] = obs

        if self.model_wrapper is None:
            for z in six.iterkeys(self.latent_vars):
                z_copy = copy(z, dict_swap, scope='inference_' + str(s))
                p_log_prob[s] += tf.reduce_sum(z_copy.log_prob(z_sample[z]))

            for x, obs in six.iteritems(self.data):
                if isinstance(x, RandomVariable):
                    x_copy = copy(x, dict_swap, scope='inference_' + str(s))
                    p_log_prob[s] += tf.reduce_sum(x_copy.log_prob(obs))
        else:
            x = self.data
            p_log_prob[s] = self.model_wrapper.log_prob(x, z_sample)

    p_log_prob = tf.pack(p_log_prob)
    q_log_prob = tf.pack(q_log_prob)

    # Reduce each entropy to a scalar before combining them: summing the raw
    # list would first pack the entropies into one tensor, which fails when
    # latent variables have differently shaped entropies.
    q_entropy = tf.reduce_sum([
        tf.reduce_sum(qz.entropy())
        for qz in six.itervalues(self.latent_vars)])

    self.loss = tf.reduce_mean(p_log_prob) + q_entropy
    return -(tf.reduce_mean(q_log_prob * tf.stop_gradient(p_log_prob)) +
             q_entropy)
def build_reparam_entropy_loss_and_gradients(inference, var_list):
    r"""Build loss function. Its automatic differentiation
    is a stochastic gradient of

    .. math::

      -\text{ELBO} = -( \mathbb{E}_{q(z; \lambda)} [ \log p(x , z) ]
            + \mathbb{H}(q(z; \lambda)) )

    based on the reparameterization trick (Kingma and Welling, 2014).

    It assumes the entropy is analytic.

    Computed by sampling from :math:`q(z;\lambda)` and evaluating the
    expectation using Monte Carlo sampling.

    Args:
      inference: inference object; its `n_samples`, `data`, `latent_vars`
        and `scale` attributes are read.
      var_list: variables the loss is differentiated with respect to.

    Returns:
      Tuple `(loss, grads_and_vars)`, where `grads_and_vars` pairs each
      output of `tf.gradients` with the matching entry of `var_list`.
    """
    p_log_prob = [0.0] * inference.n_samples
    for s in range(inference.n_samples):
        # Form dictionary in order to replace conditioning on prior or
        # observed variable with conditioning on a specific value.
        scope = 'inference_' + str(id(inference)) + '/' + str(s)
        dict_swap = {}
        for x, qx in six.iteritems(inference.data):
            if isinstance(x, RandomVariable):
                if isinstance(qx, RandomVariable):
                    qx_copy = copy(qx, scope=scope)
                    dict_swap[x] = qx_copy.value()
                else:
                    dict_swap[x] = qx

        for z, qz in six.iteritems(inference.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            qz_copy = copy(qz, scope=scope)
            dict_swap[z] = qz_copy.value()

        for z in six.iterkeys(inference.latent_vars):
            z_copy = copy(z, dict_swap, scope=scope)
            p_log_prob[s] += tf.reduce_sum(
                inference.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))

        for x in six.iterkeys(inference.data):
            if isinstance(x, RandomVariable):
                x_copy = copy(x, dict_swap, scope=scope)
                p_log_prob[s] += tf.reduce_sum(
                    inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))

    p_log_prob = tf.stack(p_log_prob)

    # Reduce each entropy to a scalar before combining them: summing the raw
    # list would first pack the entropies into one tensor, which fails when
    # latent variables have differently shaped entropies.
    q_entropy = tf.reduce_sum([
        tf.reduce_sum(qz.entropy())
        for qz in six.itervalues(inference.latent_vars)])

    loss = -(tf.reduce_mean(p_log_prob) + q_entropy)
    grads = tf.gradients(loss, var_list)
    grads_and_vars = list(zip(grads, var_list))
    return loss, grads_and_vars
def build_score_entropy_loss_and_gradients(inference, var_list):
    r"""Build loss function and gradients based on the score function
    estimator (Paisley et al., 2012).

    It assumes the entropy is analytic.

    Computed by sampling from :math:`q(z;\lambda)` and evaluating the
    expectation using Monte Carlo sampling.

    Args:
      inference: inference object; its `n_samples`, `data`, `latent_vars`
        and `scale` attributes are read.
      var_list: variables whose `_ref()` the surrogate objective is
        differentiated with respect to.

    Returns:
      Tuple `(loss, grads_and_vars)`. `loss` is the negated sum of the mean
      sampled log joint and the entropy; the gradients are taken of a
      separate surrogate in which the log joint is treated as a constant.
    """
    p_log_prob = [0.0] * inference.n_samples
    q_log_prob = [0.0] * inference.n_samples
    for s in range(inference.n_samples):
        # Form dictionary in order to replace conditioning on prior or
        # observed variable with conditioning on a specific value.
        scope = 'inference_' + str(id(inference)) + '/' + str(s)
        dict_swap = {}
        for x, qx in six.iteritems(inference.data):
            if isinstance(x, RandomVariable):
                if isinstance(qx, RandomVariable):
                    qx_copy = copy(qx, scope=scope)
                    dict_swap[x] = qx_copy.value()
                else:
                    dict_swap[x] = qx

        for z, qz in six.iteritems(inference.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            qz_copy = copy(qz, scope=scope)
            dict_swap[z] = qz_copy.value()
            q_log_prob[s] += tf.reduce_sum(
                inference.scale.get(z, 1.0) *
                qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))

        for z in six.iterkeys(inference.latent_vars):
            z_copy = copy(z, dict_swap, scope=scope)
            p_log_prob[s] += tf.reduce_sum(
                inference.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))

        for x in six.iterkeys(inference.data):
            if isinstance(x, RandomVariable):
                x_copy = copy(x, dict_swap, scope=scope)
                p_log_prob[s] += tf.reduce_sum(
                    inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))

    p_log_prob = tf.stack(p_log_prob)
    q_log_prob = tf.stack(q_log_prob)

    # Reduce each entropy to a scalar before combining them: summing the raw
    # list would first pack the entropies into one tensor, which fails when
    # latent variables have differently shaped entropies.
    q_entropy = tf.reduce_sum([
        tf.reduce_sum(qz.entropy())
        for qz in six.itervalues(inference.latent_vars)])

    loss = -(tf.reduce_mean(p_log_prob) + q_entropy)
    grads = tf.gradients(
        -(tf.reduce_mean(q_log_prob * tf.stop_gradient(p_log_prob)) +
          q_entropy),
        [v._ref() for v in var_list])
    grads_and_vars = list(zip(grads, var_list))
    return loss, grads_and_vars
def build_reparam_entropy_loss(inference):
    r"""Build loss function. Its automatic differentiation
    is a stochastic gradient of

    .. math::

      -ELBO = -( E_{q(z; \lambda)} [ \log p(x , z) ] + H(q(z; \lambda)) )

    based on the reparameterization trick (Kingma and Welling, 2014).

    It assumes the entropy is analytic.

    Computed by sampling from :math:`q(z;\lambda)` and evaluating the
    expectation using Monte Carlo sampling.

    Args:
      inference: inference object; its `n_samples`, `data`, `latent_vars`
        and `model_wrapper` attributes are read.

    Returns:
      The loss: the negated sum of the mean sampled log joint and the
      entropy.
    """
    p_log_prob = [0.0] * inference.n_samples
    for s in range(inference.n_samples):
        z_sample = {}
        for z, qz in six.iteritems(inference.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            qz_copy = copy(qz, scope='inference_' + str(s))
            z_sample[z] = qz_copy.value()

        if inference.model_wrapper is None:
            # Form dictionary in order to replace conditioning on prior or
            # observed variable with conditioning on a specific value.
            # This is the same dict object as `z_sample`.
            dict_swap = z_sample
            for x, qx in six.iteritems(inference.data):
                if isinstance(x, RandomVariable):
                    if isinstance(qx, RandomVariable):
                        qx_copy = copy(qx, scope='inference_' + str(s))
                        dict_swap[x] = qx_copy.value()
                    else:
                        dict_swap[x] = qx

            for z in six.iterkeys(inference.latent_vars):
                z_copy = copy(z, dict_swap, scope='inference_' + str(s))
                p_log_prob[s] += tf.reduce_sum(z_copy.log_prob(dict_swap[z]))

            for x in six.iterkeys(inference.data):
                if isinstance(x, RandomVariable):
                    x_copy = copy(x, dict_swap, scope='inference_' + str(s))
                    p_log_prob[s] += tf.reduce_sum(
                        x_copy.log_prob(dict_swap[x]))
        else:
            x = inference.data
            p_log_prob[s] = inference.model_wrapper.log_prob(x, z_sample)

    p_log_prob = tf.pack(p_log_prob)

    # Reduce each entropy to a scalar before combining them: summing the raw
    # list would first pack the entropies into one tensor, which fails when
    # latent variables have differently shaped entropies.
    q_entropy = tf.reduce_sum([
        tf.reduce_sum(qz.entropy())
        for qz in six.itervalues(inference.latent_vars)])

    loss = -(tf.reduce_mean(p_log_prob) + q_entropy)
    return loss
def build_reparam_loss_kl(self):
    r"""Build loss function. Its automatic differentiation
    is a stochastic gradient of

    .. math::

      -ELBO = - ( E_{q(z; \lambda)} [ \log p(x | z) ]
            - KL(q(z; \lambda) || p(z)) )

    based on the reparameterization trick. (Kingma and Welling, 2014)

    It assumes the KL is analytic.

    For model wrappers, it assumes the prior is
    :math:`p(z) = \mathcal{N}(z; 0, 1)`.

    Computed by sampling from :math:`q(z;\lambda)` and evaluating the
    expectation using Monte Carlo sampling.

    Side effect: `self.loss` is set to the mean sampled log-likelihood
    minus the KL term.

    Returns:
      The negation of `self.loss`.
    """
    p_log_lik = [0.0] * self.n_samples
    for s in range(self.n_samples):
        z_sample = {}
        for z, qz in six.iteritems(self.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            qz_copy = copy(qz, scope='inference_' + str(s))
            z_sample[z] = qz_copy.value()

        if self.model_wrapper is None:
            for x, obs in six.iteritems(self.data):
                if isinstance(x, RandomVariable):
                    # Copy p(x | z), replacing any conditioning on prior
                    # with conditioning on posterior sample.
                    x_copy = copy(x, dict_swap=z_sample,
                                  scope='inference_' + str(s))
                    p_log_lik[s] += tf.reduce_sum(x_copy.log_prob(obs))
        else:
            x = self.data
            p_log_lik[s] = self.model_wrapper.log_lik(x, z_sample)

    # Pack the per-sample values once; the original repeated this call on
    # the already-packed tensor before computing the loss.
    p_log_lik = tf.pack(p_log_lik)

    if self.model_wrapper is None:
        kl = tf.reduce_sum([
            kl_multivariate_normal(qz.mu, qz.sigma, z.mu, z.sigma)
            for z, qz in six.iteritems(self.latent_vars)])
    else:
        kl = tf.reduce_sum([
            kl_multivariate_normal(qz.mu, qz.sigma)
            for qz in six.itervalues(self.latent_vars)])

    self.loss = tf.reduce_mean(p_log_lik) - kl
    return -self.loss
def build_score_entropy_loss_and_gradients(inference, scope=None):
    r"""Build loss function and gradients based on the score function
    estimator (Paisley et al., 2012).

    It assumes the entropy is analytic.

    Computed by sampling from :math:`q(z;\lambda)` and evaluating the
    expectation using Monte Carlo sampling.

    Args:
      inference: inference object; its `n_samples`, `data`, `latent_vars`
        and `model_wrapper` attributes are read.
      scope: passed to `tf.get_collection` to select the trainable
        variables that are differentiated.

    Returns:
      Tuple `(loss, grads_and_vars)`. `loss` is the negated sum of the mean
      sampled log joint and the entropy; the gradients are taken of a
      separate surrogate in which the log joint is treated as a constant.
    """
    p_log_prob = [0.0] * inference.n_samples
    q_log_prob = [0.0] * inference.n_samples
    for s in range(inference.n_samples):
        z_sample = {}
        for z, qz in six.iteritems(inference.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            qz_copy = copy(qz, scope='inference_' + str(s))
            z_sample[z] = qz_copy.value()
            q_log_prob[s] += tf.reduce_sum(
                qz.log_prob(tf.stop_gradient(z_sample[z])))

        if inference.model_wrapper is None:
            # Form dictionary in order to replace conditioning on prior or
            # observed variable with conditioning on posterior sample or
            # observed data. This is the same dict object as `z_sample`.
            dict_swap = z_sample
            for x, obs in six.iteritems(inference.data):
                if isinstance(x, RandomVariable):
                    dict_swap[x] = obs

            for z in six.iterkeys(inference.latent_vars):
                z_copy = copy(z, dict_swap, scope='inference_' + str(s))
                p_log_prob[s] += tf.reduce_sum(z_copy.log_prob(z_sample[z]))

            for x, obs in six.iteritems(inference.data):
                if isinstance(x, RandomVariable):
                    x_copy = copy(x, dict_swap, scope='inference_' + str(s))
                    p_log_prob[s] += tf.reduce_sum(x_copy.log_prob(obs))
        else:
            x = inference.data
            p_log_prob[s] = inference.model_wrapper.log_prob(x, z_sample)

    p_log_prob = tf.pack(p_log_prob)
    q_log_prob = tf.pack(q_log_prob)

    # Reduce each entropy to a scalar before combining them: summing the raw
    # list would first pack the entropies into one tensor, which fails when
    # latent variables have differently shaped entropies.
    q_entropy = tf.reduce_sum([
        tf.reduce_sum(qz.entropy())
        for qz in six.itervalues(inference.latent_vars)])

    loss = -(tf.reduce_mean(p_log_prob) + q_entropy)

    var_list = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
    grads = tf.gradients(
        -(tf.reduce_mean(q_log_prob * tf.stop_gradient(p_log_prob)) +
          q_entropy),
        [v.ref() for v in var_list])
    grads_and_vars = list(zip(grads, var_list))
    return loss, grads_and_vars
def build_reparam_kl_loss_and_gradients(inference, var_list):
    r"""Build loss function. Its automatic differentiation
    is a stochastic gradient of

    .. math::

      -\text{ELBO} = - ( \mathbb{E}_{q(z; \lambda)} [ \log p(x \mid z) ]
            - \text{KL}(q(z; \lambda) \| p(z)) )

    based on the reparameterization trick [@kingma2014auto].

    It assumes the KL is analytic.

    Computed by sampling from $q(z;\lambda)$ and evaluating the
    expectation using Monte Carlo sampling.

    Args:
      inference: inference object; its `n_samples`, `data`, `latent_vars`,
        `scale`, `kl_scaling`, `logging` and `_summary_key` attributes are
        read.
      var_list: variables the loss is differentiated with respect to.

    Returns:
      Tuple `(loss, grads_and_vars)`, where `grads_and_vars` pairs each
      output of `tf.gradients` with the matching entry of `var_list`.
    """
    p_log_lik = [0.0] * inference.n_samples
    base_scope = tf.get_default_graph().unique_name("inference") + '/'
    for s in range(inference.n_samples):
        # Form dictionary in order to replace conditioning on prior or
        # observed variable with conditioning on a specific value.
        scope = base_scope + tf.get_default_graph().unique_name("sample")
        dict_swap = {}
        for x, qx in six.iteritems(inference.data):
            if isinstance(x, RandomVariable):
                if isinstance(qx, RandomVariable):
                    qx_copy = copy(qx, scope=scope)
                    dict_swap[x] = qx_copy.value()
                else:
                    dict_swap[x] = qx

        for z, qz in six.iteritems(inference.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            qz_copy = copy(qz, scope=scope)
            dict_swap[z] = qz_copy.value()

        for x in six.iterkeys(inference.data):
            if isinstance(x, RandomVariable):
                x_copy = copy(x, dict_swap, scope=scope)
                p_log_lik[s] += tf.reduce_sum(
                    inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))

    p_log_lik = tf.reduce_mean(p_log_lik)

    kl_penalty = tf.reduce_sum([
        tf.reduce_sum(inference.kl_scaling.get(z, 1.0) * kl_divergence(qz, z))
        for z, qz in six.iteritems(inference.latent_vars)])

    if inference.logging:
        tf.summary.scalar("loss/p_log_lik", p_log_lik,
                          collections=[inference._summary_key])
        tf.summary.scalar("loss/kl_penalty", kl_penalty,
                          collections=[inference._summary_key])

    # The KL term is subtracted from the expected log-likelihood before the
    # whole quantity is negated.
    loss = -(p_log_lik - kl_penalty)
    grads = tf.gradients(loss, var_list)
    grads_and_vars = list(zip(grads, var_list))
    return loss, grads_and_vars
def build_score_loss_and_gradients(inference, var_list):
    r"""Build loss function and gradients based on the score function
    estimator (Paisley et al., 2012).

    Computed by sampling from :math:`q(z;\lambda)` and evaluating the
    expectation using Monte Carlo sampling.

    Args:
      inference: inference object; its `n_samples`, `data`, `latent_vars`
        and `model_wrapper` attributes are read.
      var_list: variables whose `ref()` the surrogate objective is
        differentiated with respect to.

    Returns:
      Tuple `(loss, grads_and_vars)`. `loss` is the negated mean of
      `p_log_prob - q_log_prob`; the gradients are taken of a separate
      surrogate in which that difference is treated as a constant.
    """
    p_log_prob = [0.0] * inference.n_samples
    q_log_prob = [0.0] * inference.n_samples
    for s in range(inference.n_samples):
        z_sample = {}
        for z, qz in six.iteritems(inference.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            qz_copy = copy(qz, scope='inference_' + str(s))
            z_sample[z] = qz_copy.value()
            q_log_prob[s] += tf.reduce_sum(
                qz.log_prob(tf.stop_gradient(z_sample[z])))

        if inference.model_wrapper is None:
            # Form dictionary in order to replace conditioning on prior or
            # observed variable with conditioning on a specific value.
            # This is the same dict object as `z_sample`.
            dict_swap = z_sample
            for x, qx in six.iteritems(inference.data):
                if isinstance(x, RandomVariable):
                    if isinstance(qx, RandomVariable):
                        qx_copy = copy(qx, scope='inference_' + str(s))
                        dict_swap[x] = qx_copy.value()
                    else:
                        dict_swap[x] = qx

            for z in six.iterkeys(inference.latent_vars):
                z_copy = copy(z, dict_swap, scope='inference_' + str(s))
                p_log_prob[s] += tf.reduce_sum(z_copy.log_prob(dict_swap[z]))

            # Iterate over the keys: iterating over items yields
            # (key, value) tuples, which never pass the RandomVariable
            # check, so the data term would be silently left out.
            for x in six.iterkeys(inference.data):
                if isinstance(x, RandomVariable):
                    x_copy = copy(x, dict_swap, scope='inference_' + str(s))
                    p_log_prob[s] += tf.reduce_sum(
                        x_copy.log_prob(dict_swap[x]))
        else:
            x = inference.data
            p_log_prob[s] = inference.model_wrapper.log_prob(x, z_sample)

    p_log_prob = tf.pack(p_log_prob)
    q_log_prob = tf.pack(q_log_prob)

    losses = p_log_prob - q_log_prob
    loss = -tf.reduce_mean(losses)
    grads = tf.gradients(
        -tf.reduce_mean(q_log_prob * tf.stop_gradient(losses)),
        [v.ref() for v in var_list])
    grads_and_vars = list(zip(grads, var_list))
    return loss, grads_and_vars
def build_reparam_loss_entropy(self):
    r"""Build loss function. Its automatic differentiation
    is a stochastic gradient of

    .. math::

      -ELBO = -( E_{q(z; \lambda)} [ \log p(x , z) ] + H(q(z; \lambda)) )

    based on the reparameterization trick. (Kingma and Welling, 2014)

    It assumes the entropy is analytic.

    Computed by sampling from :math:`q(z;\lambda)` and evaluating the
    expectation using Monte Carlo sampling.

    Side effect: `self.loss` is set to the mean sampled log joint plus the
    entropy.

    Returns:
      The negation of `self.loss`.
    """
    p_log_prob = [0.0] * self.n_samples
    for s in range(self.n_samples):
        z_sample = {}
        for z, qz in six.iteritems(self.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            qz_copy = copy(qz, scope='inference_' + str(s))
            z_sample[z] = qz_copy.value()

        if self.model_wrapper is None:
            for z in six.iterkeys(self.latent_vars):
                # Copy p(z), replacing any conditioning on prior with
                # conditioning on posterior sample.
                z_copy = copy(z, dict_swap=z_sample,
                              scope='inference_' + str(s))
                p_log_prob[s] += tf.reduce_sum(z_copy.log_prob(z_sample[z]))

            for x, obs in six.iteritems(self.data):
                if isinstance(x, RandomVariable):
                    # Copy p(x | z), replacing any conditioning on prior
                    # with conditioning on posterior sample.
                    x_copy = copy(x, dict_swap=z_sample,
                                  scope='inference_' + str(s))
                    p_log_prob[s] += tf.reduce_sum(x_copy.log_prob(obs))
        else:
            x = self.data
            p_log_prob[s] = self.model_wrapper.log_prob(x, z_sample)

    p_log_prob = tf.pack(p_log_prob)

    # Reduce each entropy to a scalar before combining them: summing the raw
    # list would first pack the entropies into one tensor, which fails when
    # latent variables have differently shaped entropies.
    q_entropy = tf.reduce_sum([
        tf.reduce_sum(qz.entropy())
        for qz in six.itervalues(self.latent_vars)])

    self.loss = tf.reduce_mean(p_log_prob) + q_entropy
    return -self.loss
def build_score_loss(self):
    r"""Build the score-function loss.

    Automatic differentiation of the returned tensor is a stochastic
    gradient of

    .. math::

      -ELBO = -E_{q(z; \lambda)} [ \log p(x, z) - \log q(z; \lambda) ]

    based on the score function estimator. (Paisley et al., 2012)

    The expectation is estimated by Monte Carlo, drawing `self.n_samples`
    samples from :math:`q(z;\lambda)`.

    Side effect: `self.loss` is set to the mean of
    `p_log_prob - q_log_prob`.

    Returns:
      The negated surrogate objective, in which the difference
      `p_log_prob - q_log_prob` is treated as a constant.
    """
    p_terms = []
    q_terms = []
    for s in range(self.n_samples):
        scope = 'inference_' + str(s)

        samples = {}
        q_term = 0.0
        for z, qz in six.iteritems(self.latent_vars):
            # A fresh copy of q(z) yields a new posterior sample.
            samples[z] = copy(qz, scope=scope).value()
            q_term += tf.reduce_sum(
                qz.log_prob(tf.stop_gradient(samples[z])))

        if self.model_wrapper is not None:
            x = self.data
            p_term = self.model_wrapper.log_prob(x, samples)
        else:
            p_term = 0.0
            for z in six.iterkeys(self.latent_vars):
                # p(z) with its parents replaced by posterior samples.
                prior = copy(z, dict_swap=samples, scope=scope)
                p_term += tf.reduce_sum(prior.log_prob(samples[z]))

            for x, obs in six.iteritems(self.data):
                if isinstance(x, RandomVariable):
                    # p(x | z) with its parents replaced by posterior
                    # samples.
                    likelihood = copy(x, dict_swap=samples, scope=scope)
                    p_term += tf.reduce_sum(likelihood.log_prob(obs))

        p_terms.append(p_term)
        q_terms.append(q_term)

    p_log_prob = tf.pack(p_terms)
    q_log_prob = tf.pack(q_terms)

    losses = p_log_prob - q_log_prob
    self.loss = tf.reduce_mean(losses)
    return -tf.reduce_mean(q_log_prob * tf.stop_gradient(losses))
def build_loss_and_gradients(self, var_list):
    r"""Build the loss and its gradients.

    Automatic differentiation of the loss is the gradient of

    .. math::
      - \log p(x,z)

    Args:
      var_list: variables to differentiate with respect to. When None,
        `tf.trainable_variables()` is used instead.

    Returns:
      Tuple `(loss, grads_and_vars)`, where `grads_and_vars` pairs each
      output of `tf.gradients` with the matching variable.
    """
    z_mode = {}
    for z, qz in six.iteritems(self.latent_vars):
        z_mode[z] = qz.value()

    if self.model_wrapper is not None:
        x = self.data
        p_log_prob = self.model_wrapper.log_prob(x, z_mode)
    else:
        # Values substituted for random variables when model nodes are
        # copied. The observed entries are added to the `z_mode` dict
        # itself rather than to a separate copy.
        swaps = z_mode
        for x, qx in six.iteritems(self.data):
            if not isinstance(x, RandomVariable):
                continue
            swaps[x] = qx.value() if isinstance(qx, RandomVariable) else qx

        scope = 'inference_' + str(id(self))
        p_log_prob = 0.0
        for z in six.iterkeys(self.latent_vars):
            prior = copy(z, swaps, scope=scope)
            term = tf.reduce_sum(prior.log_prob(swaps[z]))
            if z in self.scale:
                term *= self.scale[z]
            p_log_prob += term

        for x in six.iterkeys(self.data):
            if not isinstance(x, RandomVariable):
                continue
            likelihood = copy(x, swaps, scope=scope)
            term = tf.reduce_sum(likelihood.log_prob(swaps[x]))
            if x in self.scale:
                term *= self.scale[x]
            p_log_prob += term

    loss = -p_log_prob

    if var_list is None:
        var_list = tf.trainable_variables()

    refs = [v.ref() for v in var_list]
    gradients = tf.gradients(loss, refs)
    return loss, list(zip(gradients, var_list))
def build_loss_and_gradients(self, var_list=None):
    r"""Build loss function. Its automatic differentiation
    is the gradient of

    .. math::
      - \log p(x,z)

    Args:
      var_list: variables to differentiate with respect to. Defaults to
        None, in which case `tf.trainable_variables()` is used. The body
        previously read `var_list` without it being a parameter, so every
        call failed; the default keeps existing no-argument callers valid.

    Returns:
      Tuple `(loss, grads_and_vars)`, where `grads_and_vars` pairs each
      output of `tf.gradients` with the matching variable.
    """
    z_mode = {z: qz.value()
              for z, qz in six.iteritems(self.latent_vars)}
    if self.model_wrapper is None:
        # Form dictionary in order to replace conditioning on prior or
        # observed variable with conditioning on a specific value.
        # This is the same dict object as `z_mode`.
        dict_swap = z_mode
        for x, qx in six.iteritems(self.data):
            if isinstance(x, RandomVariable):
                if isinstance(qx, RandomVariable):
                    dict_swap[x] = qx.value()
                else:
                    dict_swap[x] = qx

        scope = 'inference_' + str(id(self))
        p_log_prob = 0.0
        for z in six.iterkeys(self.latent_vars):
            z_copy = copy(z, dict_swap, scope=scope)
            z_log_prob = tf.reduce_sum(z_copy.log_prob(dict_swap[z]))
            if z in self.scale:
                z_log_prob *= self.scale[z]

            p_log_prob += z_log_prob

        for x in six.iterkeys(self.data):
            if isinstance(x, RandomVariable):
                x_copy = copy(x, dict_swap, scope=scope)
                x_log_prob = tf.reduce_sum(x_copy.log_prob(dict_swap[x]))
                if x in self.scale:
                    x_log_prob *= self.scale[x]

                p_log_prob += x_log_prob
    else:
        x = self.data
        p_log_prob = self.model_wrapper.log_prob(x, z_mode)

    loss = -p_log_prob

    if var_list is None:
        var_list = tf.trainable_variables()

    grads = tf.gradients(loss, [v.ref() for v in var_list])
    grads_and_vars = list(zip(grads, var_list))
    return loss, grads_and_vars
def build_reparam_loss(inference):
    r"""Build the reparameterization-gradient loss.

    Automatic differentiation of the loss is a stochastic gradient of

    .. math::

      -ELBO = -E_{q(z; \lambda)} [ \log p(x, z) - \log q(z; \lambda) ]

    based on the reparameterization trick. (Kingma and Welling, 2014)

    The expectation is estimated by Monte Carlo, drawing
    `inference.n_samples` samples from :math:`q(z;\lambda)`.

    Args:
      inference: inference object; its `n_samples`, `data`, `latent_vars`
        and `model_wrapper` attributes are read.

    Side effect: the loss is also stored on `inference.loss`.

    Returns:
      The loss: the negated mean of `p_log_prob - q_log_prob`.
    """
    p_terms = []
    q_terms = []
    for s in range(inference.n_samples):
        scope = 'inference_' + str(s)

        # One dictionary serves both as the set of posterior samples and,
        # once the observed variables are added below, as the substitution
        # map used when copying model nodes.
        swaps = {}
        q_term = 0.0
        for z, qz in six.iteritems(inference.latent_vars):
            # A fresh copy of q(z) yields a new posterior sample.
            swaps[z] = copy(qz, scope=scope).value()
            q_term += tf.reduce_sum(qz.log_prob(swaps[z]))

        for x, obs in six.iteritems(inference.data):
            if isinstance(x, RandomVariable):
                swaps[x] = obs

        if inference.model_wrapper is not None:
            x = inference.data
            p_term = inference.model_wrapper.log_prob(x, swaps)
        else:
            p_term = 0.0
            for z in six.iterkeys(inference.latent_vars):
                prior = copy(z, swaps, scope=scope)
                p_term += tf.reduce_sum(prior.log_prob(swaps[z]))

            for x, obs in six.iteritems(inference.data):
                if isinstance(x, RandomVariable):
                    likelihood = copy(x, swaps, scope=scope)
                    p_term += tf.reduce_sum(likelihood.log_prob(obs))

        p_terms.append(p_term)
        q_terms.append(q_term)

    p_log_prob = tf.pack(p_terms)
    q_log_prob = tf.pack(q_terms)
    inference.loss = -tf.reduce_mean(p_log_prob - q_log_prob)
    return inference.loss
def test_placeholder(self):
    """A copied product still evaluates from a feed to the placeholder."""
    with self.test_session() as sess:
        placeholder = tf.placeholder(tf.float32, name="CustomName")
        product = placeholder * tf.constant(3.0)
        product_copy = copy(product)
        # The feed targets the original placeholder, not a copy of it.
        result = sess.run(product_copy, feed_dict={placeholder: 4.0})
        self.assertEqual(result, 12.0)
def test_scan(self):
    """A `tf.scan` op and its copy evaluate to the same running sums."""
    expected = [2.0, 5.0, 6.0]
    with self.test_session():
        set_seed(42)
        elems = tf.constant([2.0, 3.0, 1.0])
        running_sum = tf.scan(lambda acc, elem: acc + elem, elems)
        self.assertAllClose(running_sum.eval(), expected)
        self.assertAllClose(copy(running_sum).eval(), expected)
def test_variable(self):
    """Copy a product that depends on a variable; after initializing the
    original variable the copy evaluates to 6."""
    with self.test_session():
        var = tf.Variable(2.0, name="CustomName")
        factor = tf.constant(3.0)
        product = var * factor
        product_copy = copy(product)
        init_op = tf.initialize_variables([var])
        init_op.run()
        self.assertEqual(product_copy.eval(), 6.0)
def test_placeholder_tensor(self):
    """Copy a product while swapping its placeholder input for a constant."""
    with self.test_session():
        placeholder = tf.placeholder(tf.float32, name="CustomName")
        factor = tf.constant(3.0)
        product = placeholder * factor
        replacement = tf.constant(4.0)
        swaps = {placeholder: replacement}
        product_copy = copy(product, swaps)
        self.assertEqual(product_copy.eval(), 12.0)
def test_tensor_tensor(self):
    """Copy a product while swapping one constant input for another
    constant."""
    with self.test_session():
        left = tf.constant(2.0)
        right = tf.constant(3.0)
        product = left * right
        replacement = tf.constant(4.0)
        swaps = {left: replacement}
        product_copy = copy(product, swaps)
        self.assertEqual(product_copy.eval(), 12.0)
def test_variable(self):
    """Copy a product that depends on a variable; after running the
    variable's initializer the copy evaluates to 6."""
    with self.test_session():
        var = tf.Variable(2.0, name="CustomName")
        factor = tf.constant(3.0)
        product = var * factor
        product_copy = copy(product)
        init_op = tf.variables_initializer([var])
        init_op.run()
        self.assertEqual(product_copy.eval(), 6.0)
def _log_joint(self, z_sample):
    """
    Compute the model's log joint density, log p(x, z), evaluated at the
    given latent values z with the data x held fixed.

    Parameters
    ----------
    z_sample : dict
        Latent variable keys to samples.
    """
    if self.model_wrapper is not None:
        # A model wrapper evaluates the log joint itself.
        return self.model_wrapper.log_prob(self.data, z_sample)

    scope = 'inference_' + str(id(self))

    # Substitution map: every prior / observed variable is replaced by a
    # specific value when the model's random variables are copied.
    swaps = z_sample.copy()
    for x, qx in six.iteritems(self.data):
        if not isinstance(x, RandomVariable):
            continue
        if isinstance(qx, RandomVariable):
            swaps[x] = copy(qx, scope=scope).value()
        else:
            swaps[x] = qx

    total = 0.0
    # Prior terms, optionally rescaled by self.scale.
    for z in six.iterkeys(self.latent_vars):
        z_copy = copy(z, swaps, scope=scope)
        term = tf.reduce_sum(z_copy.log_prob(swaps[z]))
        if z in self.scale:
            term *= self.scale[z]
        total += term

    # Likelihood terms, optionally rescaled by self.scale.
    for x in six.iterkeys(self.data):
        if not isinstance(x, RandomVariable):
            continue
        x_copy = copy(x, swaps, scope=scope)
        term = tf.reduce_sum(x_copy.log_prob(swaps[x]))
        if x in self.scale:
            term *= self.scale[x]
        total += term

    return total
def test_tensor_variable(self):
    """Copy a product while swapping a constant input for a variable."""
    with self.test_session():
        left = tf.constant(2.0)
        right = tf.constant(3.0)
        product = left * right
        replacement = tf.Variable(4.0, name="CustomName")
        product_copy = copy(product, {left: replacement})
        init_op = tf.variables_initializer([replacement])
        init_op.run()
        self.assertEqual(product_copy.eval(), 12.0)
def test_dict_rv_tensor(self):
    """Swap a random variable (key) for the value tensor of another random
    variable; the copied product is expected to exceed 5."""
    with self.test_session():
        set_seed(289362)
        prior = Normal(mu=0.0, sigma=0.1)
        one = tf.constant(1.0)
        product = prior * one
        shifted = Normal(mu=10.0, sigma=0.1)
        swaps = {prior: shifted.value()}
        product_copy = copy(product, swaps)
        self.assertGreater(product_copy.eval(), 5.0)
def test_dict_tensor_rv(self):
    """Swap a random variable's value tensor (key) for another random
    variable; the copied product is expected to exceed 5."""
    with self.test_session():
        set_seed(95258)
        prior = Normal(mu=0.0, sigma=0.1)
        one = tf.constant(1.0)
        product = prior * one
        shifted = Normal(mu=10.0, sigma=0.1)
        swaps = {prior.value(): shifted}
        product_copy = copy(product, swaps)
        self.assertGreater(product_copy.eval(), 5.0)
def test_scan_random(self):
    """Within a single `run` call, fetching the copied scan twice gives
    matching values, and likewise for the original scan."""
    with self.test_session() as session:
        set_seed(1234)
        noise = tf.random_normal([3])
        op = tf.scan(lambda acc, elem: acc + elem, noise)
        copy_op = copy(op)
        fetches = [copy_op, copy_op, op, op]
        first_copy, second_copy, first_op, second_op = session.run(fetches)
        self.assertAllClose(first_copy, second_copy)
        self.assertAllClose(first_op, second_op)
def test_dict_rv_rv(self):
    """Swap one random variable for another random variable; the copied
    product is expected to exceed 5."""
    with self.test_session():
        set_seed(325135)
        prior = Normal(mu=0.0, sigma=0.1)
        one = tf.constant(1.0)
        product = prior * one
        shifted = Normal(mu=10.0, sigma=0.1)
        swaps = {prior: shifted}
        product_copy = copy(product, swaps)
        self.assertGreater(product_copy.eval(), 5.0)
def test_tensor_variable(self):
    """Copy a product while swapping a constant input for a variable, then
    initialize that variable and check the copied value."""
    with self.test_session():
        left = tf.constant(2.0)
        right = tf.constant(3.0)
        product = left * right
        replacement = tf.Variable(4.0, name="CustomName")
        swaps = {left: replacement}
        product_copy = copy(product, swaps)
        tf.initialize_variables([replacement]).run()
        value = product_copy.eval()
        self.assertEqual(value, 12.0)
def build_loss_and_gradients(self, var_list):
    r"""Build loss function. Its automatic differentiation
    is a stochastic gradient of

    .. math::

      -E_{q(z^1; \lambda), ..., q(z^K; \lambda)} [
      \log 1/K \sum_{k=1}^K p(x, z^k)/q(z^k; \lambda) ]

    based on the reparameterization trick.

    Args:
      var_list: variables to differentiate the loss with respect to; each
        must provide a `_ref()` method.

    Returns:
      Tuple `(loss, grads_and_vars)` where `grads_and_vars` is a list of
      `(gradient, variable)` pairs.
    """
    # One log importance weight per k in range(self.K).
    log_weights = []
    scope_prefix = 'inference_' + str(id(self)) + '/'
    for k in range(self.K):
        scope = scope_prefix + str(k)

        # Copy q(z) to obtain a new set of posterior samples. The copied
        # random variable itself is used as the sample.
        z_sample = {}
        q_log_prob = 0.0
        for z, qz in six.iteritems(self.latent_vars):
            qz_copy = copy(qz, scope=scope)
            z_sample[z] = qz_copy
            q_log_prob += tf.reduce_sum(qz_copy.log_prob(qz_copy))

        p_log_prob = 0.0
        # Copy p(z), swapping its conditioning set with samples from the
        # variational distribution.
        for z in six.iterkeys(self.latent_vars):
            z_copy = copy(z, z_sample, scope=scope)
            p_log_prob += tf.reduce_sum(z_copy.log_prob(z_sample[z]))

        # Copy p(x | z), swapping its conditioning set with samples from
        # the variational distribution.
        for x, qx in six.iteritems(self.data):
            if not isinstance(x, RandomVariable):
                continue
            x_copy = copy(x, z_sample, scope=scope)
            p_log_prob += tf.reduce_sum(x_copy.log_prob(qx))

        log_weights.append(p_log_prob - q_log_prob)

    loss = -reduce_logmeanexp(log_weights)
    refs = [v._ref() for v in var_list]
    grads = tf.gradients(loss, refs)
    return loss, list(zip(grads, var_list))
def test_list(self):
    """Copy a Mixture whose list of components depends on `x`, swapping `x`
    for a sample of `y`; the copied value is expected to exceed 5."""
    with self.test_session():
        x = Normal(mu=tf.constant(0.0), sigma=tf.constant(0.1))
        y = Normal(mu=tf.constant(10.0), sigma=tf.constant(0.1))
        cat = Categorical(logits=tf.zeros(5))
        components = []
        for _ in range(5):
            components.append(Normal(mu=x, sigma=tf.constant(0.1)))
        mixture = Mixture(cat=cat, components=components)
        mixture_copy = copy(mixture, {x: y.value()})
        sample = mixture_copy.value().eval()
        self.assertGreater(sample, 5.0)
def build_loss_and_gradients(self, var_list):
    r"""Build loss function. Its automatic differentiation
    is the gradient of

    $- \log p(x,z).$

    The returned loss additionally includes the sum of the regularization
    losses reported by `tf.losses.get_regularization_losses()`.

    Args:
      var_list: variables to differentiate the loss with respect to.

    Returns:
      Tuple `(loss, grads_and_vars)` where `grads_and_vars` is a list of
      `(gradient, variable)` pairs.
    """
    # Form dictionary in order to replace conditioning on prior or
    # observed variable with conditioning on a specific value.
    scope = tf.get_default_graph().unique_name("inference")
    dict_swap = {
        z: qz.value()
        for z, qz in six.iteritems(self.latent_vars)
    }
    for x, qx in six.iteritems(self.data):
        if isinstance(x, RandomVariable):
            if isinstance(qx, RandomVariable):
                dict_swap[x] = qx.value()
            else:
                dict_swap[x] = qx

    p_log_prob = 0.0
    for z in six.iterkeys(self.latent_vars):
        z_copy = copy(z, dict_swap, scope=scope)
        p_log_prob += tf.reduce_sum(
            self.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))

    for x in six.iterkeys(self.data):
        if isinstance(x, RandomVariable):
            # dict_swap always contains at least `x` itself here (it was
            # added in the loop above), so the former "empty dict_swap"
            # fallback could never run and has been dropped.
            x_copy = copy(x, dict_swap, scope=scope)
            p_log_prob += tf.reduce_sum(
                self.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))

    reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
    loss = -p_log_prob + reg_penalty

    grads = tf.gradients(loss, var_list)
    grads_and_vars = list(zip(grads, var_list))
    return loss, grads_and_vars
def _replica_ratio(self, replica_ratio, replica_sample):
    """Reset `replica_ratio` to zeros and accumulate, for each replica i,
    the summed log-probabilities of the latent variables and of the
    observed random variables evaluated at that replica's sample.

    Args:
      replica_ratio: assignable tensor holding one entry per replica.
      replica_sample: indexable by replica; each entry is a dict merged
        into the substitution dictionary (presumably latent variable ->
        sample; confirm against caller).

    Returns:
      The updated `replica_ratio` tensor (result of the last update op).
    """
    ratio_dtype = list(self.latent_vars)[0].dtype
    initial = tf.zeros(self.n_replica, dtype=ratio_dtype)
    replica_ratio = tf.assign(replica_ratio, initial)

    # Values that observed random variables are conditioned on; shared by
    # all replicas.
    data_swap = {}
    for x, qx in six.iteritems(self.data):
        if not isinstance(x, RandomVariable):
            continue
        if isinstance(qx, RandomVariable):
            data_swap[x] = copy(qx, scope='conditional').value()
        else:
            data_swap[x] = qx

    for i in range(self.n_replica):
        swap_i = data_swap.copy()
        swap_i.update(replica_sample[i])

        scope_i = (tf.get_default_graph().unique_name("inference") + '/'
                   + '_%d' % i)

        # Prior contribution p(z_i).
        for z in six.iterkeys(self.latent_vars):
            z_i = copy(z, swap_i, scope=scope_i)
            current = replica_ratio[i]
            increment = tf.reduce_sum(z_i.log_prob(swap_i[z]))
            replica_ratio = tf.scatter_update(
                replica_ratio, i, current + increment)

        # Likelihood contribution p(x | z_i).
        for x in six.iterkeys(self.data):
            if not isinstance(x, RandomVariable):
                continue
            x_z_i = copy(x, swap_i, scope=scope_i)
            current = replica_ratio[i]
            increment = tf.reduce_sum(x_z_i.log_prob(data_swap[x]))
            replica_ratio = tf.scatter_update(
                replica_ratio, i, current + increment)

    return replica_ratio
def build_loss_and_gradients(self, var_list):
    r"""Build loss function. Its automatic differentiation
    is the gradient of

    .. math::

      - \log p(x,z)

    Args:
      var_list: variables to differentiate the loss with respect to; each
        must provide a `_ref()` method.

    Returns:
      Tuple `(loss, grads_and_vars)` where `grads_and_vars` is a list of
      `(gradient, variable)` pairs.
    """
    scope = 'inference_' + str(id(self))

    # Substitution map: latent variables are replaced by the value of their
    # approximating variable, observed variables by their data.
    swaps = {}
    for z, qz in six.iteritems(self.latent_vars):
        swaps[z] = qz.value()
    for x, qx in six.iteritems(self.data):
        if not isinstance(x, RandomVariable):
            continue
        swaps[x] = qx.value() if isinstance(qx, RandomVariable) else qx

    log_joint = 0.0
    for z in six.iterkeys(self.latent_vars):
        z_copy = copy(z, swaps, scope=scope)
        weight = self.scale.get(z, 1.0)
        log_joint += tf.reduce_sum(weight * z_copy.log_prob(swaps[z]))

    for x in six.iterkeys(self.data):
        if not isinstance(x, RandomVariable):
            continue
        x_copy = copy(x, swaps, scope=scope)
        weight = self.scale.get(x, 1.0)
        log_joint += tf.reduce_sum(weight * x_copy.log_prob(swaps[x]))

    loss = -log_joint

    refs = [v._ref() for v in var_list]
    grads = tf.gradients(loss, refs)
    return loss, list(zip(grads, var_list))
def test_queue(self):
    """Copy a product that depends on a queue-backed batch and check that
    successive runs of the copy yield successive batches."""
    with self.test_session() as sess:
        tensor = tf.constant([0.0, 1.0, 2.0, 3.0])
        x = tf.train.batch([tensor], batch_size=2, enqueue_many=True,
                           name='CustomName')
        y = tf.constant(3.0)
        z = x * y
        z_new = copy(z)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        try:
            self.assertAllEqual(sess.run(z_new), np.array([0.0, 3.0]))
            self.assertAllEqual(sess.run(z_new), np.array([6.0, 9.0]))
        finally:
            # Always stop and join the queue-runner threads, even when an
            # assertion above fails, so they do not outlive the test.
            coord.request_stop()
            coord.join(threads)
def _replica_ratio(self, replica_ratio, replica_sample):
    """Zero out `replica_ratio`, then add to entry i the summed
    log-probabilities of every latent variable and every observed random
    variable, evaluated with replica i's sample substituted in.

    Args:
      replica_ratio: assignable tensor holding one entry per replica.
      replica_sample: indexable by replica; each entry is a dict merged
        into the substitution dictionary (presumably latent variable ->
        sample; confirm against caller).

    Returns:
      The updated `replica_ratio` tensor (result of the last update op).
    """
    first_latent = list(self.latent_vars)[0]
    replica_ratio = tf.assign(
        replica_ratio, tf.zeros(self.n_replica, dtype=first_latent.dtype))

    # Conditioning values for the observed random variables.
    observed = {}
    for x, qx in six.iteritems(self.data):
        if isinstance(x, RandomVariable):
            if not isinstance(qx, RandomVariable):
                observed[x] = qx
            else:
                qx_copy = copy(qx, scope='conditional')
                observed[x] = qx_copy.value()

    graph_rvs = list(six.iterkeys(self.latent_vars))
    data_rvs = [x for x in six.iterkeys(self.data)
                if isinstance(x, RandomVariable)]

    for i in range(self.n_replica):
        swaps = observed.copy()
        swaps.update(replica_sample[i])
        prefix = tf.get_default_graph().unique_name("inference") + '/'
        scope_i = prefix + '_%d' % i

        for z in graph_rvs:
            # Build prior p(z_i) and add its log-probability to entry i.
            prior_i = copy(z, swaps, scope=scope_i)
            replica_ratio = tf.scatter_update(
                replica_ratio, i,
                replica_ratio[i] + tf.reduce_sum(prior_i.log_prob(swaps[z])))

        for x in data_rvs:
            # Build likelihood p(x | z_i) and add its log-probability to
            # entry i.
            lik_i = copy(x, swaps, scope=scope_i)
            replica_ratio = tf.scatter_update(
                replica_ratio, i,
                replica_ratio[i]
                + tf.reduce_sum(lik_i.log_prob(observed[x])))

    return replica_ratio
def build_loss_and_gradients(self, var_list):
    r"""Build loss function. Its automatic differentiation
    is a stochastic gradient of

    .. math::

      -E_{q(z^1; \lambda), ..., q(z^K; \lambda)} [
      \log 1/K \sum_{k=1}^K p(x, z^k)/q(z^k; \lambda) ]

    based on the score function estimator. (Paisley et al., 2012)

    Computed by sampling from :math:`q(z;\lambda)` and evaluating
    the expectation using Monte Carlo sampling. Note there is a
    difference between the number of samples to approximate the
    expectations (`n_samples`) and the number of importance
    samples to determine how many expectations (`K`).

    Parameters
    ----------
    var_list : list or None
        Variables to differentiate with respect to. If None, all
        trainable variables are used.

    Returns
    -------
    Tuple ``(loss, grads_and_vars)`` where ``grads_and_vars`` is a list of
    ``(gradient, variable)`` pairs.
    """
    x = self.data
    # Form n_samples x K matrices of log importance weights and of the
    # log q(z) values that produced them.
    log_w = []
    q_log_probs = []
    for s in range(self.n_samples * self.K):
        z_sample = {}
        q_log_prob = 0.0
        for z, qz in six.iteritems(self.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            qz_copy = copy(qz, scope='inference_' + str(s))
            z_sample[z] = qz_copy.value()
            q_log_prob += tf.reduce_sum(
                qz.log_prob(tf.stop_gradient(z_sample[z])))

        p_log_prob = self.model_wrapper.log_prob(x, z_sample)
        log_w += [p_log_prob - q_log_prob]
        q_log_probs += [q_log_prob]

    log_w = tf.reshape(log_w, [self.n_samples, self.K])
    # Take log mean exp across importance weights (columns).
    losses = log_mean_exp(log_w, 1)
    loss = -tf.reduce_mean(losses)

    # Each row's loss depends on all K draws of that row, so its score
    # term is the sum of log q over those K draws. Previously only the
    # `q_log_prob` of the very last draw was used for every row.
    # NOTE(review): assumes `losses` has shape [n_samples] - confirm
    # against `log_mean_exp`.
    q_log_probs = tf.reshape(q_log_probs, [self.n_samples, self.K])
    row_q_log_prob = tf.reduce_sum(q_log_probs, 1)

    if var_list is None:
        var_list = tf.trainable_variables()

    grads = tf.gradients(
        -tf.reduce_mean(row_q_log_prob * tf.stop_gradient(losses)),
        [v.ref() for v in var_list])
    grads_and_vars = list(zip(grads, var_list))
    return loss, grads_and_vars
def build_loss_and_gradients(self, var_list):
    r"""Build loss function. Its automatic differentiation
    is a stochastic gradient of

    .. math::

      -E_{q(z^1; \lambda), ..., q(z^K; \lambda)} [
      \log 1/K \sum_{k=1}^K p(x, z^k)/q(z^k; \lambda) ]

    based on the score function estimator. (Paisley et al., 2012)

    Computed by sampling from :math:`q(z;\lambda)` and evaluating
    the expectation using Monte Carlo sampling. Note there is a
    difference between the number of samples to approximate the
    expectations (`n_samples`) and the number of importance
    samples to determine how many expectations (`K`).

    Parameters
    ----------
    var_list : list or None
        Variables to differentiate with respect to. If None, all
        trainable variables are used.

    Returns
    -------
    Tuple ``(loss, grads_and_vars)`` where ``grads_and_vars`` is a list of
    ``(gradient, variable)`` pairs.
    """
    x = self.data
    # Form n_samples x K matrices of log importance weights and of the
    # per-draw log q(z) values.
    log_w = []
    q_log_probs = []
    for s in range(self.n_samples * self.K):
        z_sample = {}
        q_log_prob = 0.0
        for z, qz in six.iteritems(self.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            qz_copy = copy(qz, scope='inference_' + str(s))
            z_sample[z] = qz_copy.value()
            q_log_prob += tf.reduce_sum(
                qz.log_prob(tf.stop_gradient(z_sample[z])))

        p_log_prob = self.model_wrapper.log_prob(x, z_sample)
        log_w += [p_log_prob - q_log_prob]
        q_log_probs += [q_log_prob]

    log_w = tf.reshape(log_w, [self.n_samples, self.K])
    # Take log mean exp across importance weights (columns).
    losses = log_mean_exp(log_w, 1)
    loss = -tf.reduce_mean(losses)

    # Pair every row's loss with the summed log q of the K draws in that
    # row. The earlier code reused the scalar `q_log_prob` left over from
    # the final loop iteration for all rows.
    # NOTE(review): assumes `losses` has shape [n_samples] - confirm
    # against `log_mean_exp`.
    q_log_probs = tf.reshape(q_log_probs, [self.n_samples, self.K])
    row_q_log_prob = tf.reduce_sum(q_log_probs, 1)

    if var_list is None:
        var_list = tf.trainable_variables()

    grads = tf.gradients(
        -tf.reduce_mean(row_q_log_prob * tf.stop_gradient(losses)),
        [v.ref() for v in var_list])
    grads_and_vars = list(zip(grads, var_list))
    return loss, grads_and_vars
def build_reparam_loss(self):
    r"""Build loss function. Its automatic differentiation
    is a stochastic gradient of

    .. math::

      -E_{q(z^1; \lambda), ..., q(z^K; \lambda)} [
      \log 1/K \sum_{k=1}^K p(x, z^k)/q(z^k; \lambda) ]

    based on the reparameterization trick. (Kingma and Welling, 2014)

    Computed by sampling from :math:`q(z;\lambda)` and evaluating
    the expectation using Monte Carlo sampling. Note there is a
    difference between the number of samples to approximate the
    expectations (`n_samples`) and the number of importance
    samples to determine how many expectations (`K`).

    The mean of the per-row log-mean-exp values is stored in `self.loss`;
    its negation is returned.
    """
    data = self.data
    n_draws = self.n_samples * self.K

    # Collect one log importance weight per draw; reshaped below into an
    # n_samples x K matrix.
    log_weights = []
    for s in range(n_draws):
        scope = 'inference_' + str(s)
        z_sample = {}
        q_log_prob = 0.0
        for z, qz in six.iteritems(self.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            z_sample[z] = copy(qz, scope=scope).value()
            q_log_prob += tf.reduce_sum(qz.log_prob(z_sample[z]))

        p_log_prob = self.model_wrapper.log_prob(data, z_sample)
        log_weights.append(p_log_prob - q_log_prob)

    weight_matrix = tf.reshape(log_weights, [self.n_samples, self.K])
    # Take log mean exp across importance weights (columns).
    per_row = log_mean_exp(weight_matrix, 1)
    self.loss = tf.reduce_mean(per_row)
    return -self.loss
def build_reparam_kl_loss_and_gradients(inference, var_list):
    r"""Build loss function. Its automatic differentiation
    is a stochastic gradient of

    .. math::

      -\text{ELBO} =  - ( \mathbb{E}_{q(z; \lambda)} [ \log p(x \mid z) ]
            - \text{KL}(q(z; \lambda) \| p(z)) )

    based on the reparameterization trick [@kingma2014auto].

    It assumes the KL is analytic.

    Computed by sampling from $q(z;\lambda)$ and evaluating the
    expectation using Monte Carlo sampling.

    The returned loss also adds the sum of
    `tf.losses.get_regularization_losses()`. When `inference.logging` is
    set, scalar summaries of the three loss terms are created.

    Args:
      inference: inference object; its `n_samples`, `data`, `latent_vars`,
        `scale`, `kl_scaling`, `logging` and `_summary_key` attributes are
        read.
      var_list: variables to differentiate the loss with respect to.

    Returns:
      Tuple `(loss, grads_and_vars)` where `grads_and_vars` is a list of
      `(gradient, variable)` pairs.
    """
    graph = tf.get_default_graph()
    base_scope = graph.unique_name("inference") + '/'

    sample_log_liks = []
    for _ in range(inference.n_samples):
        scope = base_scope + tf.get_default_graph().unique_name("sample")

        # Substitution map: observed variables map to their data, latent
        # variables to a fresh posterior sample.
        swaps = {}
        for x, qx in six.iteritems(inference.data):
            if not isinstance(x, RandomVariable):
                continue
            if isinstance(qx, RandomVariable):
                swaps[x] = copy(qx, scope=scope).value()
            else:
                swaps[x] = qx

        for z, qz in six.iteritems(inference.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            swaps[z] = copy(qz, scope=scope).value()

        log_lik = 0.0
        for x in six.iterkeys(inference.data):
            if not isinstance(x, RandomVariable):
                continue
            x_copy = copy(x, swaps, scope=scope)
            weight = inference.scale.get(x, 1.0)
            log_lik += tf.reduce_sum(weight * x_copy.log_prob(swaps[x]))
        sample_log_liks.append(log_lik)

    p_log_lik = tf.reduce_mean(sample_log_liks)

    kl_terms = []
    for z, qz in six.iteritems(inference.latent_vars):
        kl_weight = inference.kl_scaling.get(z, 1.0)
        kl_terms.append(tf.reduce_sum(kl_weight * kl_divergence(qz, z)))
    kl_penalty = tf.reduce_sum(kl_terms)

    reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())

    if inference.logging:
        summaries = (
            ("loss/p_log_lik", p_log_lik),
            ("loss/kl_penalty", kl_penalty),
            ("loss/reg_penalty", reg_penalty),
        )
        for tag, tensor in summaries:
            tf.summary.scalar(tag, tensor,
                              collections=[inference._summary_key])

    loss = -(p_log_lik - kl_penalty - reg_penalty)

    grads = tf.gradients(loss, var_list)
    return loss, list(zip(grads, var_list))
def _test_copy(self, RV, value, *args, **kwargs):
    """Construct `RV` with the given `value`, copy it, and assert that the
    value tensors of the original and the copy have equal shapes."""
    original = RV(*args, value=value, **kwargs)
    duplicate = copy(original)
    self.assertEqual(original.value().shape, duplicate.value().shape)
def build_loss_and_gradients(self, var_list):
    r"""Build loss function

    .. math::

      \text{KL}( p(z \mid x) || q(z) )
      = \mathbb{E}_{p(z \mid x)} [ \log p(z \mid x) - \log q(z; \lambda) ]

    and stochastic gradients based on importance sampling.

    The loss function can be estimated as

    .. math::

      \frac{1}{B} \sum_{b=1}^B [
        w_{norm}(z^b; \lambda) (\log p(x, z^b) - \log q(z^b; \lambda) ],

    where for :math:`z^b \sim q(z^b; \lambda)`,

    .. math::

      w_{norm}(z^b; \lambda) = w(z^b; \lambda) / \sum_{b=1}^B w(z^b; \lambda)

    normalizes the importance weights, :math:`w(z^b; \lambda) = p(x,
    z^b) / q(z^b; \lambda)`.

    This provides a gradient,

    .. math::

      - \frac{1}{B} \sum_{b=1}^B [
        w_{norm}(z^b; \lambda) \nabla_{\lambda} \log q(z^b; \lambda) ].

    Parameters
    ----------
    var_list : list or None
        Variables to differentiate with respect to. If None, all
        trainable variables are used.

    Returns
    -------
    Tuple ``(loss, grads_and_vars)`` where ``grads_and_vars`` is a list of
    ``(gradient, variable)`` pairs.
    """
    p_terms = []
    q_terms = []
    scope_prefix = 'inference_' + str(id(self)) + '/'
    for s in range(self.n_samples):
        scope = scope_prefix + str(s)

        z_sample = {}
        q_s = 0.0
        for z, qz in six.iteritems(self.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            z_sample[z] = copy(qz, scope=scope).value()
            q_s += tf.reduce_sum(
                qz.log_prob(tf.stop_gradient(z_sample[z])))

        if self.model_wrapper is not None:
            p_s = self.model_wrapper.log_prob(self.data, z_sample)
        else:
            # Form dictionary in order to replace conditioning on prior or
            # observed variable with conditioning on a specific value.
            # (Same dict object as z_sample, extended with observed data.)
            swaps = z_sample
            for x, qx in six.iteritems(self.data):
                if not isinstance(x, RandomVariable):
                    continue
                if isinstance(qx, RandomVariable):
                    swaps[x] = copy(qx, scope=scope).value()
                else:
                    swaps[x] = qx

            p_s = 0.0
            for z in six.iterkeys(self.latent_vars):
                z_copy = copy(z, swaps, scope=scope)
                p_s += tf.reduce_sum(z_copy.log_prob(swaps[z]))

            for x in six.iterkeys(self.data):
                if not isinstance(x, RandomVariable):
                    continue
                x_copy = copy(x, swaps, scope=scope)
                p_s += tf.reduce_sum(x_copy.log_prob(swaps[x]))

        p_terms.append(p_s)
        q_terms.append(q_s)

    p_log_prob = tf.pack(p_terms)
    q_log_prob = tf.pack(q_terms)

    # Self-normalized importance weights, computed in log space.
    log_w = p_log_prob - q_log_prob
    w_norm = tf.exp(log_w - log_sum_exp(log_w))

    if var_list is None:
        var_list = tf.trainable_variables()

    loss = tf.reduce_mean(w_norm * log_w)
    surrogate = -tf.reduce_mean(q_log_prob * tf.stop_gradient(w_norm))
    grads = tf.gradients(surrogate, [v.ref() for v in var_list])
    return loss, list(zip(grads, var_list))
def build_update(self):
    r"""Draw sample from proposal conditional on last sample. Then
    accept or reject the sample based on the ratio,

    $\text{ratio} =
          \log p(x, z^{\text{new}}) - \log p(x, z^{\text{old}}) -
          \log g(z^{\text{new}} \mid z^{\text{old}}) +
          \log g(z^{\text{old}} \mid z^{\text{new}})$

    The proposal is accepted when $\log u < \text{ratio}$ for a draw `u`
    from `Uniform()`.

    Returns:
      A grouped op that writes the accepted (or retained) sample into each
      Empirical variable at index `self.t` and adds 1 to `self.n_accept`
      when the proposal was accepted.

    #### Notes

    The updates assume each Empirical random variable is directly
    parameterized by `tf.Variable`s.
    """
    # Previous state: entry t - 1 of each Empirical's params (entry 0 when
    # t is 0).
    old_sample = {z: tf.gather(qz.params, tf.maximum(self.t - 1, 0))
                  for z, qz in six.iteritems(self.latent_vars)}
    old_sample = OrderedDict(old_sample)

    # Form dictionary in order to replace conditioning on prior or
    # observed variable with conditioning on a specific value.
    dict_swap = {}
    for x, qx in six.iteritems(self.data):
        if isinstance(x, RandomVariable):
            if isinstance(qx, RandomVariable):
                qx_copy = copy(qx, scope='conditional')
                dict_swap[x] = qx_copy.value()
            else:
                dict_swap[x] = qx

    dict_swap_old = dict_swap.copy()
    dict_swap_old.update(old_sample)
    scope_old = 'inference_' + str(id(self)) + '/old'
    scope_new = 'inference_' + str(id(self)) + '/new'

    # Draw proposed sample and calculate acceptance ratio.
    new_sample = old_sample.copy()  # copy to ensure same order
    ratio = 0.0
    for z, proposal_z in six.iteritems(self.proposal_vars):
        # Build proposal g(znew | zold).
        proposal_znew = copy(proposal_z, dict_swap_old, scope=scope_old)
        # Sample znew ~ g(znew | zold).
        new_sample[z] = proposal_znew.value()
        # The forward proposal density sits in the denominator of the
        # Hastings ratio, so its log is subtracted.
        ratio -= tf.reduce_sum(proposal_znew.log_prob(new_sample[z]))

    dict_swap_new = dict_swap.copy()
    dict_swap_new.update(new_sample)

    for z, proposal_z in six.iteritems(self.proposal_vars):
        # Build proposal g(zold | znew).
        proposal_zold = copy(proposal_z, dict_swap_new, scope=scope_new)
        # The reverse proposal density sits in the numerator of the
        # Hastings ratio, so its log is added.
        ratio += tf.reduce_sum(proposal_zold.log_prob(dict_swap_old[z]))

    for z in six.iterkeys(self.latent_vars):
        # Build priors p(znew) and p(zold).
        znew = copy(z, dict_swap_new, scope=scope_new)
        zold = copy(z, dict_swap_old, scope=scope_old)
        # Increment ratio.
        ratio += tf.reduce_sum(znew.log_prob(dict_swap_new[z]))
        ratio -= tf.reduce_sum(zold.log_prob(dict_swap_old[z]))

    for x in six.iterkeys(self.data):
        if isinstance(x, RandomVariable):
            # Build likelihoods p(x | znew) and p(x | zold).
            x_znew = copy(x, dict_swap_new, scope=scope_new)
            x_zold = copy(x, dict_swap_old, scope=scope_old)
            # Increment ratio.
            ratio += tf.reduce_sum(x_znew.log_prob(dict_swap[x]))
            ratio -= tf.reduce_sum(x_zold.log_prob(dict_swap[x]))

    # Accept or reject sample.
    u = Uniform().sample()
    accept = tf.log(u) < ratio
    sample_values = tf.cond(accept,
                            lambda: list(six.itervalues(new_sample)),
                            lambda: list(six.itervalues(old_sample)))
    if not isinstance(sample_values, list):
        # `tf.cond` returns tf.Tensor if output is a list of size 1.
        sample_values = [sample_values]

    sample = {z: sample_value for z, sample_value in
              zip(six.iterkeys(new_sample), sample_values)}

    # Update Empirical random variables.
    assign_ops = []
    for z, qz in six.iteritems(self.latent_vars):
        variable = qz.get_variables()[0]
        assign_ops.append(tf.scatter_update(variable, self.t, sample[z]))

    # Increment n_accept (if accepted).
    assign_ops.append(self.n_accept.assign_add(tf.where(accept, 1, 0)))
    return tf.group(*assign_ops)
def complete_conditional(rv, cond_set=None):
    """Returns the conditional distribution `RandomVariable`
    $p(\\text{rv}\mid \cdot)$.

    This function tries to infer the conditional distribution of `rv`
    given `cond_set`, a set of other `RandomVariable`s in the graph. It
    will only be able to do this if

    1. $p(\\text{rv}\mid \\text{cond\_set})$ is in a tractable
       exponential family; and
    2. the truth of assumption 1 is not obscured in the TensorFlow graph.

    In other words, this function will do its best to recognize conjugate
    relationships when they exist. But it may not always be able to do the
    necessary algebra.

    Args:
      rv: RandomVariable.
        The random variable whose conditional distribution we are interested
        in.
      cond_set: iterable of RandomVariable, optional.
        The set of random variables we want to condition on. Default is all
        random variables in the graph. (It makes no difference if `cond_set`
        does or does not include `rv`.)

    Returns:
      The object produced by the matched distribution constructor, created
      with `name='cond_dist'` and parameters derived from the natural
      parameters computed below.

    Raises:
      NotImplementedError: if no entry of `_suff_stat_to_dist[rv.support]`
        matches the sufficient statistics found for `rv`.

    #### Notes

    When calling `complete_conditional()` multiple times, one should
    usually pass an explicit `cond_set`. Otherwise
    `complete_conditional()` will try to condition on the
    `RandomVariable`s returned by previous calls to itself. This may
    result in unpredictable behavior.
    """
    if cond_set is None:
        # Default to Markov blanket, excluding conditionals. This is useful if
        # calling complete_conditional many times without passing in cond_set.
        cond_set = get_blanket(rv)
        # Results of earlier calls are recognized by their name: it contains
        # both the name scope opened below and the 'cond_dist' name given to
        # the returned distribution.
        cond_set = [i for i in cond_set if not
                    ('complete_conditional' in i.name and
                     'cond_dist' in i.name)]

    # Always include rv itself in the conditioning set.
    cond_set = set([rv] + list(cond_set))
    with tf.name_scope('complete_conditional_%s' % rv.name) as scope:
        # log_joint holds all the information we need to get a conditional.
        log_joint = get_log_joint(cond_set)

        # Pull out the nodes that are nonlinear functions of rv into s_stats.
        stop_nodes = set([i.value() for i in cond_set])
        subgraph = extract_subgraph(log_joint, stop_nodes)
        s_stats = suff_stat_nodes(subgraph, rv.value(), cond_set)
        s_stats = list(set(s_stats))  # drop duplicate nodes

        # Simplify those nodes, and put any new linear terms into
        # multipliers_i. Nodes that simplify to the same statistic are
        # grouped under one key.
        s_stat_exprs = defaultdict(list)
        for s_stat in s_stats:
            expr = symbolic_suff_stat(s_stat, rv.value(), stop_nodes)
            expr = full_simplify(expr)
            multipliers_i, s_stats_i = extract_s_stat_multipliers(expr)
            s_stat_exprs[s_stats_i].append(
                (s_stat, reconstruct_multiplier(multipliers_i)))

        # Sort out the sufficient statistics to identify this conditional's
        # family. Keys are ordered by their string form so the lookup key
        # does not depend on dict iteration order.
        s_stat_keys = list(six.iterkeys(s_stat_exprs))
        order = np.argsort([str(i) for i in s_stat_keys])
        dist_key = tuple((s_stat_keys[i] for i in order))
        dist_constructor, constructor_params = (
            _suff_stat_to_dist[rv.support].get(dist_key, (None, None)))
        if dist_constructor is None:
            raise NotImplementedError('Conditional distribution has sufficient '
                                      'statistics %s, but no available '
                                      'exponential-family distribution has those '
                                      'sufficient statistics.' % str(dist_key))

        # Swap sufficient statistics for placeholders, then take gradients
        # w.r.t. those placeholders to get natural parameters. The original
        # nodes involving the sufficient statistic nodes are swapped for new
        # nodes that depend linearly on the sufficient statistic placeholders.
        s_stat_placeholders = []
        swap_dict = {}
        swap_back = {}
        for s_stat_expr in six.itervalues(s_stat_exprs):
            # One placeholder per group, shaped like the group's first node.
            s_stat_placeholder = tf.placeholder(tf.float32,
                                                s_stat_expr[0][0].get_shape())
            swap_back[s_stat_placeholder] = tf.cast(rv.value(), tf.float32)
            s_stat_placeholders.append(s_stat_placeholder)
            for s_stat_node, multiplier in s_stat_expr:
                fake_node = s_stat_placeholder * multiplier
                swap_dict[s_stat_node] = fake_node
                swap_back[fake_node] = s_stat_node

        # Values of the other conditioning variables are also swapped for
        # placeholders in the copy, and restored by swap_back afterwards.
        for i in cond_set:
            if i != rv:
                val = i.value()
                val_placeholder = tf.placeholder(val.dtype)
                swap_dict[val] = val_placeholder
                swap_back[val_placeholder] = val
                swap_back[val] = val  # prevent random variable nodes from being copied

        scope_name = scope + str(time.time())  # ensure unique scope when copying
        log_joint_copy = copy(log_joint, swap_dict, scope=scope_name + 'swap')
        nat_params = tf.gradients(log_joint_copy, s_stat_placeholders)

        # Remove any dependencies on those old placeholders.
        nat_params = [copy(nat_param, swap_back, scope=scope_name + 'swapback')
                      for nat_param in nat_params]
        # Reorder to match the sorted sufficient-statistic keys in dist_key.
        nat_params = [nat_params[i] for i in order]

    return dist_constructor(name='cond_dist', **constructor_params(*nat_params))
def build_score_rb_loss_and_gradients(inference, var_list):
    r"""Build loss function and gradients based on the score function
    estimator [@paisley2012variational] and Rao-Blackwellization
    [@ranganath2014black].

    Computed by sampling from :math:`q(z;\lambda)` and evaluating the
    expectation using Monte Carlo sampling and Rao-Blackwellization.

    Args:
      inference: inference object; its `n_samples`, `data`, `latent_vars`
        and `scale` attributes are read.
      var_list: variables to compute gradients for. Variables with at least
        one descendant among the variational factors get a
        Rao-Blackwellized gradient; all remaining variables are
        differentiated against the total loss.

    Returns:
      Tuple `(loss, grads_and_vars)` where `grads_and_vars` is a list of
      `(gradient, variable)` pairs. Variational parameters come first, so
      the order may differ from `var_list`.
    """
    # Build tensors for loss and gradient calculations. There is one set
    # for each sample from the variational distribution. Each sample needs
    # its own dict: `[{}] * n` would repeat a single shared dict, so later
    # samples would overwrite the entries of earlier ones.
    p_log_probs = [{} for _ in range(inference.n_samples)]
    q_log_probs = [{} for _ in range(inference.n_samples)]
    base_scope = tf.get_default_graph().unique_name("inference") + '/'
    for s in range(inference.n_samples):
        # Form dictionary in order to replace conditioning on prior or
        # observed variable with conditioning on a specific value.
        scope = base_scope + tf.get_default_graph().unique_name("sample")
        dict_swap = {}
        for x, qx in six.iteritems(inference.data):
            if isinstance(x, RandomVariable):
                if isinstance(qx, RandomVariable):
                    qx_copy = copy(qx, scope=scope)
                    dict_swap[x] = qx_copy.value()
                else:
                    dict_swap[x] = qx

        for z, qz in six.iteritems(inference.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            qz_copy = copy(qz, scope=scope)
            dict_swap[z] = qz_copy.value()
            q_log_probs[s][qz] = tf.reduce_sum(
                inference.scale.get(z, 1.0) *
                qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))

        for z in six.iterkeys(inference.latent_vars):
            z_copy = copy(z, dict_swap, scope=scope)
            p_log_probs[s][z] = tf.reduce_sum(
                inference.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))

        for x in six.iterkeys(inference.data):
            if isinstance(x, RandomVariable):
                x_copy = copy(x, dict_swap, scope=scope)
                p_log_probs[s][x] = tf.reduce_sum(
                    inference.scale.get(x, 1.0) *
                    x_copy.log_prob(dict_swap[x]))

    # Take gradients of Rao-Blackwellized loss for each variational
    # parameter.
    p_rvs = list(six.iterkeys(inference.latent_vars)) + \
        [x for x in six.iterkeys(inference.data)
         if isinstance(x, RandomVariable)]
    q_rvs = list(six.itervalues(inference.latent_vars))
    reverse_latent_vars = {v: k for k, v in
                           six.iteritems(inference.latent_vars)}
    grads = []
    grads_vars = []
    for var in var_list:
        # Get all variational factors depending on the parameter.
        descendants = get_descendants(tf.convert_to_tensor(var), q_rvs)
        if len(descendants) == 0:
            continue  # skip if not a variational parameter
        # Get p and q's Markov blanket wrt these latent variables.
        var_p_rvs = set()
        for qz in descendants:
            z = reverse_latent_vars[qz]
            var_p_rvs.update(z.get_blanket(p_rvs) + [z])

        var_q_rvs = set()
        for qz in descendants:
            var_q_rvs.update(qz.get_blanket(q_rvs) + [qz])

        # Per-sample sums restricted to the terms in the Markov blankets.
        pi_log_prob = [0.0] * inference.n_samples
        qi_log_prob = [0.0] * inference.n_samples
        for s in range(inference.n_samples):
            pi_log_prob[s] = tf.reduce_sum([p_log_probs[s][rv]
                                            for rv in var_p_rvs])
            qi_log_prob[s] = tf.reduce_sum([q_log_probs[s][rv]
                                            for rv in var_q_rvs])

        pi_log_prob = tf.stack(pi_log_prob)
        qi_log_prob = tf.stack(qi_log_prob)
        grad = tf.gradients(
            -tf.reduce_mean(qi_log_prob *
                            tf.stop_gradient(pi_log_prob - qi_log_prob)) +
            tf.reduce_sum(tf.losses.get_regularization_losses()),
            var)
        grads.extend(grad)
        grads_vars.append(var)

    # Take gradients of total loss function for model parameters.
    loss = -(tf.reduce_mean([tf.reduce_sum(list(six.itervalues(p_log_prob)))
                             for p_log_prob in p_log_probs]) -
             tf.reduce_mean([tf.reduce_sum(list(six.itervalues(q_log_prob)))
                             for q_log_prob in q_log_probs]) -
             tf.reduce_sum(tf.losses.get_regularization_losses()))
    model_vars = [v for v in var_list if v not in grads_vars]
    model_grads = tf.gradients(loss, model_vars)
    grads.extend(model_grads)
    grads_vars.extend(model_vars)
    grads_and_vars = list(zip(grads, grads_vars))
    return loss, grads_and_vars
def build_score_entropy_loss_and_gradients(inference, var_list):
    r"""Build loss function and score-function gradients (analytic entropy).

    Based on the score function estimator (Paisley et al., 2012). It
    assumes the entropy is analytic.

    Computed by sampling from :math:`q(z;\lambda)` and evaluating the
    expectation using Monte Carlo sampling.

    Args:
      inference: object whose `n_samples`, `latent_vars`, `data`, `scale`
        and `model_wrapper` attributes are read here.
      var_list: list of variables to differentiate with respect to, or
        None to use `tf.trainable_variables()`.

    Returns:
      Tuple `(loss, grads_and_vars)` where `grads_and_vars` is a list of
      `(gradient, variable)` pairs aligned with `var_list`.
    """
    p_log_prob = [0.0] * inference.n_samples
    q_log_prob = [0.0] * inference.n_samples
    for s in range(inference.n_samples):
        scope = 'inference_' + str(id(inference)) + '/' + str(s)
        z_sample = {}
        for z, qz in six.iteritems(inference.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            qz_copy = copy(qz, scope=scope)
            z_sample[z] = qz_copy.value()
            # stop_gradient: differentiate the log-density only, not the
            # sample it is evaluated at.
            z_log_prob = tf.reduce_sum(
                qz.log_prob(tf.stop_gradient(z_sample[z])))
            if z in inference.scale:
                z_log_prob *= inference.scale[z]

            q_log_prob[s] += z_log_prob

        if inference.model_wrapper is None:
            # Form dictionary in order to replace conditioning on prior or
            # observed variable with conditioning on a specific value.
            dict_swap = z_sample
            for x, qx in six.iteritems(inference.data):
                if isinstance(x, RandomVariable):
                    if isinstance(qx, RandomVariable):
                        qx_copy = copy(qx, scope=scope)
                        dict_swap[x] = qx_copy.value()
                    else:
                        dict_swap[x] = qx

            for z in six.iterkeys(inference.latent_vars):
                z_copy = copy(z, dict_swap, scope=scope)
                z_log_prob = tf.reduce_sum(z_copy.log_prob(dict_swap[z]))
                if z in inference.scale:
                    z_log_prob *= inference.scale[z]

                p_log_prob[s] += z_log_prob

            for x in six.iterkeys(inference.data):
                if isinstance(x, RandomVariable):
                    x_copy = copy(x, dict_swap, scope=scope)
                    x_log_prob = tf.reduce_sum(
                        x_copy.log_prob(dict_swap[x]))
                    if x in inference.scale:
                        x_log_prob *= inference.scale[x]

                    p_log_prob[s] += x_log_prob
        else:
            x = inference.data
            p_log_prob[s] = inference.model_wrapper.log_prob(x, z_sample)

    p_log_prob = tf.pack(p_log_prob)
    q_log_prob = tf.pack(q_log_prob)

    # Weight each entropy by the variable's entry in `inference.scale`
    # (1.0 when absent), the same weights applied to the log-density terms
    # above. The weight must come from `scale`, not from `data`: `data`
    # holds observed values, not scaling factors.
    q_entropy = tf.reduce_sum([
        inference.scale.get(z, 1.0) * qz.entropy()
        for z, qz in six.iteritems(inference.latent_vars)])

    if var_list is None:
        var_list = tf.trainable_variables()

    loss = -(tf.reduce_mean(p_log_prob) + q_entropy)
    # The gradient is taken of a surrogate in which log p is held constant.
    grads = tf.gradients(
        -(tf.reduce_mean(q_log_prob * tf.stop_gradient(p_log_prob)) +
          q_entropy),
        [v.ref() for v in var_list])
    grads_and_vars = list(zip(grads, var_list))
    return loss, grads_and_vars
def build_reparam_entropy_loss_and_gradients(inference, var_list):
    r"""Build loss function. Its automatic differentiation
    is a stochastic gradient of

    .. math::

      -\text{ELBO} =  -( \mathbb{E}_{q(z; \lambda)} [ \log p(x , z) ]
            + \mathbb{H}(q(z; \lambda)) )

    based on the reparameterization trick (Kingma and Welling, 2014).

    It assumes the entropy is analytic.

    Computed by sampling from :math:`q(z;\lambda)` and evaluating the
    expectation using Monte Carlo sampling.

    Args:
      inference: object whose `n_samples`, `latent_vars`, `data`, `scale`
        and `model_wrapper` attributes are read here.
      var_list: list of variables to differentiate with respect to, or
        None to use `tf.trainable_variables()`.

    Returns:
      Tuple `(loss, grads_and_vars)` where `grads_and_vars` is a list of
      `(gradient, variable)` pairs aligned with `var_list`.
    """
    p_log_prob = [0.0] * inference.n_samples
    for s in range(inference.n_samples):
        scope = 'inference_' + str(id(inference)) + '/' + str(s)
        z_sample = {}
        for z, qz in six.iteritems(inference.latent_vars):
            # Copy q(z) to obtain new set of posterior samples.
            qz_copy = copy(qz, scope=scope)
            z_sample[z] = qz_copy.value()

        if inference.model_wrapper is None:
            # Form dictionary in order to replace conditioning on prior or
            # observed variable with conditioning on a specific value.
            dict_swap = z_sample
            for x, qx in six.iteritems(inference.data):
                if isinstance(x, RandomVariable):
                    if isinstance(qx, RandomVariable):
                        qx_copy = copy(qx, scope=scope)
                        dict_swap[x] = qx_copy.value()
                    else:
                        dict_swap[x] = qx

            for z in six.iterkeys(inference.latent_vars):
                z_copy = copy(z, dict_swap, scope=scope)
                z_log_prob = tf.reduce_sum(z_copy.log_prob(dict_swap[z]))
                if z in inference.scale:
                    z_log_prob *= inference.scale[z]

                p_log_prob[s] += z_log_prob

            for x in six.iterkeys(inference.data):
                if isinstance(x, RandomVariable):
                    x_copy = copy(x, dict_swap, scope=scope)
                    x_log_prob = tf.reduce_sum(
                        x_copy.log_prob(dict_swap[x]))
                    if x in inference.scale:
                        x_log_prob *= inference.scale[x]

                    p_log_prob[s] += x_log_prob
        else:
            x = inference.data
            p_log_prob[s] = inference.model_wrapper.log_prob(x, z_sample)

    p_log_prob = tf.pack(p_log_prob)

    # Weight each entropy by the variable's entry in `inference.scale`
    # (1.0 when absent), the same weights applied to the log-density terms
    # above. The weight must come from `scale`, not from `data`: `data`
    # holds observed values, not scaling factors.
    q_entropy = tf.reduce_sum([
        inference.scale.get(z, 1.0) * qz.entropy()
        for z, qz in six.iteritems(inference.latent_vars)])

    loss = -(tf.reduce_mean(p_log_prob) + q_entropy)

    if var_list is None:
        var_list = tf.trainable_variables()

    grads = tf.gradients(loss, [v.ref() for v in var_list])
    grads_and_vars = list(zip(grads, var_list))
    return loss, grads_and_vars
def build_loss_and_gradients(self, var_list):
    r"""Build the variational loss and the ratio-estimator loss.

    The variational loss is

    $-\Big(\mathbb{E}_{q(\beta)} [\log p(\beta) - \log q(\beta) ] +
      \sum_{n=1}^N \mathbb{E}_{q(\beta)q(z_n\mid\beta)} [
          r^*(x_n, z_n, \beta) ] \Big),$

    minimized with respect to parameterized variational families
    $q(z, \beta; \lambda)$.

    $r^*(x_n, z_n, \beta)$ is a function of a single data point $x_n$,
    a single local variable $z_n$, and all global variables $\beta$. It
    equals the log-ratio

    $\log p(x_n, z_n\mid \beta) - \log q(x_n, z_n\mid \beta),$

    where $q(x_n)$ is the empirical data distribution. It is not computed
    explicitly: $r^*(x, z, \beta)$ is the solution to a ratio estimation
    problem that minimizes the specified `ratio_loss`.

    Gradients are taken using the reparameterization trick
    [@kingma2014auto].

    #### Notes

    This also covers model parameters $p(x, z, \beta; \theta)$ and
    variational distributions with inference networks $q(z\mid x)$.

    Possible extensions of this implementation:

    + further factorizations to better leverage the graph structure of
    more complicated models;
    + score function gradients for global variables;
    + more samples; this would require the `copy()` utility function for
    q's as well, and an additional loop, which is avoided here because it
    complicates the code;
    + analytic KL / swapping out the penalty term for the globals.

    Args:
      var_list: variables for the variational loss, or None to use every
        trainable variable that is not under the "Disc" scope.

    Returns:
      Tuple `(loss, grads_and_vars, loss_d, grads_and_vars_d)`; the `_d`
      entries belong to the discriminator (variables under scope "Disc").
    """
    graph_scope = tf.get_default_graph().unique_name("inference")

    # Global variables: draw beta' ~ q(beta) and accumulate log p(beta')
    # and log q(beta').
    beta_draws = {}
    log_p_beta = 0.0
    log_q_beta = 0.0
    for beta, qbeta in six.iteritems(self.global_vars):
        beta_draws[beta] = qbeta.value()
        log_p_beta += tf.reduce_sum(beta.log_prob(beta_draws[beta]))
        log_q_beta += tf.reduce_sum(qbeta.log_prob(beta_draws[beta]))

    # Local variables: z' ~ p(z | beta') from a copy of the prior, and
    # z' ~ q(z | beta') from the variational factor.
    prior_z_draws = {}
    posterior_z_draws = {}
    for z, qz in six.iteritems(self.latent_vars):
        if z in self.global_vars:
            continue
        prior_copy = copy(z, dict_swap=beta_draws, scope=graph_scope)
        prior_z_draws[z] = prior_copy.value()
        posterior_z_draws[z] = qz.value()

    # Values substituted when copying the observed nodes.
    swap = dict(beta_draws)
    swap.update(posterior_z_draws)

    # Collect x' ~ p(x | z', beta') and x' ~ q(x).
    model_x = {}
    data_x = {}
    for x, x_data in six.iteritems(self.data):
        if isinstance(x, tf.Tensor):
            # Placeholder tensors are skipped entirely.
            if "Placeholder" in x.op.type:
                continue
            model_x[x] = copy(x, dict_swap=swap, scope=graph_scope)
            data_x[x] = x_data
        elif isinstance(x, RandomVariable):
            rv_copy = copy(x, dict_swap=swap, scope=graph_scope)
            model_x[x] = rv_copy.value()
            data_x[x] = x_data

    # Both discriminator calls share variables: the second reuses "Disc".
    with tf.variable_scope("Disc"):
        r_psample = self.discriminator(model_x, prior_z_draws, beta_draws)

    with tf.variable_scope("Disc", reuse=True):
        r_qsample = self.discriminator(data_x, posterior_z_draws, beta_draws)

    # Form ratio loss and ratio estimator.
    if len(self.scale) > 1:
        # Several scale entries: the discriminator outputs are indexed by
        # the same keys as `self.scale`.
        per_key_losses = []
        for key in six.iterkeys(self.scale):
            per_key_losses.append(tf.reduce_mean(
                self.ratio_loss(r_psample[key], r_qsample[key])))
        loss_d = tf.reduce_sum(per_key_losses)

        per_key_ratios = []
        for key in six.iterkeys(self.scale):
            per_key_ratios.append(
                tf.reduce_sum(self.scale[key] * r_qsample[key]))
        scaled_ratio = tf.reduce_sum(per_key_ratios)
    else:
        # Zero or one scale entry: a single factor (default 1.0) applies.
        loss_d = tf.reduce_mean(self.ratio_loss(r_psample, r_qsample))
        scale_values = list(six.itervalues(self.scale))
        factor = scale_values[0] if scale_values else 1.0
        scaled_ratio = tf.reduce_sum(factor * r_qsample)

    # Split regularizers into discriminator ("Disc") terms and the rest.
    reg_terms_d = tf.losses.get_regularization_losses(scope="Disc")
    reg_terms_all = tf.losses.get_regularization_losses()
    reg_terms = [term for term in reg_terms_all if term not in reg_terms_d]

    # Form variational objective.
    loss = -(log_p_beta - log_q_beta + scaled_ratio -
             tf.reduce_sum(reg_terms))
    loss_d = loss_d + tf.reduce_sum(reg_terms_d)

    var_list_d = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
    if var_list is None:
        var_list = [v for v in tf.trainable_variables()
                    if v not in var_list_d]

    grads = tf.gradients(loss, var_list)
    grads_d = tf.gradients(loss_d, var_list_d)
    return (loss, list(zip(grads, var_list)),
            loss_d, list(zip(grads_d, var_list_d)))
def _test_copy(RV, value, *args, **kwargs):
    """Assert that `copy` preserves the static shape of `_value`.

    Builds `RV(*args, value=value, **kwargs)`, copies it with `copy`, and
    compares the `_value` shapes of the original and the copy.
    """
    original = RV(*args, value=value, **kwargs)
    duplicate = copy(original)
    original_shape, duplicate_shape = [
        rv._value.get_shape().as_list() for rv in (original, duplicate)]
    assert original_shape == duplicate_shape
def build_loss_and_gradients(self, var_list):
    r"""Build the KL(p || q) loss and importance-sampling gradients.

    The loss is

    .. math::

      KL( p(z |x) || q(z) )
      = E_{p(z | x)} [ \log p(z | x) - \log q(z; \lambda) ]

    with stochastic gradients based on importance sampling. It is
    estimated as

    .. math::

      1/B \sum_{b=1}^B [ w_{norm}(z^b; \lambda) *
                         (\log p(x, z^b) - \log q(z^b; \lambda)) ],

    where

    .. math::

      z^b \sim q(z^b; \lambda),

      w_{norm}(z^b; \lambda) = w(z^b; \lambda) /
                               \sum_{b=1}^B w(z^b; \lambda),

      w(z^b; \lambda) = p(x, z^b) / q(z^b; \lambda).

    This provides the gradient

    .. math::

      - 1/B \sum_{b=1}^B [ w_{norm}(z^b; \lambda) *
                           \partial_{\lambda} \log q(z^b; \lambda) ].

    Args:
      var_list: list of variables to differentiate with respect to.

    Returns:
      Tuple `(loss, grads_and_vars)` where `grads_and_vars` is a list of
      `(gradient, variable)` pairs aligned with `var_list`.
    """
    log_p_terms = [0.0] * self.n_samples
    log_q_terms = [0.0] * self.n_samples
    for s in range(self.n_samples):
        sample_scope = 'inference_' + str(s)

        # Draw a fresh posterior sample from a copy of every q(z).
        z_sample = {}
        for z, qz in six.iteritems(self.latent_vars):
            qz_copy = copy(qz, scope=sample_scope)
            z_sample[z] = qz_copy.value()
            # The sample is treated as a constant when differentiating.
            fixed_draw = tf.stop_gradient(z_sample[z])
            log_q_terms[s] += tf.reduce_sum(qz.log_prob(fixed_draw))

        if self.model_wrapper is not None:
            # A model wrapper computes the joint log-density directly.
            log_p_terms[s] = self.model_wrapper.log_prob(self.data, z_sample)
            continue

        # Otherwise swap each prior / observed variable for a concrete
        # value and sum the log-densities of the copied nodes.
        dict_swap = z_sample
        for x, qx in six.iteritems(self.data):
            if not isinstance(x, RandomVariable):
                continue
            if isinstance(qx, RandomVariable):
                dict_swap[x] = copy(qx, scope=sample_scope).value()
            else:
                dict_swap[x] = qx

        for z in six.iterkeys(self.latent_vars):
            z_copy = copy(z, dict_swap, scope=sample_scope)
            log_p_terms[s] += tf.reduce_sum(z_copy.log_prob(dict_swap[z]))

        for x in six.iterkeys(self.data):
            if not isinstance(x, RandomVariable):
                continue
            x_copy = copy(x, dict_swap, scope=sample_scope)
            log_p_terms[s] += tf.reduce_sum(x_copy.log_prob(dict_swap[x]))

    p_log_prob = tf.pack(log_p_terms)
    q_log_prob = tf.pack(log_q_terms)

    # Self-normalized importance weights, computed in log space.
    log_w = p_log_prob - q_log_prob
    w_norm = tf.exp(log_w - log_sum_exp(log_w))

    loss = tf.reduce_mean(w_norm * log_w)
    # Surrogate objective: the weights are held fixed.
    surrogate = -tf.reduce_mean(q_log_prob * tf.stop_gradient(w_norm))
    grads = tf.gradients(surrogate, [v.ref() for v in var_list])
    return loss, list(zip(grads, var_list))
def build_score_entropy_loss_and_gradients(inference, var_list):
    r"""Build the loss and score-function gradients with analytic entropy.

    Uses the score function estimator [@paisley2012variational] and
    assumes the entropy of each variational factor is analytic.

    Computed by sampling from $q(z;\lambda)$ and evaluating the
    expectation using Monte Carlo sampling.

    Args:
      inference: object whose `n_samples`, `data`, `latent_vars`, `scale`,
        `logging` and `_summary_key` attributes are read here.
      var_list: iterable of variables to differentiate with respect to. It
        is iterated more than once, so it must be re-iterable.

    Returns:
      Tuple `(loss, grads_and_vars)`. `grads_and_vars` lists
      `(gradient, variable)` pairs: first the variables on which some
      variational factor depends (per `get_descendants`), then the rest.
    """
    def _summarize(tag, tensor):
        # Register a scalar summary under the inference's summary key.
        tf.summary.scalar(tag, tensor,
                          collections=[inference._summary_key])

    p_log_prob = [0.0] * inference.n_samples
    q_log_prob = [0.0] * inference.n_samples
    base_scope = tf.get_default_graph().unique_name("inference") + '/'
    for s in range(inference.n_samples):
        scope = base_scope + tf.get_default_graph().unique_name("sample")

        # Map every prior / observed variable to the concrete value it is
        # conditioned on for this sample.
        dict_swap = {}
        for x, qx in six.iteritems(inference.data):
            if not isinstance(x, RandomVariable):
                continue
            if isinstance(qx, RandomVariable):
                dict_swap[x] = copy(qx, scope=scope).value()
            else:
                dict_swap[x] = qx

        # Fresh posterior draw from a copy of each q(z); the draw is held
        # constant inside log q.
        for z, qz in six.iteritems(inference.latent_vars):
            qz_copy = copy(qz, scope=scope)
            dict_swap[z] = qz_copy.value()
            weight = inference.scale.get(z, 1.0)
            q_log_prob[s] += tf.reduce_sum(
                weight * qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))

        # Scaled log-densities of the copied model nodes: latent first,
        # then observed random variables.
        for z in six.iterkeys(inference.latent_vars):
            z_copy = copy(z, dict_swap, scope=scope)
            weight = inference.scale.get(z, 1.0)
            p_log_prob[s] += tf.reduce_sum(
                weight * z_copy.log_prob(dict_swap[z]))

        for x in six.iterkeys(inference.data):
            if not isinstance(x, RandomVariable):
                continue
            x_copy = copy(x, dict_swap, scope=scope)
            weight = inference.scale.get(x, 1.0)
            p_log_prob[s] += tf.reduce_sum(
                weight * x_copy.log_prob(dict_swap[x]))

    p_log_prob = tf.stack(p_log_prob)
    q_log_prob = tf.stack(q_log_prob)

    q_entropy = tf.reduce_sum([
        tf.reduce_sum(qz.entropy())
        for qz in six.itervalues(inference.latent_vars)])
    reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())

    if inference.logging:
        _summarize("loss/p_log_prob", tf.reduce_mean(p_log_prob))
        _summarize("loss/q_log_prob", tf.reduce_mean(q_log_prob))
        _summarize("loss/q_entropy", q_entropy)
        _summarize("loss/reg_penalty", reg_penalty)

    loss = -(tf.reduce_mean(p_log_prob) + q_entropy - reg_penalty)

    # Variables with at least one variational factor among their
    # descendants get the score-function surrogate; all others get the
    # plain gradient of the loss.
    q_rvs = list(six.itervalues(inference.latent_vars))
    q_vars = []
    for v in var_list:
        if len(get_descendants(tf.convert_to_tensor(v), q_rvs)) != 0:
            q_vars.append(v)

    surrogate = -(
        tf.reduce_mean(q_log_prob * tf.stop_gradient(p_log_prob)) +
        q_entropy - reg_penalty)
    q_grads = tf.gradients(surrogate, q_vars)

    p_vars = [v for v in var_list if v not in q_vars]
    p_grads = tf.gradients(loss, p_vars)

    return loss, list(zip(q_grads, q_vars)) + list(zip(p_grads, p_vars))