def _create_components(self, deterministic=False):
    # load network input
    X = self.inputs[0]
    x = X.flatten(2)

    # load networks
    l_p_mu, l_q_mu, l_q_sample, _, _, _ = self.network
    l_q_in, l_p_in, l_cv_in = self.input_layers

    # load network output
    z, q_mu = lasagne.layers.get_output(
        [l_q_sample, l_q_mu], deterministic=deterministic)
    p_mu = lasagne.layers.get_output(
        l_p_mu, {l_p_in: z},
        deterministic=deterministic,
    )

    # entropy term
    log_qz_given_x = log_bernoulli(dg(z), q_mu).sum(axis=1)

    # expected p(x,z) term
    z_prior = T.ones_like(z) * np.float32(0.5)
    log_pz = log_bernoulli(z, z_prior).sum(axis=1)
    log_px_given_z = log_bernoulli(x, p_mu).sum(axis=1)
    log_pxz = log_pz + log_px_given_z

    # save them for later
    self.log_pxz = log_pxz
    self.log_qz_given_x = log_qz_given_x

    return log_pxz.flatten(), log_qz_given_x.flatten()

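# The Theano/Lasagne methods in this file assume the following module-level
# imports, plus log-density helpers (log_bernoulli, log_normal, log_normal2)
# defined elsewhere in the repo. The sketch below shows one plausible set of
# definitions consistent with how they are called here (elementwise
# log-densities that the caller sums over axis=1); the actual helpers in the
# codebase may clip or parameterize slightly differently.
import numpy as np
import theano
import theano.tensor as T
import lasagne
from theano.gradient import disconnected_grad as dg


def log_bernoulli(x, p, eps=1e-6):
    """Elementwise log p(x) under Bernoulli(p)."""
    p = T.clip(p, eps, 1.0 - eps)
    return x * T.log(p) + (1.0 - x) * T.log(1.0 - p)


def log_normal(x, mean, sigma, eps=1e-6):
    """Elementwise log N(x; mean, sigma^2), parameterized by the std dev."""
    sigma = T.maximum(sigma, eps)
    return -0.5 * T.log(2 * np.pi) - T.log(sigma) - (x - mean) ** 2 / (2 * sigma ** 2)


def log_normal2(x, mean, logvar):
    """Elementwise log N(x; mean, exp(logvar)), parameterized by the log-variance."""
    return -0.5 * (T.log(2 * np.pi) + logvar + (x - mean) ** 2 / T.exp(logvar))
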
def create_llik(self):
    # load inputs
    X = self.inputs[0]
    x = X.flatten(2)

    # load network params
    n_cat = self.n_lat
    n_rep = 10

    # load networks
    l_p_mu, l_q_mu, l_q_sample, _, _, _ = self.network
    l_q_in, l_p_in, l_cv_in = self.input_layers

    # load network output
    q_mu = lasagne.layers.get_output(l_q_mu)
    q_mu_rep = T.tile(
        q_mu.dimshuffle((0, 'x', 1)), reps=(1, n_rep, 1)
    )  # (n_bat, n_rep, n_cat)
    q_sample_hard = self.theano_rng.binomial(
        size=q_mu_rep.shape, p=q_mu_rep, dtype=q_mu_rep.dtype
    )  # (n_bat, n_rep, n_cat)
    # note: the reshapes below assume a fixed minibatch size of 100 and
    # 784-dimensional (28x28) observations
    q_sample_hard2 = q_sample_hard.reshape([100 * n_rep, n_cat])        # (n_bat*n_rep, n_cat)
    p_mu = lasagne.layers.get_output(l_p_mu, {l_p_in: q_sample_hard2})  # (n_bat*n_rep, 784)
    x_rep = T.tile(
        x.dimshuffle((0, 'x', 1)), reps=(1, n_rep, 1)
    )  # (n_bat, n_rep, 784)
    p_mu = T.reshape(p_mu, (100, n_rep, 784))  # (n_bat, n_rep, 784)

    # define the loss components
    log_p_x = log_bernoulli(x_rep, p_mu).sum(axis=2)              # (n_bat, n_rep)
    z_prior = T.ones_like(q_sample_hard) * np.float32(0.5)        # (n_bat, n_rep, n_cat)
    log_p_z = log_bernoulli(q_sample_hard, z_prior).sum(axis=2)   # (n_bat, n_rep)
    log_q_z = log_bernoulli(q_sample_hard, q_mu_rep).sum(axis=2)  # (n_bat, n_rep)

    # compute loss
    llik = Tlogsumexp(log_p_x + log_p_z - log_q_z, axis=1)  # (n_bat,)

    return T.mean(llik)

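# `Tlogsumexp` is not defined in this file; the call sites above and below
# expect a numerically stable log-sum-exp reduction over one axis. A minimal
# sketch consistent with that usage (the repo's own helper may differ) is
# given here. Note that an unbiased importance-sampled estimate of log p(x)
# would also subtract T.log(n_rep); the objective above omits this constant,
# which does not affect gradients.
def Tlogsumexp(x, axis=None):
    """Numerically stable log(sum(exp(x))) along `axis`."""
    x_max = T.max(x, axis=axis, keepdims=True)
    return T.log(T.sum(T.exp(x - x_max), axis=axis)) + T.max(x, axis=axis)
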
def create_llik(self):
    # load network input
    X = self.inputs[0]
    x = X.flatten(2)

    # duplicate entries to take into account multiple mc samples
    n_sam = self.n_sample
    n_out = x.shape[1]
    x = x.dimshuffle(0, 'x', 1).repeat(n_sam, axis=1).reshape((-1, n_out))

    # load networks
    l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \
        l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, _, _, _ = self.network
    l_qa_in, l_qz_in, l_px_in, l_cv_in = self.input_layers

    # load network output
    qa_mu, qa_logsigma, a = lasagne.layers.get_output(
        [l_qa_mu, l_qa_logsigma, l_qa],
    )
    qz_mu, z = lasagne.layers.get_output(
        [l_qz_mu, l_qz],
        {l_qz_in: a, l_qa_in: X},
    )
    pa_mu, pa_logsigma = lasagne.layers.get_output(
        [l_pa_mu, l_pa_logsigma],
        {l_px_in: z},
    )
    px_mu = lasagne.layers.get_output(l_px_mu, {l_px_in: z})

    # entropy term
    log_qa_given_x = log_normal2(a, qa_mu, qa_logsigma).sum(axis=1)
    log_qz_given_x = log_bernoulli(z, qz_mu).sum(axis=1)
    log_qz_given_x_dgz = log_bernoulli(dg(z), qz_mu).sum(axis=1)
    log_qza_given_x = log_qz_given_x + log_qa_given_x

    # log-probability term
    # z_prior = T.ones_like(z)*np.float32(0.5)
    # log_pz = log_bernoulli(z, z_prior).sum(axis=1)
    # note: the reshapes below assume a fixed minibatch size of 128
    log_e = -self.rbm.free_energy(z.reshape((128 * n_sam, self.n_lat)))
    log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1)
    log_pa_given_z = log_normal2(a, pa_mu, pa_logsigma).sum(axis=1)

    t = log_pa_given_z + log_px_given_z + log_e - log_qz_given_x - log_qa_given_x
    t = t.reshape([128, n_sam])

    # compute loss
    llik = Tlogsumexp(t, axis=1)  # (n_bat,)

    return T.mean(llik)

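# `self.rbm.free_energy` lives on the RBM object elsewhere in the repo. For a
# binary-binary RBM it presumably implements the standard free energy
# F(v) = -v . b_v - sum_j softplus(v . W_j + b_h_j). A minimal sketch is below,
# assuming (hypothetical) attribute names self.W, self.vbias and self.hbias
# for the weight matrix and biases.
def free_energy(self, v):
    """Free energy of visible configurations v, one value per row of v."""
    wx_b = T.dot(v, self.W) + self.hbias        # (n_bat, n_hidden)
    vbias_term = T.dot(v, self.vbias)           # (n_bat,)
    hidden_term = T.sum(T.nnet.softplus(wx_b), axis=1)
    return -vbias_term - hidden_term
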
def create_objectives_elbo(self, deterministic=False):
    """ELBO objective without the analytic expectation trick"""
    # load network input
    X = self.inputs[0]
    x = X.flatten(2)

    # load network output
    if self.model == 'bernoulli':
        q_mu, q_logsigma, p_mu, z \
            = lasagne.layers.get_output(self.network[2:], deterministic=deterministic)
    elif self.model == 'gaussian':
        raise NotImplementedError()

    # entropy term
    log_qz_given_x = log_normal2(z, q_mu, q_logsigma).sum(axis=1)

    # expected p(x,z) term
    z_prior_sigma = T.cast(T.ones_like(q_logsigma), dtype=theano.config.floatX)
    z_prior_mu = T.cast(T.zeros_like(q_mu), dtype=theano.config.floatX)
    log_pz = log_normal(z, z_prior_mu, z_prior_sigma).sum(axis=1)
    log_px_given_z = log_bernoulli(x, p_mu).sum(axis=1)
    log_pxz = log_pz + log_px_given_z

    elbo = (log_pxz - log_qz_given_x).mean()

    # we don't use the separate accuracy metric right now
    return -elbo, -log_qz_given_x.mean()

def create_objectives(self, deterministic=False):
    # load network input
    X = self.inputs[0]
    x = X.flatten(2)

    # duplicate entries to take into account multiple mc samples
    n_sam = self.n_sample
    n_out = x.shape[1]
    x = x.dimshuffle(0, 'x', 1).repeat(n_sam, axis=1).reshape((-1, n_out))

    # load network
    l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \
        l_qz_mu, l_qz_logsigma, l_qa_mu, l_qa_logsigma, \
        l_qa, l_qz = self.network

    # load network output
    pa_mu, pa_logsigma, qz_mu, qz_logsigma, qa_mu, qa_logsigma, a, z \
        = lasagne.layers.get_output(
            [l_pa_mu, l_pa_logsigma, l_qz_mu, l_qz_logsigma,
             l_qa_mu, l_qa_logsigma, l_qa, l_qz],
            deterministic=deterministic)
    if self.model == 'bernoulli':
        px_mu = lasagne.layers.get_output(l_px_mu, deterministic=deterministic)
    elif self.model == 'gaussian':
        px_mu, px_logsigma = lasagne.layers.get_output(
            [l_px_mu, l_px_logsigma], deterministic=deterministic)

    # entropy term
    log_qa_given_x = log_normal2(a, qa_mu, qa_logsigma).sum(axis=1)
    log_qz_given_ax = log_normal2(z, qz_mu, qz_logsigma).sum(axis=1)
    log_qza_given_x = log_qz_given_ax + log_qa_given_x

    # log-probability term
    z_prior_sigma = T.cast(T.ones_like(qz_logsigma), dtype=theano.config.floatX)
    z_prior_mu = T.cast(T.zeros_like(qz_mu), dtype=theano.config.floatX)
    log_pz = log_normal(z, z_prior_mu, z_prior_sigma).sum(axis=1)
    log_pa_given_z = log_normal2(a, pa_mu, pa_logsigma).sum(axis=1)
    if self.model == 'bernoulli':
        log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1)
    elif self.model == 'gaussian':
        log_px_given_z = log_normal2(x, px_mu, px_logsigma).sum(axis=1)
    log_paxz = log_pa_given_z + log_px_given_z + log_pz

    # # experiment: uniform prior p(a)
    # a_prior_sigma = T.cast(T.ones_like(qa_logsigma), dtype=theano.config.floatX)
    # a_prior_mu = T.cast(T.zeros_like(qa_mu), dtype=theano.config.floatX)
    # log_pa = log_normal(a, a_prior_mu, a_prior_sigma).sum(axis=1)
    # log_paxz = log_pa + log_px_given_z + log_pz

    # compute the evidence lower bound
    elbo = T.mean(log_paxz - log_qza_given_x)

    # we don't use a separate accuracy metric right now
    return -elbo, T.max(qz_logsigma)

def create_gradients(self, loss, deterministic=False):
    from theano.gradient import disconnected_grad as dg

    # load network input
    X = self.inputs[0]
    x = X.flatten(2)

    # load network output
    if self.model == 'bernoulli':
        q_mu, q_logsigma, p_mu, z \
            = lasagne.layers.get_output(self.network[2:], deterministic=deterministic)
    elif self.model == 'gaussian':
        raise NotImplementedError()

    # load params
    p_params, q_params = self._get_net_params()

    # entropy term
    log_qz_given_x = log_normal2(z, q_mu, q_logsigma).sum(axis=1)

    # expected p(x,z) term
    z_prior_sigma = T.cast(T.ones_like(q_logsigma), dtype=theano.config.floatX)
    z_prior_mu = T.cast(T.zeros_like(q_mu), dtype=theano.config.floatX)
    log_pz = log_normal(z, z_prior_mu, z_prior_sigma).sum(axis=1)
    log_px_given_z = log_bernoulli(x, p_mu).sum(axis=1)
    log_pxz = log_pz + log_px_given_z

    # compute learning signals
    l = log_pxz - log_qz_given_x
    # l_avg, l_std = l.mean(), T.maximum(1, l.std())
    # c_new = 0.8*c + 0.2*l_avg
    # v_new = 0.8*v + 0.2*l_std
    # l = (l - c_new) / v_new

    # compute grad wrt p
    p_grads = T.grad(-log_pxz.mean(), p_params)

    # compute grad wrt q
    # q_target = T.mean(dg(l) * log_qz_given_x)
    # q_grads = T.grad(-0.2*q_target, q_params)  # 5x slower rate for q
    log_qz_given_x = log_normal2(dg(z), q_mu, q_logsigma).sum(axis=1)
    q_target = T.mean(dg(l) * log_qz_given_x)
    q_grads = T.grad(-0.2 * q_target, q_params)  # 5x slower rate for q
    # q_grads = T.grad(-l.mean(), q_params)  # 5x slower rate for q

    # # compute grad of cv net
    # cv_target = T.mean(l**2)
    # cv_grads = T.grad(cv_target, cv_params)

    # combine and clip gradients
    clip_grad = 1
    max_norm = 5
    grads = p_grads + q_grads
    mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm)
    cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]

    return cgrads

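# For context: a sketch (not taken from this repo) of how the clipped
# gradients returned by create_gradients might be consumed. `model`, `loss`,
# `p_params` and `q_params` are illustrative placeholders, and the learning
# rate is arbitrary. lasagne.updates.adam accepts a list of gradient
# expressions in place of a scalar loss, so the gradients pair directly with
# the parameters in the same order.
grads = model.create_gradients(loss)
params = p_params + q_params  # must match the order of `grads`
updates = lasagne.updates.adam(grads, params, learning_rate=1e-3)
train_fn = theano.function(model.inputs, loss, updates=updates)
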
def create_dadgm_objectives(self, deterministic=False):
    X = self.inputs[0]
    x = X.reshape((-1, self.n_out))
    px_mu = self.inputs[-1]

    # load network params
    n_class = self.n_class
    n_cat = self.n_cat

    pa_net_mu, pa_net_logsigma, qz_net_mu, qa_net_mu, \
        qa_net_logsigma, qz_net_sample, qa_net_sample = self.network
    qa_net_in, qz_net_in, px_net_in = self.input_layers

    qa_mu, qa_logsigma, qa_sample = get_output(
        [qa_net_mu, qa_net_logsigma, qa_net_sample],
        deterministic=deterministic,
    )
    qz_mu, qz_sample = get_output(
        [qz_net_mu, qz_net_sample],
        {qz_net_in: qa_sample, qa_net_in: x},
        deterministic=deterministic,
    )
    pa_mu, pa_logsigma = get_output(
        [pa_net_mu, pa_net_logsigma],
        {px_net_in: qz_sample},
        deterministic=deterministic,
    )

    # load this from the RBM
    px_mu = self.inputs[-1]

    qz_given_ax = T.nnet.softmax(qz_mu)
    log_qz_given_ax = T.log(qz_given_ax + 1e-20)
    entropy = T.reshape(
        qz_given_ax * (log_qz_given_ax - T.log(1.0 / n_class)),
        (-1, n_cat, n_class),
    )
    entropy = T.sum(entropy, axis=[1, 2])

    log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1)
    log_pa_given_z = log_normal2(qa_sample, pa_mu, pa_logsigma).sum(axis=1)
    log_paxz = log_pa_given_z + log_px_given_z

    # log p(z) + log p(a|z) - log q(a) - log q(z|a)
    elbo = T.mean(log_paxz - entropy)

    return -elbo, -T.mean(entropy)

def decode(self, x, z):
    # k = z.size()[0]
    # B = z.size()[1]
    # z = z.view(-1, self.z_size)
    out = z
    for i in range(len(self.decoder_weights) - 1):
        out = self.act_func(self.decoder_weights[i](out))
        # out = self.act_func(self.layer_norms[i].forward(self.decoder_weights[i](out)))
    out = self.decoder_weights[-1](out)
    logpx = log_bernoulli(pred_no_sig=out, target=x)
    # x = out.view(k, B, self.x_size)
    return torch.sigmoid(out), logpx

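# In the PyTorch decoder above, log_bernoulli takes pre-sigmoid logits
# (`pred_no_sig`). A helper consistent with that call, assumed rather than
# taken from the repo, is the negative binary cross-entropy with logits,
# summed over the feature dimension.
import torch
import torch.nn.functional as F


def log_bernoulli(pred_no_sig, target):
    """log p(target | sigmoid(pred_no_sig)), summed over features -> (batch,)."""
    return -F.binary_cross_entropy_with_logits(
        pred_no_sig, target, reduction='none'
    ).sum(dim=-1)
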
def _create_components(self, D):
    # collect samples
    (l_qx, l_qx_samp, l_pa_mu, l_pa_logsigma) = self.network
    a = self.A
    qx, x = lasagne.layers.get_output([l_qx, l_qx_samp], a)
    pa_mu, pa_logsigma = lasagne.layers.get_output(
        [l_pa_mu, l_pa_logsigma], x)

    # compute logQ
    logQa = T.sum(log_normal(a, 0., 1.), axis=1)
    logQx_given_a = T.sum(log_bernoulli(x, qx), axis=1)
    logQ = logQa + logQx_given_a

    # compute energies of the samples, dim=(1, n_tot_samples)
    logFx = self._free_energy(x.T, marginalize=self.marginalize)
    logpa = T.sum(log_normal2(a, pa_mu, pa_logsigma), axis=1)
    # logF = logFx + logpa

    # free energy of the data
    D = D.reshape((-1, self.n_visible)).T
    logF_D = self._free_energy(D)

    self._components = (logFx, logpa, logQ, logF_D)

def _create_components(self, deterministic=False):
    # load network input
    X = self.inputs[0]
    x = X.flatten(2)

    # load networks
    l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \
        l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, _, _, _ = self.network
    l_qa_in, l_qz_in, l_px_in = self.input_layers

    # load network output
    qa_mu, qa_logsigma, a = lasagne.layers.get_output(
        [l_qa_mu, l_qa_logsigma, l_qa], deterministic=deterministic)
    qz_mu, z = lasagne.layers.get_output(
        [l_qz_mu, l_qz],
        # {l_qz_in: T.zeros_like(qa_mu), l_qa_in: X},
        # {l_qz_in: qa_mu, l_qa_in: X},
        {l_qz_in: a, l_qa_in: X},
        deterministic=deterministic)
    pa_mu, pa_logsigma = lasagne.layers.get_output(
        [l_pa_mu, l_pa_logsigma], z, deterministic=deterministic)
    if self.model == 'bernoulli':
        px_mu = lasagne.layers.get_output(l_px_mu, z, deterministic=deterministic)
    elif self.model == 'gaussian':
        px_mu, px_logsigma = lasagne.layers.get_output(
            [l_px_mu, l_px_logsigma], z, deterministic=deterministic)

    # entropy term
    log_qa_given_x = log_normal2(a, qa_mu, qa_logsigma).sum(axis=1)
    log_qz_given_x = log_bernoulli(z, qz_mu).sum(axis=1)
    log_qz_given_x_dgz = log_bernoulli(dg(z), qz_mu).sum(axis=1)
    # log_qz_given_x = log_normal2(z, qz_mu, qz_logsigma).sum(axis=1)
    # log_qz_given_x_dgz = log_normal2(dg(z), qz_mu, qz_logsigma).sum(axis=1)
    log_qza_given_x = log_qz_given_x + log_qa_given_x

    # log-probability term
    z_prior = T.ones_like(z) * np.float32(0.5)
    log_pz = log_bernoulli(z, z_prior).sum(axis=1)
    # z_prior_sigma = T.cast(T.ones_like(qz_logsigma), dtype=theano.config.floatX)
    # z_prior_mu = T.cast(T.zeros_like(qz_mu), dtype=theano.config.floatX)
    # log_pz = log_normal(z, z_prior_mu, z_prior_sigma).sum(axis=1)
    log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1)
    log_pa_given_z = log_normal2(a, pa_mu, pa_logsigma).sum(axis=1)
    log_pxz = log_pa_given_z + log_px_given_z + log_pz

    # save them for later
    if not deterministic:
        self.log_pxz = log_pxz
        self.log_px_given_z = log_px_given_z
        self.log_pz = log_pz
        self.log_qza_given_x = log_qza_given_x
        self.log_qa_given_x = log_qa_given_x
        self.log_qz_given_x = log_qz_given_x
        self.log_qz_given_x_dgz = log_qz_given_x_dgz

    # return log_paxz, log_qza_given_x
    return log_pxz, log_qza_given_x

def create_objectives(self, deterministic=False):
    # load network input
    X = self.inputs[0]
    Y = self.inputs[1]
    x = X.flatten(2)

    # duplicate entries to take into account multiple mc samples
    n_sam = self.n_sample
    n_out = x.shape[1]
    x = x.dimshuffle(0, 'x', 1).repeat(n_sam, axis=1).reshape((-1, n_out))

    # load network
    l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \
        l_qz_mu, l_qz_logsigma, l_qa_mu, l_qa_logsigma, \
        l_qa, l_qz, l_d = self.network

    # load network output
    pa_mu, pa_logsigma, qz_mu, qz_logsigma, qa_mu, qa_logsigma, a, z \
        = lasagne.layers.get_output(
            [l_pa_mu, l_pa_logsigma, l_qz_mu, l_qz_logsigma,
             l_qa_mu, l_qa_logsigma, l_qa, l_qz],
            deterministic=deterministic)
    if self.model == 'bernoulli':
        px_mu = lasagne.layers.get_output(l_px_mu, deterministic=deterministic)
    elif self.model == 'gaussian':
        px_mu, px_logsigma = lasagne.layers.get_output(
            [l_px_mu, l_px_logsigma], deterministic=deterministic)

    # entropy term
    log_qa_given_x = log_normal2(a, qa_mu, qa_logsigma).sum(axis=1)
    log_qz_given_ax = log_normal2(z, qz_mu, qz_logsigma).sum(axis=1)
    log_qza_given_x = log_qz_given_ax + log_qa_given_x

    # log-probability term
    z_prior_sigma = T.cast(T.ones_like(qz_logsigma), dtype=theano.config.floatX)
    z_prior_mu = T.cast(T.zeros_like(qz_mu), dtype=theano.config.floatX)
    log_pz = log_normal(z, z_prior_mu, z_prior_sigma).sum(axis=1)
    log_pa_given_z = log_normal2(a, pa_mu, pa_logsigma).sum(axis=1)
    if self.model == 'bernoulli':
        log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1)
    elif self.model == 'gaussian':
        log_px_given_z = log_normal2(x, px_mu, px_logsigma).sum(axis=1)
    log_paxz = log_pa_given_z + log_px_given_z + log_pz

    # discriminative component
    P = lasagne.layers.get_output(l_d)
    P_test = lasagne.layers.get_output(l_d, deterministic=True)
    disc_loss = lasagne.objectives.categorical_crossentropy(P, Y)

    # measure accuracy
    top = theano.tensor.argmax(P, axis=-1)
    top_test = theano.tensor.argmax(P_test, axis=-1)
    acc = theano.tensor.eq(top, Y).mean()
    acc_test = theano.tensor.eq(top_test, Y).mean()

    # compute the evidence lower bound
    elbo = T.mean(-disc_loss + log_paxz - log_qza_given_x)
    # elbo = T.mean(-disc_loss)

    if deterministic:
        return -elbo, acc_test
    else:
        return -elbo, acc

def _create_components(self, deterministic=False):
    # load network input
    X = self.inputs[0]
    x = X.flatten(2)

    # duplicate entries to take into account multiple mc samples
    n_sam = self.n_sample
    n_out = x.shape[1]
    x = x.dimshuffle(0, 'x', 1).repeat(n_sam, axis=1).reshape((-1, n_out))

    # load networks
    l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \
        l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, _, _, _ = self.network
    l_qa_in, l_qz_in, l_px_in, l_cv_in = self.input_layers

    # load network output
    qa_mu, qa_logsigma, a = lasagne.layers.get_output(
        [l_qa_mu, l_qa_logsigma, l_qa],
        deterministic=deterministic,
    )
    qz_mu, z = lasagne.layers.get_output(
        [l_qz_mu, l_qz],
        {l_qz_in: a, l_qa_in: X},
        deterministic=deterministic,
    )
    pa_mu, pa_logsigma = lasagne.layers.get_output(
        [l_pa_mu, l_pa_logsigma],
        {l_px_in: z},
        deterministic=deterministic,
    )
    if self.model == 'bernoulli':
        px_mu = lasagne.layers.get_output(l_px_mu, {l_px_in: z},
                                          deterministic=deterministic)
    elif self.model == 'gaussian':
        px_mu, px_logsigma = lasagne.layers.get_output(
            [l_px_mu, l_px_logsigma],
            {l_px_in: z},
            deterministic=deterministic,
        )

    # entropy term
    log_qa_given_x = log_normal2(a, qa_mu, qa_logsigma).sum(axis=1)
    log_qz_given_x = log_bernoulli(z, qz_mu).sum(axis=1)
    log_qz_given_x_dgz = log_bernoulli(dg(z), qz_mu).sum(axis=1)
    log_qza_given_x = log_qz_given_x + log_qa_given_x

    # log-probability term
    z_prior = T.ones_like(z) * np.float32(0.5)
    log_pz = log_bernoulli(z, z_prior).sum(axis=1)
    # note: the reshape below assumes a fixed minibatch size of 128
    log_e = -self.rbm.free_energy(z.reshape((128 * n_sam, self.n_lat)))
    log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1)
    log_pa_given_z = log_normal2(a, pa_mu, pa_logsigma).sum(axis=1)
    log_pxz = log_pa_given_z + log_px_given_z + log_e

    # save them for later
    if not deterministic:
        self.log_pxz = log_pxz
        self.log_px_given_z = log_px_given_z
        self.log_pz = log_pz
        self.log_qza_given_x = log_qza_given_x
        self.log_qa_given_x = log_qa_given_x
        self.log_qz_given_x = log_qz_given_x
        self.log_qz_given_x_dgz = log_qz_given_x_dgz
        self.log_e = log_e.mean()
        self.z = z

    # return log_paxz, log_qza_given_x
    return log_pxz, log_qza_given_x
