def sample(self, y, n=None):
    """Sample from the fitted probabilistic CCA model.

    :param y: Observations of shape (n_features, n_samples).
    :param n: The number of samples.
    :return:  Two views of n samples each.
    """
    k = self.latent_dim
    Lambda, Psi_diag = self.tile_params()

    if n and n > y.shape[1]:
        raise AttributeError('More samples than estimated z variables.')
    elif not n:
        n = y.shape[1]

    z = torch.empty(k, n)
    for i in range(n):
        yi = y[:, i]
        z[:, i] = self.E_z_given_y(Lambda, Psi_diag, yi)

    m1 = self.Lambda1 @ z
    m2 = self.Lambda2 @ z

    y1 = torch.empty(self.p1, n)
    y2 = torch.empty(self.p2, n)
    for i in range(n):
        y1[:, i] = MVN(m1[:, i], diag(self.Psi1_diag)).sample()
        y2[:, i] = MVN(m2[:, i], diag(self.Psi2_diag)).sample()

    return y1.t(), y2.t()
def sample(self, y, n_samples):
    """Sample from the fitted probabilistic CCA model.

    :param y:         Observations of shape (n_features, n_samples).
    :param n_samples: The number of samples.
    :return:          Two views of n_samples samples each.
    """
    k = 3 * self.latent_dim if self.private_z else self.latent_dim
    Lambda, Psi_diag = self.tile_params()
    PLL_inv = LA.woodbury_inv(Psi_diag, Lambda, Lambda.t(), k)
    z = self.E_z_given_y(Lambda, PLL_inv, y)

    m = Lambda @ z
    m1 = m[:self.p1]
    m2 = m[self.p1:]

    y1 = torch.empty(self.p1, n_samples)
    y2 = torch.empty(self.p2, n_samples)
    for i in range(n_samples):
        # Randomly select a latent variable (randint is inclusive on both ends).
        r = random.randint(0, z.shape[1] - 1)
        # Sample y using the mean for the chosen latent variable.
        y1[:, i] = MVN(m1[:, r], diag(self.Psi1_diag)).sample()
        y2[:, i] = MVN(m2[:, r], diag(self.Psi2_diag)).sample()

    return y1.t(), y2.t()
def init_VL_sampler(self):
    from torch.distributions.multivariate_normal import MultivariateNormal as MVN
    view_mvn_path = self.cfgs.get('view_mvn_path', 'checkpoints/view_light/view_mvn.pth')
    light_mvn_path = self.cfgs.get('light_mvn_path', 'checkpoints/view_light/light_mvn.pth')
    view_mvn = torch.load(view_mvn_path)
    light_mvn = torch.load(light_mvn_path)
    self.view_mean = view_mvn['mean'].cuda()
    self.light_mean = light_mvn['mean'].cuda()
    self.view_mvn = MVN(view_mvn['mean'].cuda(), view_mvn['cov'].cuda())
    self.light_mvn = MVN(light_mvn['mean'].cuda(), light_mvn['cov'].cuda())
def get_projections(self, data, J, projection='two'):
    """
    Get projections for ACS approximate procedure
    :param data: (Object) Data object to get projections for
    :param J: (int) Number of projections to use
    :param projection: (str) Type of projection to use (currently only 'two' supported)
    :return: (torch.tensor) Projections
    """
    projections = []
    with torch.no_grad():
        theta_mean, theta_cov = self.linear._compute_posterior(
            self.encode(self.x_train), self.y_train)
        # Add jitter to the covariance diagonal for numerical stability.
        jitter = utils.to_gpu(torch.eye(len(theta_cov)) * 1e-4)
        theta_samples = MVN(theta_mean.flatten(),
                            theta_cov + jitter).sample(torch.Size([J]))

        dataloader = DataLoader(Dataset(data, 'unlabeled'),
                                batch_size=len(data.index['unlabeled']),
                                shuffle=False)
        for (x, _) in dataloader:
            x = utils.to_gpu(x)
            if projection == 'two':
                for theta_sample in theta_samples:
                    projections.append(
                        self._compute_expected_ll(x, theta_sample))
            else:
                raise NotImplementedError

    return utils.to_gpu(torch.sqrt(1 / torch.FloatTensor([J]))) * torch.cat(
        projections, dim=1), torch.zeros(len(x))
def test_neg_loglik(self):
    """ Compute negative log-likelihood of test set. """
    self.q_params = self.all_variationals[-1][0]
    samples, _ = self.sample_q(self.config["bbb_nsamples"])
    results = np.apply_along_axis(
        lambda w: self.forward(self.X_test, weights=torch.Tensor(w)).numpy(),
        1, samples)
    means = torch.tensor(np.mean(results, axis=0))
    return -1 * MVN(means, self.config["sigma_noise"] *
                    torch.eye(self.Ydim)).log_prob(self.Y_test).sum()
def test_neg_loglik(self):
    """ Compute negative log-likelihood of test set. """
    results = np.apply_along_axis(
        lambda w: self.forward(self.X_test, weights=torch.Tensor(w)).numpy(),
        1, self.particles)
    means = torch.tensor(np.mean(results, axis=0))
    return -1 * MVN(means, self.config["sigma_noise"] *
                    torch.eye(self.Ydim)).log_prob(self.Y_test).sum()
def two_gaussians(n, covariance=[1, 0, 0, 1], transforms=[(lambda x: x), (lambda y: y)]):
    sampler = MVN(loc=torch.zeros(2),
                  covariance_matrix=torch.Tensor([covariance[0:2], covariance[2:4]]))
    X, Y = sampler.sample((n, )).t()
    X, Y = transforms[0](X), transforms[1](Y)
    return X.view(-1, 1), Y.view(-1, 1)
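# Example usage (a minimal sketch): `two_gaussians` only needs `torch` and `MVN`
# (torch.distributions.MultivariateNormal) in scope. The covariance is given
# row-major as [c11, c12, c21, c22] and must form a valid (symmetric, positive
# semi-definite) 2x2 covariance matrix; the particular numbers below are only
# illustrative.
import torch
from torch.distributions.multivariate_normal import MultivariateNormal as MVN

# Draw 1000 correlated pairs and squash Y through an elementwise nonlinearity.
X, Y = two_gaussians(1000,
                     covariance=[1.0, 0.8, 0.8, 1.0],
                     transforms=[(lambda x: x), torch.tanh])
print(X.shape, Y.shape)  # torch.Size([1000, 1]) torch.Size([1000, 1])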
def get_projections(self, data, J, projection='two', gamma=0, transform=None, **kwargs):
    """
    Get projections for ACS approximate procedure
    :param data: (Object) Data object to get projections for
    :param J: (int) Number of projections to use
    :param projection: (str) Type of projection to use (currently only 'two' supported)
    :return: (torch.tensor) Projections
    """
    ent = lambda py: torch.distributions.Categorical(probs=py).entropy()
    projections = []
    feat_x = []
    with torch.no_grad():
        mean, cov = self.linear._compute_posterior()
        # Add jitter to the covariance diagonal for numerical stability.
        jitter = to_gpu(torch.eye(len(cov)) * 1e-6)
        theta_samples = MVN(mean, cov + jitter).sample(torch.Size([J])).view(
            J, -1, self.linear.out_features)

        idx_lb = data.index['unlabeled']
        handler = DataHandler(X=data.X[idx_lb], Y=data.Y[idx_lb],
                              transform=self.args['transform'])
        dataloader = DataLoader(handler, shuffle=False, batch_size=256,
                                num_workers=0)
        for (x, _, _) in dataloader:
            x = to_gpu(x)
            feat_x.append(self.encode(x))

        feat_x = torch.cat(feat_x)
        py = self._compute_predictive_posterior(
            self.linear(feat_x, num_samples=100), logits=False)
        ent_x = ent(py)

        if projection == 'two':
            for theta_sample in theta_samples:
                projections.append(
                    self._compute_expected_ll(feat_x, theta_sample, py)
                    + gamma * ent_x[:, None])
        else:
            raise NotImplementedError

    return to_gpu(torch.sqrt(1 / torch.FloatTensor([J]))) * torch.cat(
        projections, dim=1), ent_x
def sample(self, y, n_samples=None, one_sample_per_y=False):
    """Sample from the fitted probabilistic CCA model.

    :param y:         Observations of shape (n_features, n_samples).
    :param n_samples: The number of samples.
    :return:          Two views of `n_samples` samples each.
    """
    k = 3 * self.latent_dim

    if one_sample_per_y:
        if n_samples and n_samples != y.shape[1]:
            msg = 'When sampling once per `y`, `n_samples` must be the ' \
                  'number of samples of `y`.'
            raise AttributeError(msg)
        n_samples = y.shape[1]

    Lambda, Psi_diag = self.tile_params()
    PLL_inv = LA.woodbury_inv(Psi_diag, Lambda, Lambda.t(), k)
    z = self.E_z_given_y(Lambda, PLL_inv, y)

    m = Lambda @ z
    m1 = m[:self.p1]
    m2 = m[self.p1:]

    y1r = torch.empty(self.p1, n_samples, device=device)
    y2r = torch.empty(self.p2, n_samples, device=device)

    for i in range(n_samples):
        if one_sample_per_y:
            # Sample based on the estimated mean for the current `y`.
            j = i
        else:
            # Sample based on a randomly chosen latent variable.
            j = random.randint(0, z.shape[1] - 1)
        y1r[:, i] = MVN(m1[:, j], diag(exp(self.log_Psi1_diag))).sample()
        y2r[:, i] = MVN(m2[:, j], diag(exp(self.log_Psi2_diag))).sample()

    return y1r.t(), y2r.t()
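# `LA.woodbury_inv` is defined elsewhere and is not shown here. The sketch below
# (a hypothetical helper, not the project's actual implementation) illustrates
# what such a routine typically computes for the pCCA posterior: the inverse of
# Psi + Lambda Lambda^T with diagonal Psi, via the Woodbury identity
#   (Psi + Lambda Lambda^T)^{-1}
#     = Psi^{-1} - Psi^{-1} Lambda (I_k + Lambda^T Psi^{-1} Lambda)^{-1} Lambda^T Psi^{-1},
# which only requires solving a small k x k linear system.
import torch

def woodbury_inv_sketch(Psi_diag, Lambda, Lambda_t, k):
    """(Psi + Lambda Lambda^T)^{-1} for a diagonal Psi given by `Psi_diag`."""
    Psi_inv = torch.diag(1.0 / Psi_diag)                 # inverse of the diagonal part
    inner = torch.eye(k) + Lambda_t @ Psi_inv @ Lambda   # small (k x k) matrix
    return Psi_inv - Psi_inv @ Lambda @ torch.linalg.solve(inner, Lambda_t @ Psi_inv)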
def log_likelihood(self, batch_indices=None):
    """ Computes log-likelihood term. """
    if batch_indices is None:
        batch = self.X_train
        target = self.Y_train
        multiplier = 1
    else:
        batch = self.X_train[batch_indices]
        target = self.Y_train[batch_indices]
        multiplier = (self.N_train / len(batch_indices))

    means = self.forward(X=batch)
    if self.Ydim == 1:
        return multiplier * self.noise_dist.log_prob(means - target).sum()
    return multiplier * MVN(
        means, self.config["sigma_noise"] *
        torch.eye(self.Ydim)).log_prob(target).sum()
def log_likelihood(self, batch_indices=None):
    """ Computes the log-likelihood. """
    if batch_indices is None:
        batch = self.X_train
        target = self.Y_train
        multiplier = 1
    else:
        batch = self.X_train[batch_indices]
        target = self.Y_train[batch_indices]
        multiplier = (self.N_train / len(batch_indices))

    means = self.forward(X=batch)
    if self.Ydim == 1:
        return multiplier * Normal(
            0, self.sigma_noise).log_prob(means - target).sum()
    return multiplier * MVN(
        means, self.sigma_noise * torch.eye(self.Ydim)).log_prob(target).sum()
def positive_gaussian_cocp(self):
    """
    Conditional output-constrained prior: mixture of Gaussians.
    Assumes uniform mixing weights for each mixture and isotropic Gaussians.
    """
    nn_mean = self.forward(X=self._cr_pos_xsamples)
    index = 0
    log_prob = torch.tensor(0.0)
    for i, (dom, ifunc) in enumerate(
            self.dconstraints['positive_gaussian_cocp']):
        sub_nsamples = self._cr_ylens[i] * self.ocp_nsamples
        dist = MVN(self._cr_pos_ysamples[index:index + sub_nsamples, :],
                   self.cocp_gaussian_sigma_c * torch.eye(self.Ydim)).log_prob(
                       nn_mean[index:index + sub_nsamples, :])
        # Uniform mixing weights: add log(1 / number of mixture components).
        dist += torch.log(torch.tensor(1 / self._cr_ylens[i]))
        log_prob += torch.logsumexp(
            torch.stack(dist.split(self.ocp_nsamples), dim=0), dim=0).sum()
        index += sub_nsamples
    return log_prob
def nce_test():
    """ Test implementation of NCE for a Gaussian. """
    # Specify data size.
    data_dim = 5
    Td = 100000
    noise_ratio = 50
    Tn = Td * noise_ratio
    Td_batch = 1000
    Tn_batch = Td_batch * noise_ratio

    # Create Pd and generate artificial data.
    cov_base = th.tensor(make_spd_matrix(data_dim), dtype=th.float)
    tril_mat = th.tril(cov_base)
    cov_mat = th.matmul(tril_mat, tril_mat.t())
    true_c = -0.5 * th.log(th.abs(th.det(cov_mat))) - \
        (data_dim / 2) * th.log(2 * th.tensor(np.pi))
    p_data = MVN(th.zeros(data_dim), scale_tril=tril_mat)
    data_labels = th.ones(Td)
    data_sample = th.utils.data.TensorDataset(p_data.sample((Td, )), data_labels)
    data_loader = th.utils.data.DataLoader(data_sample, batch_size=Td_batch,
                                           shuffle=True)

    # Specify noise parameters for later use.
    noise_cov_mat = th.eye(data_dim)

    # Set up the model to be estimated.
    cov_model = th.tensor(make_spd_matrix(data_dim), dtype=th.float)
    tril_mat_model = th.tril(cov_model)
    model = UnnormMVGaussian(th.zeros(data_dim), scale_tril=tril_mat_model)
    model.scale_tril.requires_grad = True
    model.normalizing_constant.requires_grad = True

    # Set up optimization parameters.
    start_epoch = 0
    end_epoch = 1000
    start_lr = 0.001
    momentum = 0.9
    decay_epochs = [50, 100, 250, 500, 750]
    decay_gamma = 0.1
    optimizer = th.optim.Adam([model.scale_tril, model.normalizing_constant],
                              lr=start_lr)
    lr_sched = th.optim.lr_scheduler.MultiStepLR(optimizer,
                                                 milestones=decay_epochs,
                                                 gamma=decay_gamma)

    # Train.
    for epoch in range(start_epoch, end_epoch):
        print(epoch)
        for i, (data_batch, data_labels) in enumerate(data_loader):
            # Sample noise data for the current input batch.
            noise_distr = MVN(th.zeros(data_dim), noise_cov_mat)
            noise_batch = noise_distr.sample((Tn_batch, ))
            noise_labels = th.zeros(Tn_batch)

            # Combine data and noise samples.
            joint_batch = th.cat((data_batch, noise_batch), 0)
            joint_labels = th.cat((data_labels, noise_labels), 0)

            # Forward pass.
            log_P_model = model.log_prob(joint_batch)
            log_P_noise = noise_distr.log_prob(joint_batch)
            log_P_diff = log_P_model - log_P_noise + 1e-20
            loss = NCE_loss(log_P_diff, joint_labels, Td_batch, noise_ratio)
            print(loss.item(), true_c.item(), model.normalizing_constant.item())
            print(F.mse_loss(model.scale_tril, p_data.scale_tril))

            # Backward pass.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            noise_cov_mat = th.chain_matmul(model.scale_tril.detach(),
                                            model.scale_tril.detach().t())

        # Step the LR scheduler once per epoch, after the optimizer updates.
        lr_sched.step()
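# `NCE_loss` and `UnnormMVGaussian` live elsewhere in this codebase and are not
# shown here. For reference only, the sketch below is one standard way to write
# the NCE objective given the log density ratio `log_P_diff`, binary labels
# (1 = data, 0 = noise), and the noise ratio nu, using the posterior
# P(data | u) = sigmoid(log p_model(u) - log p_noise(u) - log nu). The actual
# signature and normalization used by `NCE_loss` may differ.
import torch as th
import torch.nn.functional as F

def nce_loss_sketch(log_P_diff, labels, Td_batch, noise_ratio):
    # Logit of the probability that a sample came from the data distribution.
    logits = log_P_diff - th.log(th.tensor(float(noise_ratio)))
    # Binary cross-entropy over data and noise samples, normalized by the
    # number of data points in the batch.
    return F.binary_cross_entropy_with_logits(logits, labels,
                                              reduction='sum') / Td_batch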
def vdapc_loss(latent_dist, latent_sample, latent_mask, T, cov, post_L,
               alpha=0., beta=0., gamma=1., zeta=1.):
    batch_size, seq_len, d = latent_sample.shape

    # Junwen: compute the log prob terms for each 3*3 block.
    # Weiran: sample across different utts.
    latent_mu = latent_dist[0].reshape(-1, d)
    latent_logvar = latent_dist[1].reshape(-1, d)
    mask = latent_mask.reshape(-1).float()
    # This gives indices of valid samples.
    idx = mask.nonzero()[:, 0]
    if idx.shape[0] > 2000:
        step = idx.shape[0] // 2000
        latent_mu_sub = latent_mu[idx[::step], :]
        latent_logvar_sub = latent_logvar[idx[::step], :]
    else:
        latent_mu_sub = latent_mu[idx, :]
        latent_logvar_sub = latent_logvar[idx, :]

    block_log_pz, block_log_qz, block_log_prod_qzi, block_log_q_zCx = \
        _get_log_pz_qz_prodzi_qzCx(
            latent_sample.reshape(-1, d), (latent_mu, latent_logvar),
            (latent_mu_sub, latent_logvar_sub))

    block_mi_loss = torch.sum(
        (block_log_q_zCx - block_log_qz) * mask) / torch.sum(mask)
    block_tc_loss = torch.sum(
        (block_log_qz - block_log_prod_qzi) * mask) / torch.sum(mask)
    block_kl_loss = torch.sum(
        (block_log_prod_qzi - block_log_pz) * mask) / torch.sum(mask)

    # Junwen: compute the log prob terms for each 24*24 block.
    latent_sample_2T = latent_sample.reshape(batch_size, seq_len * d).unfold(
        1, 2 * T * d, d).reshape(-1, 2 * T * d)
    latent_mu = latent_mu.reshape(batch_size, seq_len * d).unfold(
        1, 2 * T * d, d).reshape(-1, 2 * T * d)
    latent_logvar = latent_logvar.reshape(batch_size, seq_len * d).unfold(
        1, 2 * T * d, d).reshape(-1, 2 * T * d)
    mask = latent_mask.reshape(batch_size, seq_len).unfold(
        1, 2 * T, 1).reshape(-1, 2 * T).all(1).float()

    log_q_zCx = log_density_gaussian(latent_sample_2T, latent_mu,
                                     latent_logvar).sum(1)

    mvn = MVN(torch.zeros(2 * T * d, device=cov.device), covariance_matrix=cov)
    latent_sample_2T = post_L(latent_sample_2T)
    log_pz = mvn.log_prob(latent_sample_2T)
    kl_loss = torch.sum((log_q_zCx - log_pz) * mask) / torch.sum(mask)

    # The total loss can be an arbitrary combination of the different terms
    # for the different-sized blocks.
    loss = alpha * block_mi_loss + beta * block_tc_loss + \
        gamma * block_kl_loss + zeta * kl_loss

    print("vae losses: block_mi_loss=%f, block_tc_loss=%f, block_kl_loss=%f, kl_loss=%f"
          % (block_mi_loss, block_tc_loss, block_kl_loss, kl_loss))
    return loss
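# `log_density_gaussian` is a helper defined elsewhere; the sketch below shows
# the diagonal-Gaussian log-density it is assumed to compute (elementwise, so
# that the `.sum(1)` above aggregates over latent dimensions). The project's
# actual helper may differ in signature or broadcasting.
import math
import torch

def log_density_gaussian_sketch(x, mu, logvar):
    """Elementwise log N(x; mu, exp(logvar)) for a diagonal Gaussian."""
    inv_var = torch.exp(-logvar)
    return -0.5 * (math.log(2 * math.pi) + logvar + (x - mu) ** 2 * inv_var)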
def two_gaussians(n, covariance=[1, 0, 0, 1]):
    sampler = MVN(loc=torch.zeros(2),
                  covariance_matrix=torch.Tensor([covariance[0:2], covariance[2:4]]))
    X, Y = sampler.sample((n, )).t()
    return X.view(-1, 1), Y.view(-1, 1)