with torch.autograd.set_detect_anomaly(True):
    net.train(X_tr, T_tr, scheduler=scheduler, epochs=args.epochs, lr=0.1, warm_up=10, MC_samples=10)

# Inference in the model
prediction_train = net.predictive(X_tr, predictive_samples)
prediction_test = net.predictive(X_te, predictive_samples)

# Some evaluation metrics
ECEtrain, _, _, _ = compute_calibration_measures(prediction_train, T_tr, apply_softmax=False, bins=15)
ECEtest, _, _, _ = compute_calibration_measures(prediction_test, T_te, apply_softmax=False, bins=15)

ACCtrain = (float((prediction_train.argmax(1) == T_tr).sum()) * 100.) / float(T_tr.size(0))
ACCtest = (float((prediction_test.argmax(1) == T_te).sum()) * 100.) / float(T_te.size(0))

print("VI BNN net \t train acc {:.3f} \t test acc {:.3f}".format(ACCtrain, ACCtest))
print("ECE train {:.3f} ECE test {:.3f}".format(ECEtrain, ECEtest))

# Plot decision thresholds learned by the model
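# --------------------------------------------------------------------------------- #
# Illustrative sketch (an assumption, not the library's compute_calibration_measures,
# whose binning scheme and extra outputs may differ): an ECE-style metric computed from
# the averaged predictive probabilities above, using `bins` equal-width confidence bins.
def ece_sketch(probs, targets, bins=15):
    conf, pred = probs.max(dim=1)             # per-sample confidence and predicted class
    correct = pred.eq(targets).float()
    edges = torch.linspace(0.0, 1.0, bins + 1)
    ece = torch.zeros(1)
    for lo, hi in zip(edges[:-1], edges[1:]):
        mask = (conf > lo) & (conf <= hi)     # samples whose confidence falls in this bin
        if mask.any():
            gap = (correct[mask].mean() - conf[mask].mean()).abs()
            ece += gap * mask.float().mean()  # weight the bin gap by its relative frequency
    return ece

# e.g. ece_sketch(prediction_test, T_te) should be comparable to ECEtest above.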
def test_log_likelihood(self, X: torch.Tensor, Y: torch.Tensor, return_moments: bool, Y_std: float, S_MC_NNet: int = None) -> torch.Tensor:
    """ Computes the predictive log likelihood

            \log p(Y*|X*) = \log \int p(y*|G(f*),C_y) q(f*,f|u) q(u) df*,df,du

        -> We take the diagonal of C_Y as samples are assumed to be i.i.d.
        -> Integration can be approximated either with Monte Carlo or with quadrature. This function uses quadrature.

        Args:
            `X`              (torch.tensor) :->: Input locations. Shape (MB,Dx) or shape (Dy,MB,Dx)
            `Y`              (torch.tensor) :->: Ground truth labels. Shape (MB,Dy)
            `return_moments` (bool)         :->: If True, also return moments 1 and 2 of the predictive distribution.
            `Y_std`          (float)        :->: Standard deviation of the regressed variable. Used to re-scale the output.
            `S_MC_NNet`      (int)          :->: Number of samples from the dropout distribution if fully_bayesian is True.

        Returns:
            `log_p_y`           (torch.tensor) :->: Log probability of each of the outputs, with shape (Dy,)
            `predictive_params` (list)         :->: If return_moments is True, a list with the mean and variance of the
                                                    predictive distribution. This is done in this function because for some
                                                    test log likelihoods we need to compute the predictive distribution,
                                                    hence support is given for any likelihood. Moments have shape (Dy,MB,1)
    """
    MB = X.size(0)
    Dx = X.size(1)
    Dy = self.out_dim

    X_run = X  # the rest of the function uses X_run, so it is easier to rename it here
    if len(X_run.shape) == 2:
        X_run = X_run.repeat(self.out_dim, 1, 1)
    assert len(X_run.shape) == 3, 'Invalid input X.shape'

    self.eval()  # set parameters for eval mode: batch normalization, dropout etc.
    if self.fully_bayesian:  # activate dropout if required
        is_dropout = enable_eval_dropout(self.modules())
        assert is_dropout, "You set the model to be fully Bayesian but there are no dropout layers in your model. This is asserted because otherwise the code would silently run in non-Bayesian operating mode"
        assert S_MC_NNet is not None, "The default parameter S_MC_NNet was not provided and is set to None, which is invalid for self.be_bayesian"

    with torch.no_grad():

        ## ================================================ ##
        ## ============ GAUSSIAN LIKELIHOOD =============== ##
        ## == with non linear mean
        if isinstance(self.likelihood, GaussianNonLinearMean):
            # retrieve the noise and expand
            log_var_noise = self.likelihood.log_var_noise
            if self.likelihood.noise_is_shared:
                log_var_noise = self.likelihood.log_var_noise.expand(Dy, 1)

            ## ================================================== ##
            ## === Compute moments of predictive distribution === ##
            # In this model this is not needed to compute the log likelihood.
            # However, we give the option of returning these parameters to be consistent
            # with the standard GP.
            predictive_params = None
            if return_moments:
                m1, m2, mean_q_f, cov_q_f = self.predictive_distribution(X_run, diagonal=True, S_MC_NNet=S_MC_NNet)
                predictive_params = [m1, m2]
            else:
                mean_q_f, cov_q_f = self.marginal_variational_qf_parameters(X_run, diagonal=True, is_duvenaud=False, init_Z=None)
            mean_q_f, cov_q_f = mean_q_f.squeeze(dim=-1), cov_q_f.squeeze(dim=-1)

            self.eval()
            if self.fully_bayesian:  ## Call self.eval() again as self.predictive_distribution calls self.train() before returning
                is_dropout = enable_eval_dropout(self.modules())
                assert is_dropout, "You set the model to be fully Bayesian but there are no dropout layers in your model. This is asserted because otherwise the code would silently run in non-Bayesian operating mode"
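            # Worked identity behind the quadrature helpers defined below (explanatory note:
            # standard Gauss-Hermite quadrature, not new functionality). For a Gaussian
            # q(f) = N(f | m, v),
            #     \int p(y|f) N(f|m,v) df  ~=  (1/sqrt(pi)) * sum_i w_i * p(y | sqrt(2*v)*x_i + m),
            # where (x_i, w_i) are the Gauss-Hermite locations and weights. Taking logs,
            #     log p(y)  ~=  logsumexp_i( log w_i + log p(y | sqrt(2*v)*x_i + m) ) - 0.5*log(pi),
            # which is exactly the logsumexp reduction applied further below.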
            ## Common functions used by the Bayesian and non-Bayesian flows
            def get_quad_weights_shifted_locations(mean_q_f, cov_q_f):
                ## Get the quadrature points and the weights
                locations = self.likelihood.quadrature_distribution.locations
                locations = _pad_with_singletons(locations, num_singletons_before=0, num_singletons_after=mean_q_f.dim())
                shifted_locs = torch.sqrt(2.0 * cov_q_f) * locations + mean_q_f  # Shape (S_quad,Dy,S,MB)

                weights = self.likelihood.quadrature_distribution.weights
                weights = _pad_with_singletons(weights, num_singletons_before=0, num_singletons_after=shifted_locs.dim() - 1)  # Shape (S_quad,1,1,1)

                return shifted_locs, weights

            def compute_log_lik(Y, Y_std, shifted_locs, C_Y):
                ## Re-scale by Y_std, as is commonly done to compare on the UCI benchmarks
                Y = Y_std * Y
                m_Y = Y_std * shifted_locs
                C_Y = (Y_std * torch.sqrt(C_Y)) ** 2

                log_p_y = batched_log_Gaussian(Y, m_Y, C_Y, diagonal=True, cov_is_inverse=False)  # (S_quad,Dy,S_MC,MB)

                return log_p_y

            S_MC_NNet = 1 if not self.fully_bayesian else S_MC_NNet  # Note that the estimator is the same for the input-dependent and the Bayesian case; we just need to expand this dimension or not.

            S_quad = self.quad_points
            G_mat = self.G_matrix

            # noise: retrieve and reshape
            # (S_quad,Dy,S_MC_NNet,MB,1). Add an extra dimension of size 1 so that we can compute the
            # likelihood using the batched_log_Gaussian function.
            C_Y = torch.exp(log_var_noise).expand(-1, MB).view(Dy, 1, MB, 1).repeat((S_quad, 1, S_MC_NNet, 1, 1))

            # observation reshape
            Y = Y.t().view(1, Dy, 1, MB, 1).repeat((S_quad, 1, S_MC_NNet, 1, 1))  # (S_quad,Dy,S_MC_NNet,MB,1)

            # Y_std reshape
            Y_std = Y_std.view(1, Dy, 1, 1, 1).repeat(S_quad, 1, S_MC_NNet, MB, 1)  # (S_quad,Dy,S_MC_NNet,MB,1)

            # This operation could be done by repeating X and computing mean_q_f as in the DGP, but it is not necessary
            # to do extra computation here as X is constant: just repeat.
            mean_q_f, cov_q_f = mean_q_f.unsqueeze(dim=1), cov_q_f.unsqueeze(dim=1)  # Remove the last dimension so that we can warp; we add it back later for the batched log likelihood
            mean_q_f = mean_q_f.repeat(1, S_MC_NNet, 1)  # (Dy,S_MC_NNet,MB)
            cov_q_f = cov_q_f.repeat(1, S_MC_NNet, 1)

            ## =================================== ##
            ## === Compute test log likelihood === ##
            shifted_locs, weights = get_quad_weights_shifted_locations(mean_q_f, cov_q_f)

            ## Warp quadrature points
            # Expand X to perform MC dropout over the NNet parameters.
            X_run = X_run.unsqueeze(dim=1).repeat(1, S_MC_NNet, 1, 1)  # Just add one extra dimension; no need to repeat over S_quad as pytorch broadcasts automatically.
            # It is important to repeat over S_MC_NNet. In this way each forward through X computes a different
            # MC sample of the flow parameters. Otherwise pytorch would broadcast over S_MC_NNet as well, hence we would
            # only be using one sample from the posterior over W.

            for idx, fl in enumerate(G_mat):
                shifted_locs[:, idx, :, :] = fl(shifted_locs[:, idx, :, :], X_run[idx])  # (S_quad,Dy,S_MC_NNet,MB)

            shifted_locs = shifted_locs.view(S_quad, Dy, S_MC_NNet, MB, 1)  # shape (S_quad,Dy,S,MB,1)

            log_p_y = compute_log_lik(Y, Y_std, shifted_locs, C_Y)

            if self.fully_bayesian:
                # The only difference between the Bayesian case and the rest is here, where we perform a double integration.
                # Reduce with a double logsumexp operation.
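                # Explanatory note on the reduction performed just below (shapes inferred from the
                # reshapes above): reduce_lse has shape (S_quad, Dy, S_MC_NNet, MB).
                #   logsumexp over dim 0 -> Gauss-Hermite sum over quadrature points,
                #   - 0.5*log(pi)        -> quadrature normalisation constant,
                #   logsumexp over dim 1 -> sum over the S_MC_NNet dropout samples of the warping
                #                           networks (turned into an average by the final
                #                           - MB*log(S_MC_NNet) term),
                #   .sum(1) over MB      -> sum of i.i.d. log likelihoods over the minibatch.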
                # Check the estimator here: @TODO: add link once we release the GitHub repo
                reduce_lse = torch.log(weights) + log_p_y
                log_p_y = torch.logsumexp(torch.logsumexp(reduce_lse, dim=0) - 0.5 * torch.log(cg.pi), dim=1).sum(1) - MB * numpy.log(S_MC_NNet)
            else:
                # Note that we just need to remove the extra dimension we added so that the same code can be shared
                log_p_y = log_p_y.squeeze(dim=2)
                weights = weights.squeeze(dim=2)

                ## Reduce log w_s + log_p_y_s using the logsumexp trick. Also reduce over MB and add the constant.
                reduce_lse = torch.log(weights) + log_p_y
                log_p_y = (torch.logsumexp(reduce_lse, dim=0)).sum(-1) - 0.5 * MB * torch.log(cg.pi)

        ## ===================
        ## == with linear mean
        elif isinstance(self.likelihood, GaussianLinearMean):
            ## ================================================== ##
            ## === Compute moments of predictive distribution === ##
            m_Y, K_Y, mean_q_f, cov_q_f = self.predictive_distribution(X_run, diagonal=True)

            ## =================================== ##
            ## === Compute test log likelihood === ##
            # Re-scale by Y_std
            Y = Y.t()                            # (Dy,MB)
            Y_std = Y_std.view(self.out_dim, 1)  # (Dy,1)
            log_p_y = batched_log_Gaussian(obs=Y_std * Y, mean=Y_std * m_Y, cov=(Y_std * torch.sqrt(K_Y)) ** 2, diagonal=True, cov_is_inverse=False)

            predictive_params = None
            if return_moments:
                predictive_params = [m_Y, K_Y]

        ## =============================================================== ##
        ## ============ BERNOULLI/CATEGORICAL LIKELIHOOD ================= ##
        elif isinstance(self.likelihood, MulticlassCategorical) or isinstance(self.likelihood, Bernoulli):
            # As we can't do exact integration here, whether we warp or not the procedure is very similar to GP
            # classification. The only difference is binary classification, which uses the Gauss CDF link function.
            m_Y, _, mean_q_f, cov_q_f = self.predictive_distribution(X_run, diagonal=True, S_MC_NNet=S_MC_NNet)

            check = torch.logical_not(torch.isfinite(m_Y)).float()
            assert check.sum() == 0.0, "Got saturated probabilities"

            if isinstance(self.likelihood, Bernoulli):
                # Reshape the vector as if it came from the MulticlassCategorical likelihood so that this is transparent to the trainer
                m_Y = m_Y.squeeze()
                neg_m_Y = 1.0 - m_Y  # compute the probability of class 0
                m_Y = torch.stack((neg_m_Y, m_Y), dim=1)

            _, _, _, log_p_y = compute_calibration_measures(m_Y.float(), Y, apply_softmax=False, bins=15)
            log_p_y = -1 * ((log_p_y * MB).sum())  # compute_calibration_measures returns log_p_y.mean(), hence we undo that by multiplying by MB and then summing up

            predictive_params = None
            if return_moments:
                predictive_params = [m_Y]

        else:
            raise ValueError("Unsupported likelihood [{}] for class [{}]".format(type(self.likelihood), type(self)))

    self.train()  # set parameters for train mode: batch normalization, dropout etc.
    return log_p_y, predictive_params
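# --------------------------------------------------------------------------------- #
# Illustrative sketch (an assumption, not the library's batched_log_Gaussian, which
# also takes `diagonal` and `cov_is_inverse` flags): for the diagonal case used above,
# each log-density term is the standard Gaussian
#     log N(y | m, c) = -0.5 * ( log(2*pi*c) + (y - m)^2 / c ),
# summed over the last (event) dimension. torch and numpy imports are assumed from the
# surrounding module.
def diag_log_gaussian_sketch(obs, mean, cov):
    # obs, mean, cov: tensors of matching shape; cov holds the diagonal variances
    return (-0.5 * (numpy.log(2.0 * numpy.pi) + torch.log(cov) + (obs - mean) ** 2 / cov)).sum(-1)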
)  # posterior_samples is used to integrate out the Bayesian warping functions.
# The number of Monte Carlo samples used to integrate out likelihoods that can't
# be integrated with quadrature is given by config.quad_points. You can modify
# e.g. the MulticlassCategoricalLikelihood to receive this parameter as an argument.

# For classification we use the first moment (the second moment is not meaningful here):
# \int p(y|Link(f)) q(f) df = 1/S sum_s p(y | Link(f_s)) ; f_s ~ q(f)
m1_te, _, _, _ = model.predictive_distribution(X=X_te, diagonal=True, S_MC_NNet=args.posterior_samples)

# Some evaluation metrics
T_tr = T_tr.view(-1)
T_te = T_te.view(-1)

ECEtrain, _, _, _ = compute_calibration_measures(m1_tr.float(), T_tr, apply_softmax=False, bins=15)
ECEtest, _, _, _ = compute_calibration_measures(m1_te.float(), T_te, apply_softmax=False, bins=15)

ACCtrain = (float((m1_tr.argmax(1) == T_tr).sum()) * 100.) / float(T_tr.size(0))
ACCtest = (float((m1_te.argmax(1) == T_te).sum()) * 100.) / float(T_te.size(0))

m = args.model if args.model != 'ID_TGP' else 'Point Estimate TGP'
print("{} \t train acc {:.3f} \t test acc {:.3f}".format(m, ACCtrain, ACCtest))
print("ECE train {:.3f} ECE test {:.3f}".format(ECEtrain, ECEtest))

## Now compute Bayesian predictions for the input-dependent model
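# --------------------------------------------------------------------------------- #
# Illustrative sketch (not the input-dependent prediction code referenced by the
# comment above): the first-moment classification predictive mentioned earlier,
#     p(y = c | x) = \int p(y = c | Link(f)) q(f) df  ~=  1/S sum_s softmax(f_s)_c ,  f_s ~ q(f),
# estimated from Monte Carlo samples of the latent function for the categorical case
# (the Bernoulli branch uses a Gauss CDF link instead). The helper name and the
# (S, MB, num_classes) sample layout are assumptions.
def mc_class_probs_sketch(f_samples):
    # f_samples: latent function samples with shape (S, MB, num_classes)
    return torch.softmax(f_samples, dim=-1).mean(dim=0)  # (MB, num_classes) averaged probabilities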