Example #1
with torch.autograd.set_detect_anomaly(True):
    net.train(X_tr,
              T_tr,
              scheduler=scheduler,
              epochs=args.epochs,
              lr=0.1,
              warm_up=10,
              MC_samples=10)

# Run inference with the model
prediction_train = net.predictive(X_tr, predictive_samples)
prediction_test = net.predictive(X_te, predictive_samples)

# Some evaluation metrics
ECEtrain, _, _, _ = compute_calibration_measures(prediction_train,
                                                 T_tr,
                                                 apply_softmax=False,
                                                 bins=15)
ECEtest, _, _, _ = compute_calibration_measures(prediction_test,
                                                T_te,
                                                apply_softmax=False,
                                                bins=15)
ACCtrain = (float(
    (prediction_train.argmax(1) == T_tr).sum()) * 100.) / float(T_tr.size(0))
ACCtest = (float(
    (prediction_test.argmax(1) == T_te).sum()) * 100.) / float(T_te.size(0))

print("VI BNN net \t train acc {:.3f} \t test acc {:.3f}".format(
    ACCtrain, ACCtest))
print("ECE train {:.3f} ECE test {:.3f}".format(ECEtrain, ECEtest))

# Plot decision thresholds learned by the model
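
# A minimal sketch of such a plot, assuming 2-D inputs, matplotlib, and CPU tensors; the
# net.predictive / predictive_samples call mirrors the inference above, but the plotting
# code itself is illustrative and not part of the original script.
import matplotlib.pyplot as plt

x_min, x_max = float(X_tr[:, 0].min()) - 1., float(X_tr[:, 0].max()) + 1.
y_min, y_max = float(X_tr[:, 1].min()) - 1., float(X_tr[:, 1].max()) + 1.
xx, yy = torch.meshgrid(torch.linspace(x_min, x_max, 200),
                        torch.linspace(y_min, y_max, 200),
                        indexing='ij')
grid = torch.stack([xx.reshape(-1), yy.reshape(-1)], dim=1)
with torch.no_grad():
    grid_pred = net.predictive(grid, predictive_samples).argmax(1).reshape(xx.shape)

plt.contourf(xx.numpy(), yy.numpy(), grid_pred.numpy(), alpha=0.3)        # predicted class regions
plt.scatter(X_tr[:, 0].numpy(), X_tr[:, 1].numpy(), c=T_tr.numpy(), s=5)  # training points
plt.title("Decision regions learned by the VI BNN")
plt.show()
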
Example #2
    def test_log_likelihood(self, X: torch.Tensor, Y: torch.Tensor, return_moments: bool, Y_std: float, S_MC_NNet: int = None) -> torch.Tensor:
        """ Computes the predictive log likelihood
                \log p(Y*|X*) = \log \int p(y*|G(f*),C_y) q(f*,f|u) q(u) df* df du
                   -> We take the diagonal of C_Y as samples are assumed to be i.i.d.
                   -> The integral can be approximated either with Monte Carlo or with quadrature. This function uses quadrature.

                Args:
                        `X`                 (torch.Tensor) :->: Input locations. Shape (MB,Dx) or shape (Dy,MB,Dx)
                        `Y`                 (torch.Tensor) :->: Ground truth labels. Shape (MB,Dy)
                        `return_moments`    (bool)         :->: If True, also return the first and second moments of the predictive distribution.
                        `Y_std`             (float)        :->: Standard deviation of the regressed variable. Used to re-scale the output.
                        `S_MC_NNet`         (int)          :->: Number of samples from the dropout distribution when fully_bayesian is True.

                Returns:
                        `log_p_y`           (torch.Tensor) :->: Log probability of each of the outputs, as a tensor of shape (Dy,)
                        `predictive_params` (list)         :->: If return_moments is True, a list with the mean and variance of the predictive distribution. This is done in this
                                                                function because some test log likelihoods need the predictive distribution, so support is given for any likelihood.
                                                                Moments have shape (Dy,MB,1)
        """
        MB = X.size(0)
        Dx = X.size(1)
        Dy = self.out_dim
        
        X_run  = X  # the rest of the function uses X_run, so it is easiest to just alias X here.
        if len(X_run.shape) == 2:
            X_run = X_run.repeat(self.out_dim,1,1) 
        assert len(X_run.shape) == 3, 'Invalid input X.shape'

        self.eval() # set parameters for eval mode. Batch normalization, dropout etc
        if self.fully_bayesian:
            # activate dropout if required
            is_dropout = enable_eval_dropout(self.modules())
            assert is_dropout, "You set the model to fully Bayesian mode but there are no dropout layers in your model. This error is raised because otherwise the code would silently run in non-Bayesian mode"
            assert S_MC_NNet is not None, "The parameter S_MC_NNet was left at its default of None, which is invalid when self.fully_bayesian is True"

        with torch.no_grad():

            ## ================================================ ##
            ## =========== GAUSSIAN LIKELIHOOD ================ ##
            ## == with non linear mean
            if isinstance(self.likelihood,GaussianNonLinearMean):
                # retrieve the noise and expand
                log_var_noise = self.likelihood.log_var_noise
                if self.likelihood.noise_is_shared:
                    log_var_noise = self.likelihood.log_var_noise.expand(Dy,1)

                ## ================================================== ##
                ## === Compute moments of predictive distribution === ##
                #  In this model this is not necessary for computing the log likelihood.
                #  However, we give the option of returning these parameters to be consistent
                #  with the standard GP.
                predictive_params = None
                if return_moments:
                    m1,m2, mean_q_f, cov_q_f = self.predictive_distribution(X_run, diagonal = True, S_MC_NNet = S_MC_NNet)
                    predictive_params = [m1,m2]
                else:
                    mean_q_f, cov_q_f = self.marginal_variational_qf_parameters(X_run, diagonal = True, is_duvenaud = False, init_Z = None)
                mean_q_f, cov_q_f = mean_q_f.squeeze(dim = -1),cov_q_f.squeeze(dim = -1)

                self.eval()
                if self.fully_bayesian:
                    ## Call self.eval() again, as self.predictive_distribution calls self.train() before returning
                    is_dropout = enable_eval_dropout(self.modules())
                    assert is_dropout, "You set the model to fully Bayesian mode but there are no dropout layers in your model. This error is raised because otherwise the code would silently run in non-Bayesian mode"

                ## Common functions used by bayesian and non bayesian flows
                def get_quad_weights_shifted_locations(mean_q_f,cov_q_f):
                    ## Get the quadrature points and the weights
                    locations = self.likelihood.quadrature_distribution.locations
                    locations = _pad_with_singletons(locations, num_singletons_before=0, num_singletons_after = mean_q_f.dim())
                    shifted_locs = torch.sqrt(2.0 * cov_q_f) * locations + mean_q_f # Shape (S_quad,Dy,S,MB)

                    weights = self.likelihood.quadrature_distribution.weights
                    weights = _pad_with_singletons(weights, num_singletons_before=0, num_singletons_after = shifted_locs.dim() - 1) # Shape (S_quad,1,1,1)

                    return shifted_locs, weights

                def compute_log_lik(Y,Y_std,shifted_locs,C_Y):
                    ## Re-scale by Y_std, as is commonly done to compare results on the UCI benchmarks
                    Y   = Y_std*Y
                    m_Y = Y_std*shifted_locs
                    C_Y = (Y_std*torch.sqrt(C_Y))**2

                    log_p_y = batched_log_Gaussian( Y, m_Y, C_Y, diagonal = True, cov_is_inverse = False) # (S_quad,Dy,S_MC,MB)
                    
                    return log_p_y

                S_MC_NNet = 1 if not self.fully_bayesian else S_MC_NNet # Note that the estimator is the same for the input-dependent and Bayesian cases; we just expand this dimension or not.
                                                                        
                S_quad = self.quad_points 
                G_mat  = self.G_matrix

                # noise retrieve and reshape
                C_Y = torch.exp(log_var_noise).expand(-1,MB).view(Dy,1,MB,1).repeat((S_quad,1,S_MC_NNet,1,1)) # (S_quad,Dy,S_MC_NNet,MB,1). Add an extra trailing dimension of size 1 so that we can compute
                                                                                                                  #                           the likelihood using the batched_log_Gaussian function
                # observation reshape
                Y = Y.t().view(1,Dy,1,MB,1).repeat((S_quad,1,S_MC_NNet,1,1))   # S,Dy,S_MC_NNet,MB,1

                # Y_std reshape
                Y_std = Y_std.view(1,Dy,1,1,1).repeat(S_quad,1,S_MC_NNet,MB,1) # S,Dy,S_MC_NNet,MB,1

                # this operation could be done by repeating X and computing mean_q_f as in the DGP, but there is no need for the extra computation here since X is constant: just repeat.
                mean_q_f, cov_q_f = mean_q_f.unsqueeze(dim = 1),cov_q_f.unsqueeze(dim = 1) # add a sample dimension so that we can warp; the trailing singleton (removed above) is added back later for batched_log_Gaussian
                mean_q_f = mean_q_f.repeat(1,S_MC_NNet,1) # (Dy,S_MC_NNet,MB)
                cov_q_f  = cov_q_f.repeat(1,S_MC_NNet,1)

                ## =================================== ##
                ## === Compute test log likelihood === ##
                shifted_locs, weights =  get_quad_weights_shifted_locations(mean_q_f,cov_q_f)

                ## Warp quadrature points
                # expand X to perform MC dropout over NNets parameters
                X_run = X_run.unsqueeze(dim = 1).repeat(1,S_MC_NNet,1,1) # Just add one extra dimension. No need to repeat over S_quad since pytorch broadcasts it automatically.
                                                                         # It is important to repeat over S_MC_NNet: this way each forward pass through X computes a different
                                                                         # MC sample of the flow parameters. Otherwise pytorch would broadcast S_MC_NNet as well, and we would
                                                                         # only be using one sample from the posterior over W.
                for idx,fl in enumerate(G_mat):
                    shifted_locs[:,idx,:,:] = fl(shifted_locs[:,idx,:,:],X_run[idx]) # (S_quad,Dy,S_MC_NNet,MB)

                shifted_locs = shifted_locs.view(S_quad,Dy,S_MC_NNet,MB,1) # shape (S_quad,Dy,S,MB,1)

                log_p_y = compute_log_lik(Y,Y_std,shifted_locs,C_Y)

                if self.fully_bayesian: # the only difference between the Bayesian case and the rest is here, where we perform a double integration

                    # Reduce with a double logsumexp operation. Check the estimator here: @TODO: add link once we release the github repo
                    reduce_lse = torch.log(weights)  + log_p_y
                    log_p_y = torch.logsumexp( torch.logsumexp(reduce_lse, dim = 0) -0.5*torch.log(cg.pi) ,dim = 1).sum(1) - MB*numpy.log(S_MC_NNet)
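                    # Spelled out, the reduction above estimates, per output dimension d,
                    #   log p(Y_d) ~= sum_n [ logsumexp_s( logsumexp_q( log w_q + log N(y_n | m_{q,s,n}, c_n) ) - 0.5*log(pi) ) - log(S_MC_NNet) ],
                    # i.e. Gauss-Hermite quadrature integrates over f while the outer logsumexp
                    # averages the S_MC_NNet dropout samples of the warping networks' parameters.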
                else:
                    # Note that we just need to remove the extra dimension we added for using the same code
                    log_p_y = log_p_y.squeeze(dim = 2)
                    weights = weights.squeeze(dim = 2)
        
                    ## Reduce log ws + log_p_y_s using logsumexp trick. Also reduce MB and add the constant
                    reduce_lse = torch.log(weights) + log_p_y
                    log_p_y = (torch.logsumexp(reduce_lse, dim = 0)).sum(-1) - 0.5*MB*torch.log(cg.pi)

            ## ===================
            ## == with linear mean
            elif isinstance(self.likelihood,GaussianLinearMean):
                ## ================================================== ##
                ## === Compute moments of predictive distribution === ##
                m_Y,K_Y, mean_q_f, cov_q_f = self.predictive_distribution(X_run, diagonal = True)

                ## =================================== ##
                ## === Compute test log likelihood === ##
                # Re-scale Y_std
                Y = Y.t() # (Dy,MB)
                Y_std = Y_std.view(self.out_dim,1) # (Dy,1)

                log_p_y = batched_log_Gaussian( obs = Y_std*Y, mean = Y_std*m_Y, cov = (Y_std*torch.sqrt(K_Y))**2, diagonal = True, cov_is_inverse = False)

                predictive_params = None
                if return_moments:
                    predictive_params = [m_Y,K_Y]

            ## =============================================================== ##
            ## ============ BERNOULLI/CATEGORICAL LIKELIHOOD ================= ##
            elif isinstance(self.likelihood,MulticlassCategorical) or isinstance(self.likelihood,Bernoulli):

                # Exact integration is not possible here, so whether we warp or not the procedure is very similar to GP classification.
                # The only special case is binary classification, which uses a Gauss CDF link function.
                m_Y, _, mean_q_f, cov_q_f = self.predictive_distribution(X_run,diagonal = True, S_MC_NNet = S_MC_NNet)

                check = torch.logical_not(torch.isfinite(m_Y)).float()
                assert check.sum() == 0.0, "Got saturated probabilities"

                if isinstance(self.likelihood,Bernoulli): # reshape the output as if it came from the MulticlassCategorical likelihood so that this is transparent to the trainer
                    m_Y     = m_Y.squeeze() 
                    neg_m_Y = 1.0-m_Y # compute the probability of class 0
                    m_Y     = torch.stack((neg_m_Y,m_Y),dim = 1) 

                _, _ , _ , log_p_y = compute_calibration_measures(m_Y.float() ,Y ,apply_softmax = False ,bins = 15)  

                log_p_y = -1*((log_p_y*MB).sum()) # compute_calibration_measures returns log_p_y.mean(), so we undo the mean by multiplying by MB and summing

                predictive_params = None
                if return_moments:
                    predictive_params = [m_Y]

            else:
                raise ValueError("Unsupported likelihood [{}] for class [{}]".format(type(self.likelihood),type(self)))

        self.train() # set parameters for train mode. Batch normalization, dropout etc
        return log_p_y, predictive_params
m1_tr, _, _, _ = model.predictive_distribution(
    X=X_tr, diagonal=True,
    S_MC_NNet=args.posterior_samples
)  # posterior_samples is used to integrate out the Bayesian warping functions.
# The number of Monte Carlo samples used to integrate out likelihoods that can't
# be integrated with quadrature is given by config.quad_points. You could modify
# e.g. the MulticlassCategorical likelihood to receive this parameter as an argument.

m1_te, _, _, _ = model.predictive_distribution(
    X=X_te, diagonal=True, S_MC_NNet=args.posterior_samples)

# for classification we use the first moment (the second moment is not meaningful here): \int p(y|Link(f)) q(f) df = 1/S sum_s p(y|Link(f_s)) ; f_s ~ q(f)
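# As a sketch only (with a hypothetical list f_samples of draws from q(f) and a softmax link),
# that Monte Carlo average is simply:
#   probs = torch.softmax(torch.stack(f_samples), dim=-1).mean(dim=0)
# predictive_distribution already performs this internally, so it is not repeated here.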

# Some evaluation metrics
T_tr = T_tr.view(-1)
T_te = T_te.view(-1)

ECEtrain, _, _, _ = compute_calibration_measures(m1_tr.float(),
                                                 T_tr,
                                                 apply_softmax=False,
                                                 bins=15)
ECEtest, _, _, _ = compute_calibration_measures(m1_te.float(),
                                                T_te,
                                                apply_softmax=False,
                                                bins=15)

ACCtrain = (float(
    (m1_tr.argmax(1) == T_tr).sum()) * 100.) / float(T_tr.size(0))
ACCtest = (float((m1_te.argmax(1) == T_te).sum()) * 100.) / float(T_te.size(0))

m = args.model if args.model != 'ID_TGP' else 'Point Estimate TGP'
print("{} \t train acc {:.3f} \t test acc {:.3f}".format(m, ACCtrain, ACCtest))
print("ECE train {:.3f} ECE test {:.3f}".format(ECEtrain, ECEtest))

## Now compute Bayesian predictions for the input-dependent model