def set_embedding(self, name: str, vector: torch.Tensor):
    device = flair.device
    if len(self._embeddings.keys()) > 0:
        device = next(iter(self._embeddings.values())).device
    self._embeddings[name] = vector.to(device, non_blocking=True)
def marginal_variational_qf_parameters(self, X: torch.Tensor, diagonal: bool, is_duvenaud: bool, init_Z: torch.Tensor = None) -> torch.Tensor:
    """ Marginal variational posterior q(f) = \int p(f|u) q(u) du

        q(f) = \int p(f|u) q(u) du
             = N(f | K_xz K_zz^{-1} (m - \mu_z) + \mu_x,
                     K_xx - K_xz K_zz^{-1} K_zx + [K_xz K_zz^{-1}] S [K_xz K_zz^{-1}]^T)

        Args:
            `X` (torch.Tensor)      :->: input locations where the marginal distribution q(f) is computed. Can have shape (S*MB,Dx) or (Dy,S*MB,Dx)
            `diagonal` (bool)       :->: If true, return only the diagonal covariance
            `is_duvenaud` (bool)    :->: Indicates whether we are using the Duvenaud mean function. Only useful in DGPs
            `init_Z` (torch.Tensor) :->: Only used if is_duvenaud = True. It is used to concatenate the input inducing points to the inducing points at each layer

        Returns:
            `mu_q_f`  (torch.Tensor) :->: shape (Dy,MB,1)
            `cov_q_f` (torch.Tensor) :->: shape (Dy,MB,1) if diagonal else (Dy,MB,MB)
    """
    ## ================================= ##
    ## ===== Pre-Compute Variables ===== ##
    if len(X.shape) == 2:
        # repeat here; if not, this operation will be done twice by marginal_qf_parameters and the likelihood to work batched and multioutput respectively
        X = X.repeat(self.out_dim, 1, 1)
    assert len(X.shape) == 3, 'Invalid input X.shape'

    Dy, MB, M = self.out_dim, X.size(1), self.M
    Z = self.Z
    kernel = self.covariance_function
    mean = self.mean_function

    if self.Z_is_shared:
        # In this case this repeat is not particularly needed because the kernel will repeat Z
        # when doing forward, both if batch_shape is out_dim or is 1 (self.kernel_is_shared True).
        # Keep it explicit for better understanding of the code.
        Z = Z.repeat(self.out_dim, 1, 1)

    # Concatenate inducing points if using the Duvenaud mean function
    if is_duvenaud:
        # z_concat = X[0,0:self.M,-1].view(self.M,1)
        init_Z = init_Z.view(1, self.M, -1).repeat(self.out_dim, 1, 1)
        Z = torch.cat((Z, init_Z), 2)

    K_xx = kernel(X, are_equal=True, diag=diagonal)
    mu_x = gpy.lazy.delazify(mean(X)).view(Dy, MB, 1)

    K_zz = kernel(Z, are_equal=False).evaluate()
    mu_z = gpy.lazy.delazify(mean(Z)).view(Dy, M, 1)

    K_xz = kernel(X, Z, are_equal=False).evaluate()
    # stabilize K_xz: in case Z = X we should add jitter if psd_safe_cholesky adds jitter to K_zz.
    # Jitter can only be added to square matrices.
    K_zx = torch.transpose(K_xz, 1, 2)  # pre-compute the transpose as it is required several times

    # Cholesky of K_zz
    L_zz, K_zz = psd_safe_cholesky(K_zz, upper=False, jitter=cg.global_jitter)  # the K_zz returned is the one with jitter added
    if self.is_whiten:
        L_zz_t = L_zz.transpose(1, 2)

    # variational distribution
    q_U = self.q_U
    m_q_U = q_U.variational_mean
    K_q_U = q_U.chol_variational_covar

    lower_mask = torch.ones(K_q_U.shape[-2:], dtype=cg.dtype, device=cg.device).tril(0)
    L_q_U = K_q_U.mul(lower_mask)
    K_q_U = torch.matmul(L_q_U, L_q_U.transpose(1, 2))
    m_q_U = m_q_U.view(Dy, M, -1)

    ## =================== ##
    ## ==== mean q(f) ==== ##
    if self.is_whiten:
        # mu_qf = K_{xz} [L_{zz}^T]^{-1} m_0 + \mu_x
        sol, _ = torch.triangular_solve(m_q_U, L_zz_t, upper=True)
        mu_q_f = torch.matmul(K_xz, sol) + mu_x
    else:
        # mu_qf = K_xz K_zz^{-1} (m - \mu_z) + \mu_x
        lhs = torch.cholesky_solve(m_q_U - mu_z, L_zz, upper=False)
        mu_q_f = torch.matmul(K_xz, lhs) + mu_x

    ## ========================= ##
    ## ==== covariance q(f) ==== ##
    ## Note:
    #  To compute the diagonal of q(f) we use the following identity,
    #  where @ indicates matrix product and .* element-wise product.
    #  For K_xz @ K_zz_inv @ K_zx the diagonal is:
    #       sum(K_zx .* [K_zz_inv @ K_zx], 0)
    #  In general the identity reads:
    #       diag(A @ B @ A^T) = sum(A^T .* [B @ A^T], 0)
    #  For the covariance note that: [K_xz K_zz_inv] S [K_xz K_zz_inv]^T = [K_zz_inv K_zx]^T S [K_zz_inv K_zx],
    #  where the output of the linear solver is sol = [K_zz_inv K_zx]. So we have sol^T S sol, and hence
    #  sum(sol .* [S @ sol], 0) to compute the diagonal.
    #  Note that as the operations are batched we have to reduce dimension 1 instead of dimension 0.
    #  Also use matmul to perform the batched operation.

    # sol = K_zz^{-1} @ K_zx
    sol = torch.cholesky_solve(K_zx, L_zz, upper=False)

    if self.is_whiten:
        # cov_qf = K_{xx} - K_{xz} K_{zz}^{-1} K_{zx} + K_{xz} {L_{zz}^T}^{-1} S L_{zz}^{-1} K_{zx}
        rhs, _ = torch.triangular_solve(K_zx, L_zz, upper=False)
        if diagonal:
            cov_q_f = K_xx - torch.sum(torch.mul(K_zx, sol), 1) + torch.sum(torch.mul(rhs, torch.matmul(K_q_U, rhs)), 1)
        else:
            cov_q_f = K_xx - torch.matmul(K_xz, sol) + torch.matmul(torch.matmul(torch.transpose(rhs, 1, 2), K_q_U), rhs)
    else:
        # cov_qf = K_{xx} - K_{xz} K_{zz}^{-1} K_{zx} + [K_{xz} K_{zz}^{-1}] S [K_{xz} K_{zz}^{-1}]^T
        if diagonal:
            cov_q_f = K_xx - torch.sum(torch.mul(K_zx, sol), 1) + torch.sum(torch.mul(sol, torch.matmul(K_q_U, sol)), 1)
        else:
            cov_q_f = K_xx - torch.matmul(K_xz, sol) + torch.matmul(torch.matmul(torch.transpose(sol, 1, 2), K_q_U), sol)

    if diagonal:
        cov_q_f = torch.unsqueeze(cov_q_f, 2)

    return mu_q_f, cov_q_f
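# Minimal standalone sketch (hypothetical shapes, not part of the class above) verifying the
# diagonal identity used in the covariance computation:
#     diag(A @ B @ A^T) == sum(A^T .* (B @ A^T), dim=0),
# which avoids ever forming the full n x n matrix.
import torch

A = torch.randn(5, 3)
B = torch.randn(3, 3)
B = B @ B.t()  # make B symmetric PSD, as S and K_zz^{-1} are

dense_diag = torch.diagonal(A @ B @ A.t())          # forms the full 5 x 5 matrix
cheap_diag = torch.sum(A.t() * (B @ A.t()), dim=0)  # never forms the 5 x 5 matrix

assert torch.allclose(dense_diag, cheap_diag, atol=1e-4)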
def test_log_likelihood(self, X: torch.Tensor, Y: torch.Tensor, return_moments: bool, Y_std: float, S_MC_NNet: int = None) -> torch.Tensor:
    """ Computes the predictive log likelihood

        \log p(Y*|X*) = \log \int p(y*|G(f*),C_y) q(f*,f|u) q(u) df* df du

        -> We take the diagonal of C_Y as samples are assumed to be i.i.d.
        -> Integration can be approximated either with Monte Carlo or with quadrature. This function uses quadrature.

        Args:
            `X` (torch.Tensor)      :->: Input locations. Shape (MB,Dx) or shape (Dy,MB,Dx)
            `Y` (torch.Tensor)      :->: Ground truth labels. Shape (MB,Dy)
            `return_moments` (bool) :->: If true, also return moments 1 and 2 of the predictive distribution.
            `Y_std` (float)         :->: Standard deviation of the regressed variable. Used to re-scale the output.
            `S_MC_NNet` (int)       :->: Number of samples from the dropout distribution if fully_bayesian is true

        Returns:
            `log_p_y` (torch.Tensor)  :->: Log probability of each of the outputs, a tensor of shape (Dy,)
            `predictive_params` (list) :->: If return_moments is True, a list with the mean and variance of the predictive distribution.
                                            This is done in this function because for some test log likelihoods we need to compute the
                                            predictive anyway; hence support is given for any likelihood. Moments have shape (Dy,MB,1).
    """
    MB = X.size(0)
    Dx = X.size(1)
    Dy = self.out_dim

    X_run = X  # the rest of the function uses X_run, so it is easier to rename here.
    if len(X_run.shape) == 2:
        X_run = X_run.repeat(self.out_dim, 1, 1)
    assert len(X_run.shape) == 3, 'Invalid input X.shape'

    self.eval()  # set parameters for eval mode: batch normalization, dropout, etc.
    if self.fully_bayesian:  # activate dropout if required
        is_dropout = enable_eval_dropout(self.modules())
        assert is_dropout, "You set the model to work fully Bayesian but there are no dropout layers in your model. This is asserted because otherwise the code would run in non-Bayesian operating mode"
        assert S_MC_NNet is not None, "The default parameter S_MC_NNet is not provided and set to default None, which is invalid for self.be_bayesian"

    with torch.no_grad():

        ## =============================================== ##
        ## =========== GAUSSIAN LIKELIHOOD =============== ##
        ## == with non linear mean
        if isinstance(self.likelihood, GaussianNonLinearMean):
            # retrieve the noise and expand
            log_var_noise = self.likelihood.log_var_noise
            if self.likelihood.noise_is_shared:
                log_var_noise = self.likelihood.log_var_noise.expand(Dy, 1)

            ## ================================================== ##
            ## === Compute moments of predictive distribution === ##
            # In this model this is not necessary to compute the log likelihood.
            # However, we give the option of returning these parameters to be consistent
            # with the standard GP.
            predictive_params = None
            if return_moments:
                m1, m2, mean_q_f, cov_q_f = self.predictive_distribution(X_run, diagonal=True, S_MC_NNet=S_MC_NNet)
                predictive_params = [m1, m2]
            else:
                mean_q_f, cov_q_f = self.marginal_variational_qf_parameters(X_run, diagonal=True, is_duvenaud=False, init_Z=None)
            mean_q_f, cov_q_f = mean_q_f.squeeze(dim=-1), cov_q_f.squeeze(dim=-1)

            self.eval()
            if self.fully_bayesian:  # call self.eval() again as self.predictive_distribution calls self.train() before returning
                is_dropout = enable_eval_dropout(self.modules())
                assert is_dropout, "You set the model to work fully Bayesian but there are no dropout layers in your model. This is asserted because otherwise the code would run in non-Bayesian operating mode"

            ## Common functions used by the Bayesian and non-Bayesian flows
            def get_quad_weights_shifted_locations(mean_q_f, cov_q_f):
                ## Get the quadrature points and the weights
                locations = self.likelihood.quadrature_distribution.locations
                locations = _pad_with_singletons(locations, num_singletons_before=0, num_singletons_after=mean_q_f.dim())
                shifted_locs = torch.sqrt(2.0 * cov_q_f) * locations + mean_q_f  # Shape (S_quad,Dy,S,MB)

                weights = self.likelihood.quadrature_distribution.weights
                weights = _pad_with_singletons(weights, num_singletons_before=0, num_singletons_after=shifted_locs.dim() - 1)  # Shape (S_quad,1,1,1)

                return shifted_locs, weights

            def compute_log_lik(Y, Y_std, shifted_locs, C_Y):
                ## Re-scale by Y_std, as is commonly done to compare on UCI benchmarks
                Y = Y_std * Y
                m_Y = Y_std * shifted_locs
                C_Y = (Y_std * torch.sqrt(C_Y)) ** 2

                log_p_y = batched_log_Gaussian(Y, m_Y, C_Y, diagonal=True, cov_is_inverse=False)  # (S_quad,Dy,S_MC,MB)

                return log_p_y

            # Note that the estimator is the same for input dependent and Bayesian. Just need to expand (or not) this dimension.
            S_MC_NNet = 1 if not self.fully_bayesian else S_MC_NNet

            S_quad = self.quad_points
            G_mat = self.G_matrix

            # noise retrieve and reshape
            C_Y = torch.exp(log_var_noise).expand(-1, MB).view(Dy, 1, MB, 1).repeat((S_quad, 1, S_MC_NNet, 1, 1))  # (S_quad,Dy,S_MC_NNet,MB,1). Add an extra trailing dimension
                                                                                                                   # so that we can compute the likelihood with batched_log_Gaussian
            # observation reshape
            Y = Y.t().view(1, Dy, 1, MB, 1).repeat((S_quad, 1, S_MC_NNet, 1, 1))  # (S_quad,Dy,S_MC_NNet,MB,1)

            # Y_std reshape
            Y_std = Y_std.view(1, Dy, 1, 1, 1).repeat(S_quad, 1, S_MC_NNet, MB, 1)  # (S_quad,Dy,S_MC_NNet,MB,1)

            # This operation could be done by repeating X and computing mean_q_f as in the DGP, but it is not necessary to do extra computation here as X is constant: just repeat.
            mean_q_f, cov_q_f = mean_q_f.unsqueeze(dim=1), cov_q_f.unsqueeze(dim=1)  # Remove last dimension so that we can warp. We add it back later for batched_log_lik
            mean_q_f = mean_q_f.repeat(1, S_MC_NNet, 1)  # (Dy,S_MC_NNet,MB)
            cov_q_f = cov_q_f.repeat(1, S_MC_NNet, 1)

            ## =================================== ##
            ## === Compute test log likelihood === ##
            shifted_locs, weights = get_quad_weights_shifted_locations(mean_q_f, cov_q_f)

            ## Warp quadrature points
            # expand X to perform MC dropout over the NNet parameters
            X_run = X_run.unsqueeze(dim=1).repeat(1, S_MC_NNet, 1, 1)  # Just add one extra dimension. No need to repeat for S_quad as pytorch broadcasts automatically.
                                                                       # It is important to repeat over S_MC_NNet: this way each forward through X computes a different
                                                                       # MC sample of the flow parameters. Otherwise pytorch would broadcast S_MC_NNet as well and we would
                                                                       # only be using one sample from the posterior over W.

            for idx, fl in enumerate(G_mat):
                shifted_locs[:, idx, :, :] = fl(shifted_locs[:, idx, :, :], X_run[idx])  # (S_quad,Dy,S_MC_NNet,MB)

            shifted_locs = shifted_locs.view(S_quad, Dy, S_MC_NNet, MB, 1)  # shape (S_quad,Dy,S,MB,1)

            log_p_y = compute_log_lik(Y, Y_std, shifted_locs, C_Y)

            if self.fully_bayesian:
                # The only difference between the Bayesian case and the rest is here, where we perform a double integration.
                # Reduce with a double logsumexp operation. Check the estimator here: @TODO: add link once we release the github repo
                reduce_lse = torch.log(weights) + log_p_y
                log_p_y = torch.logsumexp(torch.logsumexp(reduce_lse, dim=0) - 0.5 * torch.log(cg.pi), dim=1).sum(1) - MB * numpy.log(S_MC_NNet)
            else:
                # Note that we just need to remove the extra dimension we added to reuse the same code
                log_p_y = log_p_y.squeeze(dim=2)
                weights = weights.squeeze(dim=2)

                ## Reduce log ws + log_p_y_s using the logsumexp trick. Also reduce MB and add the constant
                reduce_lse = torch.log(weights) + log_p_y
                log_p_y = (torch.logsumexp(reduce_lse, dim=0)).sum(-1) - 0.5 * MB * torch.log(cg.pi)

        ## ===================
        ## == with linear mean
        elif isinstance(self.likelihood, GaussianLinearMean):
            ## ================================================== ##
            ## === Compute moments of predictive distribution === ##
            m_Y, K_Y, mean_q_f, cov_q_f = self.predictive_distribution(X_run, diagonal=True)

            ## =================================== ##
            ## === Compute test log likelihood === ##
            # Re-scale by Y_std
            Y = Y.t()  # (Dy,MB)
            Y_std = Y_std.view(self.out_dim, 1)  # (Dy,1)

            log_p_y = batched_log_Gaussian(obs=Y_std * Y, mean=Y_std * m_Y, cov=(Y_std * torch.sqrt(K_Y)) ** 2, diagonal=True, cov_is_inverse=False)

            predictive_params = None
            if return_moments:
                predictive_params = [m_Y, K_Y]

        ## ============================================================== ##
        ## ============ BERNOULLI/CATEGORICAL LIKELIHOOD ================ ##
        elif isinstance(self.likelihood, MulticlassCategorical) or isinstance(self.likelihood, Bernoulli):
            # As we can't do exact integration here (whether we warp or not), the procedure is very similar to GP classification.
            # The only difference is for binary classification with the Gauss CDF link function.
            m_Y, _, mean_q_f, cov_q_f = self.predictive_distribution(X_run, diagonal=True, S_MC_NNet=S_MC_NNet)

            check = torch.logical_not(torch.isfinite(m_Y)).float()
            assert check.sum() == 0.0, "Got saturated probabilities"

            if isinstance(self.likelihood, Bernoulli):
                # Arrange the vector as if it came from the MulticlassCategorical likelihood so that this is transparent to the trainer
                m_Y = m_Y.squeeze()
                neg_m_Y = 1.0 - m_Y  # compute the probability of class 0
                m_Y = torch.stack((neg_m_Y, m_Y), dim=1)

            _, _, _, log_p_y = compute_calibration_measures(m_Y.float(), Y, apply_softmax=False, bins=15)

            log_p_y = -1 * ((log_p_y * MB).sum())  # compute_calibration_measures returns log_p_y.mean(), hence we undo that by multiplying by MB and then summing up

            predictive_params = None
            if return_moments:
                predictive_params = [m_Y]

        else:
            raise ValueError("Unsupported likelihood [{}] for class [{}]".format(type(self.likelihood), type(self)))

        self.train()  # set parameters for train mode: batch normalization, dropout, etc.

        return log_p_y, predictive_params
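# Standalone 1-D sketch (not the class above) of the Gauss-Hermite reduction used in the
# Gaussian branch: with nodes x_i and weights w_i,
#     log p(y) ~= logsumexp_i[ log w_i + log N(y | sqrt(2*var_f)*x_i + mu_f, sigma2) ] - 0.5*log(pi),
# which here is cross-checked against the closed form N(y | mu_f, var_f + sigma2) available
# when the mean is linear (no warping).
import numpy as np
import torch

mu_f, var_f = torch.tensor(0.1, dtype=torch.float64), torch.tensor(0.5, dtype=torch.float64)   # q(f) moments at one input
y, sigma2 = torch.tensor(0.3, dtype=torch.float64), torch.tensor(0.2, dtype=torch.float64)     # observation and noise variance

x, w = np.polynomial.hermite.hermgauss(20)
locs = torch.sqrt(2.0 * var_f) * torch.from_numpy(x) + mu_f
log_normal = -0.5 * (np.log(2 * np.pi) + torch.log(sigma2)) - 0.5 * (y - locs) ** 2 / sigma2
log_p_y = torch.logsumexp(torch.log(torch.from_numpy(w)) + log_normal, dim=0) - 0.5 * np.log(np.pi)

closed = -0.5 * (np.log(2 * np.pi) + torch.log(var_f + sigma2)) - 0.5 * (y - mu_f) ** 2 / (var_f + sigma2)
assert torch.allclose(log_p_y, closed, atol=1e-5)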
def cube(self, tensor: torch.Tensor):
    return tensor.mul(tensor.mul(tensor))
def label_smoothing(y: torch.Tensor, alpha: float) -> torch.Tensor:
    return y.float() * (1 - alpha) + 0.5 * alpha
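# Quick check of the smoothing above (binary targets assumed): with alpha = 0.1,
# hard labels 0/1 become 0.05/0.95, i.e. alpha/2 of the mass is moved to the other class.
import torch

y = torch.tensor([0, 1, 1, 0])
print(label_smoothing(y, alpha=0.1))  # tensor([0.0500, 0.9500, 0.9500, 0.0500])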
def valid_loss_compute(self, x: torch.Tensor, y: torch.Tensor, norm: int):
    x = self.model.generator(x)
    loss = self.labelsmooth(x.contiguous().view(-1, x.size(-1)), y.contiguous().view(-1)) / norm
    return loss.item() * norm
def forward(self, x: torch.Tensor):
    mean = x.mean(-1, keepdim=True)
    std = x.std(-1, keepdim=True)
    return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
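# Context for the forward above: a minimal layer-normalization module sketch in which
# a_2 and b_2 are assumed to be a learnable per-feature gain and bias (names taken from
# the snippet; the surrounding class is not shown in the source).
import torch
import torch.nn as nn

class LayerNorm(nn.Module):
    def __init__(self, features: int, eps: float = 1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))   # gain
        self.b_2 = nn.Parameter(torch.zeros(features))  # bias
        self.eps = eps

    def forward(self, x: torch.Tensor):
        # normalize over the last (feature) dimension, then re-scale and shift
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2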
def compressed_allreduce(self, buffer_m: torch.Tensor, worker_error, server_error, local_rank):

    # all_start_time = time.time()
    original_shape = buffer_m.size()
    if len(original_shape) > 1:
        buffer_m = torch.flatten(buffer_m)
    original_size = buffer_m.numel()
    worker_error_size = worker_error.numel()
    cupy.cuda.Device(local_rank).use()

    if original_size != worker_error_size:
        empty_tensor = torch.zeros(worker_error_size - original_size, device=buffer_m.device)
        buffer_m = torch.cat([buffer_m, empty_tensor])

    buffer_m.add_(worker_error)
    worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m))
    worker_error.set_(buffer_m - worker_scale * buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0))

    if self.bool_not_supported:
        cupy_sign_list_packed = self.compression_backend.compress_by_chunk(
            self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool().to(dtype=torch.uint8)),
            self.size)
    else:
        cupy_sign_list_packed = self.compression_backend.compress_by_chunk(
            self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool()),
            self.size)
    cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale)

    cupy_recvbuf_sign = cupy.zeros(
        [self.size, cupy_sign_list_packed[self.rank].size],
        dtype=cupy_sign_list_packed[0].dtype)
    # cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype)

    sign_list_packed = [
        self.compression_backend.cupy2torch(cupy_sign_list_packed[idx]) for idx in range(self.size)
    ]

    # worker_scale = self.compression_backend.cupy2torch(cupy_worker_scale)
    recvbuf_sign = self.compression_backend.cupy2torch(cupy_recvbuf_sign)
    # recvbuf_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale)
    recvbuf_scale = [
        torch.zeros(1, dtype=worker_scale.dtype, device=torch.device(local_rank)) for i in range(self.size)
    ]

    # communication phase 1
    # gather_start = time.time()
    # Alltoall for sign
    dist.all_to_all_single(recvbuf_sign, torch.stack(sign_list_packed), group=self.world_group)
    # Allgather for scale
    dist.all_gather(recvbuf_scale, worker_scale, group=self.world_group)

    # gather_end = time.time()

    # cupy_sign_list_packed, sign_list_packed, cupy_worker_scale, worker_scale = None, None, None, None
    cupy_sign_list_packed = None

    cupy_recvbuf_sign = self.compression_backend.torch2cupy(recvbuf_sign)
    # cupy_recvbuf_scale = self.compression_backend.torch2cupy(torch.stack(recvbuf_scale))

    compensated_server_m = self.compression_backend.cupy2torch(
        (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape(self.size, -1)).float().add_(-0.5).mul_(2.0).mul_(
            torch.stack(recvbuf_scale).mul_(1 / self.size)).sum(0)
    compensated_server_m.add_(server_error)
    server_scale = torch.norm(compensated_server_m) / np.sqrt(compensated_server_m.numel())
    server_error.set_(compensated_server_m -
                      server_scale * compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0))

    # cupy_server_scale = self.compression_backend.torch2cupy(server_scale)

    if self.bool_not_supported:
        cupy_server_sign_packed = self.compression_backend.compress_by_chunk(
            self.compression_backend.torch2cupy(
                compensated_server_m.sign_().add_(1).bool().to(dtype=torch.uint8)),
            1)
    else:
        cupy_server_sign_packed = self.compression_backend.compress_by_chunk(
            self.compression_backend.torch2cupy(compensated_server_m.sign_().add_(1).bool()),
            1)
    compensated_server_m = None

    cupy_recvbuf_sign_server = cupy.zeros(
        [self.size, cupy_server_sign_packed[0].size],
        dtype=cupy_recvbuf_sign.dtype)
    # cupy_recvbuf_sign, recvbuf_sign = None, None
    cupy_recvbuf_sign = None

    server_sign_packed = [
        self.compression_backend.cupy2torch(cupy_server_sign_packed[0])
    ]
    recvbuf_sign_server = [
        self.compression_backend.cupy2torch(cupy_recvbuf_sign_server[idx]) for idx in range(self.size)
    ]

    # server_scale = self.compression_backend.cupy2torch(cupy_server_scale)
    cupy_recvbuf_scale_server = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype)
    # cupy_recvbuf_scale, recvbuf_scale = None, None

    recvbuf_scale_server = [
        self.compression_backend.cupy2torch(cupy_recvbuf_scale_server[idx]) for idx in range(self.size)
    ]

    # Communication Phase 2
    dist.all_gather(recvbuf_sign_server, server_sign_packed[0], group=self.world_group)
    dist.all_gather(recvbuf_scale_server, server_scale, group=self.world_group)

    cupy_server_sign_packed = None

    # need to convert from a tensor list to a single tensor
    # dist.all_gather only provides a tensor list as the recv/output buffer
    recvbuf_sign_server = torch.stack(recvbuf_sign_server)

    cupy_recvbuf_sign_server = self.compression_backend.torch2cupy(recvbuf_sign_server)

    buffer_m.data.copy_(
        self.compression_backend.cupy2torch(
            (cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape(self.size, -1)).float().add_(-0.5).mul_(2.0).mul_(
                self.compression_backend.cupy2torch(cupy_recvbuf_scale_server)).flatten().data)

    if original_size != worker_error_size:
        buffer_m = buffer_m[0:original_size]
    if len(original_shape) > 1:
        buffer_m = buffer_m.reshape(original_shape)

    return buffer_m
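# Isolated sketch of the sign-compression-with-error-feedback step used above (no distributed
# communication, no bit packing): each tensor is reduced to one sign bit per element plus a
# single L2-based scale, and the quantization residual is carried into the next call via the
# error buffer. The function name and the simplified sign handling are illustrative only.
import torch

def one_bit_compress(x: torch.Tensor, error: torch.Tensor) -> torch.Tensor:
    compensated = x + error                                     # error feedback from the previous step
    scale = torch.norm(compensated) / compensated.numel() ** 0.5
    quantized = scale * compensated.sign()                      # 1 bit per element + one scale
    error.copy_(compensated - quantized)                        # residual carried to the next step
    return quantized

x = torch.randn(8)
err = torch.zeros(8)
print(one_bit_compress(x, err))
print(err)  # what was lost in this round, re-injected next round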
def form_real(number: torch.Tensor) -> str:
    return "{:.3f}".format(number.item())
def clipping_weight(self, weight: torch.Tensor) -> torch.Tensor:
    with torch.set_grad_enabled(False):
        weight = torch.clamp(weight, -1, 1)
        weight.requires_grad = True
    return weight
def conditional_to_cuda(x: torch.Tensor, non_blocking: bool = False) -> torch.Tensor:
    # print(x.cuda.__doc__)
    # return x.cuda(non_blocking=non_blocking) if args.gpu_count > 0 else x
    return x.cuda() if args.gpu_count > 0 else x
def deterministic(weight: torch.Tensor) -> torch.Tensor:
    return weight.sign()
def _hook_properties(hook_self, tensor_type):
    """Overloads tensor_type properties

    Parameters:
        tensor_type: Torch tensor type to overload
    """

    @property
    def child(self):
        try:
            try:
                assert self._child is not None
                return self._child
            except (AttributeError, AssertionError):
                self._child = _LocalTensor(child=self, parent=self, torch_type=type(self).__name__)
                return self._child
        except TypeError:
            # for some reason, hasattr(self, '_child') returns a TypeError saying
            # "TypeError: 'NoneType' object is not callable". It's supposed to only
            # return False and I can't get to the bottom of it. So, for now, I'm
            # going to break a personal rule and use try/catch for logic, but
            # this is merely supposed to evaluate whether self has ._child as an
            # attribute. Note this only seems to happen when self is a
            # torch.autograd.Variable
            self._child = _LocalTensor(child=self, parent=self, torch_type=type(self).__name__)
            return self._child

    @child.setter
    def child(self, value):
        self._child = value

    tensor_type.child = child

    @property
    def id(self):
        return self.child.id

    # TODO: this should not be possible, but it should also be possible to define a FloatTensor
    # with a specific id. This is in theory possible, but it doesn't seem to work in practice
    @id.setter
    def id(self, new_id):
        self.child.id = new_id
        return self

    tensor_type.id = id

    @property
    def location(self):
        return self.child.location

    tensor_type.location = location

    @property
    def id_at_location(self):
        return self.child.id_at_location

    tensor_type.id_at_location = id_at_location

    @property
    def owner(self):
        return self.child.owner

    tensor_type.owner = owner
def flatten(t: torch.Tensor):
    t = t.reshape((1, -1))
    t = t.squeeze()
    print('after flatten:', t)
    return t
def flatten_conv(conv_input: tensor) -> tensor:
    batch_size = list(conv_input.size())[0]
    return conv_input.view(batch_size, -1)
def forward(self, input: torch.Tensor) -> torch.Tensor:
    assert len(input.size()) == 3, 'The number of dimensions of input tensor must be 3!'
    # reflect padding to match lengths of in/out
    input = F.pad(input, (1, 0), 'reflect')
    return F.conv1d(input, self.flipped_filter)
def to_np(t: torch.Tensor) -> np.ndarray:
    """Converts a PyTorch tensor to a Numpy array."""
    return t.cpu().detach().numpy()
def forward(self, input: torch.Tensor) -> torch.Tensor:
    x, _ = self.rnn(input.transpose(1, 2))
    return x.transpose(1, 2)
def forward(self, x: torch.Tensor):
    x = x + self.pe[:, :x.size(1)]
    return self.dropout(x)
def mc_tensor(input: torch.Tensor, k: int):
    mc_shape = [input.shape[0], k] + list(input.shape[1:])
    return input.unsqueeze(1).expand(mc_shape).flatten(0, 1)
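# Shape check for mc_tensor above: each of the B samples is repeated k times along the
# batch axis, giving (B*k, ...), with sample i occupying rows i*k .. (i+1)*k - 1.
import torch

x = torch.arange(6.0).reshape(3, 2)  # (B=3, D=2)
out = mc_tensor(x, k=4)
print(out.shape)   # torch.Size([12, 2])
print(out[:4])     # four copies of x[0]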
def acc_mean_from_confusion_matrix(self, cm: torch.Tensor):
    cm[0] = cm[0:4].sum(dim=0)
    cm[1] = cm[4]
    cm = cm / torch.sum(cm, dim=1, keepdim=True)
    print(cm.diag())
    return cm.diag().mean()
def step(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, lm_labels: torch.Tensor):
    """
    One optimization step: forward of student AND teacher, backward on the loss (for gradient accumulation),
    and possibly a parameter update (depending on the gradient accumulation).

    Input:
    ------
    input_ids: `torch.tensor(bs, seq_length)` - The token ids.
    attention_mask: `torch.tensor(bs, seq_length)` - The attention mask for self attention.
    lm_labels: `torch.tensor(bs, seq_length)` - The language modeling labels (mlm labels for MLM and clm labels for CLM).
    """
    if self.mlm:
        s_logits, s_hidden_states = self.student(
            input_ids=input_ids, attention_mask=attention_mask
        )  # (bs, seq_length, voc_size)
        with torch.no_grad():
            t_logits, t_hidden_states = self.teacher(
                input_ids=input_ids, attention_mask=attention_mask
            )  # (bs, seq_length, voc_size)
    else:
        s_logits, _, s_hidden_states = self.student(
            input_ids=input_ids, attention_mask=None
        )  # (bs, seq_length, voc_size)
        with torch.no_grad():
            t_logits, _, t_hidden_states = self.teacher(
                input_ids=input_ids, attention_mask=None
            )  # (bs, seq_length, voc_size)
    assert s_logits.size() == t_logits.size()

    # https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
    # https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
    if self.params.restrict_ce_to_mask:
        mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits)  # (bs, seq_length, voc_size)
    else:
        mask = attention_mask.unsqueeze(-1).expand_as(s_logits)  # (bs, seq_length, voc_size)
    s_logits_slct = torch.masked_select(s_logits, mask)  # (bs * seq_length * voc_size) modulo the 1s in mask
    s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1))  # (bs * seq_length, voc_size) modulo the 1s in mask
    t_logits_slct = torch.masked_select(t_logits, mask)  # (bs * seq_length * voc_size) modulo the 1s in mask
    t_logits_slct = t_logits_slct.view(-1, s_logits.size(-1))  # (bs * seq_length, voc_size) modulo the 1s in mask
    assert t_logits_slct.size() == s_logits_slct.size()

    loss_ce = (
        self.ce_loss_fct(
            F.log_softmax(s_logits_slct / self.temperature, dim=-1),
            F.softmax(t_logits_slct / self.temperature, dim=-1),
        )
        * (self.temperature) ** 2
    )
    loss = self.alpha_ce * loss_ce
    if self.alpha_mlm > 0.0:
        loss_mlm = self.lm_loss_fct(s_logits.view(-1, s_logits.size(-1)), lm_labels.view(-1))
        loss += self.alpha_mlm * loss_mlm
    if self.alpha_clm > 0.0:
        shift_logits = s_logits[..., :-1, :].contiguous()
        shift_labels = lm_labels[..., 1:].contiguous()
        loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        loss += self.alpha_clm * loss_clm
    if self.alpha_mse > 0.0:
        loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct) / s_logits_slct.size(
            0
        )  # Reproducing batchmean reduction
        loss += self.alpha_mse * loss_mse
    if self.alpha_cos > 0.0:
        s_hidden_states = s_hidden_states[-1]  # (bs, seq_length, dim)
        t_hidden_states = t_hidden_states[-1]  # (bs, seq_length, dim)
        mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states)  # (bs, seq_length, dim)
        assert s_hidden_states.size() == t_hidden_states.size()
        dim = s_hidden_states.size(-1)

        s_hidden_states_slct = torch.masked_select(s_hidden_states, mask)  # (bs * seq_length * dim)
        s_hidden_states_slct = s_hidden_states_slct.view(-1, dim)  # (bs * seq_length, dim)
        t_hidden_states_slct = torch.masked_select(t_hidden_states, mask)  # (bs * seq_length * dim)
        t_hidden_states_slct = t_hidden_states_slct.view(-1, dim)  # (bs * seq_length, dim)

        target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1)  # (bs * seq_length,)
        loss_cos = self.cosine_loss_fct(s_hidden_states_slct, t_hidden_states_slct, target)
        loss += self.alpha_cos * loss_cos

    self.total_loss_epoch += loss.item()
    self.last_loss = loss.item()
    self.last_loss_ce = loss_ce.item()
    if self.alpha_mlm > 0.0:
        self.last_loss_mlm = loss_mlm.item()
    if self.alpha_clm > 0.0:
        self.last_loss_clm = loss_clm.item()
    if self.alpha_mse > 0.0:
        self.last_loss_mse = loss_mse.item()
    if self.alpha_cos > 0.0:
        self.last_loss_cos = loss_cos.item()

    self.optimize(loss)

    self.n_sequences_epoch += input_ids.size(0)
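# Standalone illustration of the soft-target distillation term used in step above:
# KL divergence between temperature-softened student and teacher distributions, scaled by
# T**2 to keep gradient magnitudes comparable across temperatures. Shapes and values are
# made up; only the selected (unmasked) token rows would be passed in practice.
import torch
import torch.nn.functional as F

T = 2.0
ce_loss_fct = torch.nn.KLDivLoss(reduction="batchmean")

s_logits = torch.randn(4, 10)  # (n_selected_tokens, voc_size) - student
t_logits = torch.randn(4, 10)  # (n_selected_tokens, voc_size) - teacher

loss_ce = ce_loss_fct(
    F.log_softmax(s_logits / T, dim=-1),
    F.softmax(t_logits / T, dim=-1),
) * T ** 2
print(loss_ce)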
def to_numpy(torch_tensor: torch.Tensor) -> np.ndarray:
    return torch_tensor.cpu().detach().numpy()
def __init__(
    self, params: dict, dataset: LmSeqsDataset, token_probs: torch.Tensor, student: nn.Module, teacher: nn.Module
):
    logger.info("Initializing Distiller")
    self.params = params
    self.dump_path = params.dump_path
    self.multi_gpu = params.multi_gpu
    self.fp16 = params.fp16

    self.student = student
    self.teacher = teacher

    self.student_config = student.config
    self.vocab_size = student.config.vocab_size

    if params.n_gpu <= 1:
        sampler = RandomSampler(dataset)
    else:
        sampler = DistributedSampler(dataset)

    if params.group_by_size:
        groups = create_lengths_groups(lengths=dataset.lengths, k=params.max_model_input_size)
        sampler = GroupedBatchSampler(sampler=sampler, group_ids=groups, batch_size=params.batch_size)
    else:
        sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False)

    self.dataloader = DataLoader(dataset=dataset, batch_sampler=sampler, collate_fn=dataset.batch_sequences)

    self.temperature = params.temperature
    assert self.temperature > 0.0

    self.alpha_ce = params.alpha_ce
    self.alpha_mlm = params.alpha_mlm
    self.alpha_clm = params.alpha_clm
    self.alpha_mse = params.alpha_mse
    self.alpha_cos = params.alpha_cos

    self.mlm = params.mlm
    if self.mlm:
        logger.info("Using MLM loss for LM step.")
        self.mlm_mask_prop = params.mlm_mask_prop
        assert 0.0 <= self.mlm_mask_prop <= 1.0
        assert params.word_mask + params.word_keep + params.word_rand == 1.0
        self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand])
        self.pred_probs = self.pred_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else self.pred_probs
        self.token_probs = token_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else token_probs
        if self.fp16:
            self.pred_probs = self.pred_probs.half()
            self.token_probs = self.token_probs.half()
    else:
        logger.info("Using CLM loss for LM step.")

    self.epoch = 0
    self.n_iter = 0
    self.n_total_iter = 0
    self.n_sequences_epoch = 0
    self.total_loss_epoch = 0
    self.last_loss = 0
    self.last_loss_ce = 0
    self.last_loss_mlm = 0
    self.last_loss_clm = 0
    if self.alpha_mse > 0.0:
        self.last_loss_mse = 0
    if self.alpha_cos > 0.0:
        self.last_loss_cos = 0
    self.last_log = 0

    self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
    self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
    if self.alpha_mse > 0.0:
        self.mse_loss_fct = nn.MSELoss(reduction="sum")
    if self.alpha_cos > 0.0:
        self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction="mean")

    logger.info("--- Initializing model optimizer")
    assert params.gradient_accumulation_steps >= 1
    self.num_steps_epoch = len(self.dataloader)
    num_train_optimization_steps = (
        int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
    )

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad
            ],
            "weight_decay": params.weight_decay,
        },
        {
            "params": [
                p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad
            ],
            "weight_decay": 0.0,
        },
    ]
    logger.info(
        "------ Number of trainable parameters (student): %i"
        % sum([p.numel() for p in self.student.parameters() if p.requires_grad])
    )
    logger.info("------ Number of parameters (student): %i" % sum([p.numel() for p in self.student.parameters()]))
    self.optimizer = AdamW(
        optimizer_grouped_parameters, lr=params.learning_rate, eps=params.adam_epsilon, betas=(0.9, 0.98)
    )

    warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
    self.scheduler = get_linear_schedule_with_warmup(
        self.optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps
    )

    if self.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        logger.info(f"Using fp16 training: {self.params.fp16_opt_level} level")
        self.student, self.optimizer = amp.initialize(
            self.student, self.optimizer, opt_level=self.params.fp16_opt_level
        )
        self.teacher = self.teacher.half()

    if self.multi_gpu:
        if self.fp16:
            from apex.parallel import DistributedDataParallel

            logger.info("Using apex.parallel.DistributedDataParallel for distributed training.")
            self.student = DistributedDataParallel(self.student)
        else:
            from torch.nn.parallel import DistributedDataParallel

            logger.info("Using nn.parallel.DistributedDataParallel for distributed training.")
            self.student = DistributedDataParallel(
                self.student,
                device_ids=[params.local_rank],
                output_device=params.local_rank,
                find_unused_parameters=True,
            )

    self.is_master = params.is_master
    if self.is_master:
        logger.info("--- Initializing Tensorboard")
        self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, "log", "train"))
        self.tensorboard.add_text(tag="config/training", text_string=str(self.params), global_step=0)
        self.tensorboard.add_text(tag="config/student", text_string=str(self.student_config), global_step=0)
def forward(self, input: torch.Tensor):
    return input.view(input.size(0), -1)
def compute_avg(inp: List, nums: torch.Tensor) -> float:
    "Computes a weighted average given a list of torch tensors and the numbers corresponding to them"
    return (torch.stack(inp) * nums).sum() / nums.sum()
def predictive_distribution(self, X: torch.Tensor, diagonal: bool = True, S_MC_NNet: int = None) -> list:
    """ Computes moments 1 and 2 of the predictive distribution.
        It also returns the posterior mean and covariance over the latent functions.

        p(Y*|X*) = \int p(y*|G(f*)) q(f*,f|u) q(u) df* df du

        # Homoscedastic Gaussian observation model p(y|f)
        # GP variational distribution q(f)
        # G() represents a non-linear transformation

        Args:
            `X` (torch.Tensor)  :->: input locations where the predictive is computed. Can have shape (MB,Dx) or (Dy,MB,Dx)
            `diagonal` (bool)   :->: if true, samples are drawn independently. For the moment it is always true.
            `S_MC_NNet` (int)   :->: Number of samples from the dropout distribution if fully_bayesian is true

        Returns:
            `m1` (torch.Tensor)       :->: Predictive mean with shape (Dy,MB)
            `m2` (torch.Tensor)       :->: Predictive variance with shape (Dy,MB). Is None for classification likelihoods
            `mean_q_f` (torch.Tensor) :->: Posterior mean of q(f) with shape (Dy,MB,1) [same shape as returned by marginal_variational_qf]
            `cov_q_f` (torch.Tensor)  :->: Posterior covariance of q(f) with shape (Dy,MB,1) [same shape as returned by marginal_variational_qf]
    """
    if len(X.shape) == 2:
        X = X.repeat(self.out_dim, 1, 1)
    assert len(X.shape) == 3, "Bad input specification"

    self.eval()  # set parameters for eval mode: batch normalization, dropout, etc.
    if self.fully_bayesian:  # activate dropout if required
        is_dropout = enable_eval_dropout(self.modules())
        assert is_dropout, "You set the model to work fully Bayesian but there are no dropout layers in your model. This is asserted because otherwise the code would run in non-Bayesian operating mode"
        assert S_MC_NNet is not None, "The default parameter S_MC_NNet is not provided and set to default None, which is invalid for self.be_bayesian"

    with torch.no_grad():

        if not diagonal:
            raise NotImplementedError("This function does not support returning the predictive distribution with correlations")

        mean_q_f, cov_q_f = self.marginal_variational_qf_parameters(X, diagonal=True, is_duvenaud=False, init_Z=None)

        if self.fully_bayesian:
            # @NOTE: this has not been refactored like the rest of the code. Note that we could do both the point estimate and the Bayesian
            #        case by setting S_MC_NNet = 1 for the non-Bayesian case.
            # If it is fully Bayesian then proceed as in the DGP with flows in the output layer.
            Dy, MB, _ = mean_q_f.shape

            # 1. Reshape mean_q_f and cov_q_f to shape (Dy,S_MC_NNet*MB)
            mean_q_f_run = mean_q_f.view(Dy, MB).repeat(1, S_MC_NNet)
            cov_q_f_run = cov_q_f.view(Dy, MB).repeat(1, S_MC_NNet)

            # 2. Compute moments of each of the Monte Carlo samples. Just provide X extended to S_MC so that each forward computes a Monte Carlo sample.
            X = X.repeat(1, S_MC_NNet, 1)  # expand to shape (Dy,S*MB,Dx)
            MOMENTS = self.likelihood.marginal_moments(mean_q_f_run, cov_q_f_run, self.G_matrix, X)  # get the moments of each of the S*MB samples

            # 3. Compute the moments of the full predictive distribution, e.g. the mixture of Gaussians for the Gaussian likelihood
            if isinstance(self.likelihood, GaussianNonLinearMean):
                m_Y, C_Y = MOMENTS
                m_Y = m_Y.view(Dy, S_MC_NNet, MB)
                C_Y = C_Y.view(Dy, S_MC_NNet, MB)

                m1 = m_Y.mean(1)
                m2 = (C_Y + m_Y ** 2).mean(1) - m1 ** 2  # var = 1/S * sum[K_Y + mu_Y^2] - [1/S sum mu_Y]^2

            elif isinstance(self.likelihood, MulticlassCategorical) or isinstance(self.likelihood, Bernoulli):
                m1, m2 = MOMENTS, None
                m1 = m1.view(S_MC_NNet, MB, Dy)
                m1 = m1.mean(0)  # reduce the Monte Carlo dimension

            else:
                raise ValueError("Unsupported likelihood [{}] for class [{}]".format(type(self.likelihood), type(self)))

        else:
            # diagonal is always True here. It is only used by the sparse_MF_GP with SVI. diagonal = False is used by the standard GP's marginal likelihood.
            MOMENTS = self.likelihood.marginal_moments(mean_q_f.squeeze(dim=2), cov_q_f.squeeze(dim=2), diagonal=True, flow=self.G_matrix, X=X)

            if isinstance(self.likelihood, GaussianLinearMean) or isinstance(self.likelihood, GaussianNonLinearMean):
                m1, m2 = MOMENTS
            elif isinstance(self.likelihood, MulticlassCategorical) or isinstance(self.likelihood, Bernoulli):
                m1, m2 = MOMENTS, None

    self.train()  # switch back to train mode.

    return m1, m2, mean_q_f, cov_q_f
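# Standalone check of the mixture-moment matching used in step 3 of predictive_distribution:
# for a uniform mixture of S Gaussians with means m_s and variances v_s, the matched moments are
#     m1 = mean(m_s),   m2 = mean(v_s + m_s**2) - m1**2.
import torch

m_s = torch.tensor([0.0, 1.0, 2.0])  # per-sample predictive means
v_s = torch.tensor([0.5, 0.5, 0.5])  # per-sample predictive variances

m1 = m_s.mean()
m2 = (v_s + m_s ** 2).mean() - m1 ** 2
print(m1, m2)  # tensor(1.) tensor(1.1667): within-component variance 0.5 + between-mean variance 2/3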
def label_to_onehot(labels: torch.Tensor, n_classes: int) -> torch.Tensor:
    # pass num_classes explicitly so the width does not depend on the labels present in the batch
    onehot = torch.nn.functional.one_hot(labels.type(torch.int64).flatten(), num_classes=n_classes).type(torch.float64)
    return onehot
def cov(x: torch.Tensor) -> torch.Tensor:
    x = x - torch.mean(x, dim=1, keepdim=True)
    return (1. / (x.size(1) - 1)) * x.matmul(x.t())
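# Sanity check for cov above: rows are assumed to be variables and columns observations,
# matching np.cov's default convention (unbiased, divide by N-1).
import numpy as np
import torch

x = torch.randn(3, 100, dtype=torch.float64)
assert np.allclose(cov(x).numpy(), np.cov(x.numpy()))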
def tensor2list(cudatensor: torch.Tensor) -> List:
    return list(cudatensor.cpu().numpy())