def analytical_q_cholesky(data, decoder_weights, decoder_bias, beta=1):
    ''' Compute the mean and covariance of the analytical q_beta with cholesky decomposition '''
    W = decoder_weights
    b = decoder_bias
    WT = torch.t(W)
    I_x = torch.eye(data.size()[-1])
    I_z = torch.eye(W.size()[-1])
    data = torch.t(data)
    b = torch.unsqueeze(b, dim=1)
    subcore = torch.matmul(W, WT) + (1. / beta) * I_x
    L = torch.cholesky(subcore, upper=False)
    LT_XT = torch.trtrs(W, L, upper=False)[0]
    X_T = torch.trtrs(LT_XT, torch.t(L), upper=True)[0]
    core = torch.t(X_T)
    mu = torch.matmul(core, (data - b))
    cov = I_z - torch.matmul(core, W)
    return mu, cov

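# A hypothetical sanity check of the routine above (not part of the original
# code): the two triangular solves against L, where L L^T = W W^T + I/beta,
# should reproduce the direct computation core = W^T (W W^T + I/beta)^{-1}.
# Shapes and data below are made up; the file uses the old torch.trtrs /
# torch.potrf API, later renamed to torch.triangular_solve / torch.cholesky.
import torch

torch.manual_seed(0)
W = torch.randn(5, 3)      # decoder weights, x_dim x z_dim
b = torch.randn(5)         # decoder bias
x = torch.randn(4, 5)      # small batch of data points
mu, cov = analytical_q_cholesky(x, W, b, beta=2.0)

# reference computation with an explicit inverse
subcore = W @ W.t() + 0.5 * torch.eye(5)
core_ref = torch.inverse(subcore).matmul(W).t()
mu_ref = core_ref @ (x.t() - b.unsqueeze(1))
cov_ref = torch.eye(3) - core_ref @ W
print(torch.allclose(mu, mu_ref, atol=1e-4), torch.allclose(cov, cov_ref, atol=1e-4))
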
def GP_fit_posterior(self, mjd, mag, err, P, end=1.0, jitter=1e-5):
    """
    Expect a time series sampled at *mjd* instants (t) with values *mag* (m)
    and associated errors *err* (s).

    Returns the posterior mean and factorized covariance matrix of the GP
    sampled at instants x:

    \[ \mu = K_{xt} (K_{tt} + \sigma^2 I + \text{diag}(s^2))^{-1} m, \]
    \[ \Sigma = K_{xx} - K_{xt} (K_{tt} + \sigma^2 I + \text{diag}(s^2))^{-1} K_{xt}^T, \]

    where $\sigma^2$ is the variance of the noise.
    """
    # Kernel matrices
    non_trainable_kparams = {'period': 1.0}
    reg_points = torch.unsqueeze(torch.linspace(start=0.0, end=1.0 - 1.0/self.n_pivots, steps=self.n_pivots), dim=0)
    mjd = torch.unsqueeze(mjd, dim=0)
    Ktt = self.stationary_kernel(mjd, mjd, non_trainable_kparams)
    Ktt += torch.diag(err**2) + torch.exp(self.gp_logvar_likelihood)*torch.eye(mjd.shape[1])
    Ktx = self.stationary_kernel(mjd, reg_points, non_trainable_kparams)
    Kxx = self.stationary_kernel(reg_points, reg_points, non_trainable_kparams)
    Ltt = torch.potrf(Ktt, upper=False)  # Cholesky lower triangular
    # posterior mean and covariance
    tmp1 = torch.t(torch.trtrs(Ktx, Ltt, upper=False)[0])
    tmp2 = torch.trtrs(torch.unsqueeze(mag, dim=1), Ltt, upper=False)[0]
    mu = torch.t(torch.mm(tmp1, tmp2))
    S = Kxx - torch.mm(tmp1, torch.t(tmp1))  # + torch.exp(self.gp_logvar_likelihood)*torch.eye(self.n_pivots)
    R = torch.potrf(S + jitter*torch.eye(self.n_pivots), upper=True)
    return mu, R, reg_points

def update_precond_kron(Ql, Qr, dX, dG, step=0.01):
    """
    Update Kronecker product preconditioner P = kron_prod(Qr^T*Qr, Ql^T*Ql)
    Ql: (left side) Cholesky factor of preconditioner with positive diagonal entries
    Qr: (right side) Cholesky factor of preconditioner with positive diagonal entries
    dX: perturbation of (matrix) parameter
    dG: perturbation of (matrix) gradient
    step: normalized step size in range [0, 1]
    """
    max_l = torch.max(torch.abs(Ql))
    max_r = torch.max(torch.abs(Qr))
    rho = torch.sqrt(max_l / max_r)
    Ql = Ql / rho
    Qr = rho * Qr
    A = Ql.mm(dG.mm(Qr.t()))
    Bt = torch.trtrs(torch.trtrs(dX.t(), Qr.t(), upper=False)[0].t(), Ql.t(), upper=False)[0]
    grad1 = torch.triu(A.mm(A.t()) - Bt.mm(Bt.t()))
    grad2 = torch.triu(A.t().mm(A) - Bt.t().mm(Bt))
    step1 = step / (torch.max(torch.abs(grad1)) + _tiny)
    step2 = step / (torch.max(torch.abs(grad2)) + _tiny)
    return Ql - step1 * grad1.mm(Ql), Qr - step2 * grad2.mm(Qr)

def _ssor_preconditioner(self, A, v):
    DL = A.tril()
    D = A.diag()
    upper_part = (1 / D).expand_as(DL).mul(DL.t())
    Minv_times_v = torch.trtrs(torch.trtrs(v, DL, upper=False)[0], upper_part)[0].squeeze()
    return Minv_times_v

def backward(self, grad_output):
    """
    Giles, 2008, An extended collection of matrix derivative results for
    forward and reverse mode algorithmic differentiation, sec 2.3.1.

    Args:
        grad_output (sequence of (Tensor, Variable or None)): Gradients of the
            objective function w.r.t. each element of matrix X (output of
            :func:`forward`)

    Returns:
        Tensor: gradient w.r.t. A (triangular matrix)
    """
    grad_A = grad_B = None
    A, X = self.saved_tensors
    if self.needs_input_grad[0]:
        grad_A = -torch.trtrs(grad_output, A, self.upper, transpose=True, unitriangular=False)[0].mm(X.t())
        if self.upper:
            grad_A = torch.triu(grad_A)
        else:
            grad_A = torch.tril(grad_A)
    if self.needs_input_grad[1]:
        grad_B = torch.trtrs(grad_output, A, self.upper, transpose=True, unitriangular=False)[0]
    return grad_A, grad_B

def _set_pars(self, jitter):
    Ky = self.kernel(self.X, self.X)
    inds = list(range(len(Ky)))
    Ky[[inds], [inds]] += self.sn + jitter
    self.L = torch.potrf(Ky, upper=False)
    self.alpha = torch.trtrs(self.y, self.L, upper=False)[0]
    self.alpha = torch.trtrs(self.alpha, self.L.t(), upper=True)[0]

def fit(self, Y, K_dd, eps=1e-6):
    self.L = torch.potrf(K_dd + eps * torch.eye(K_dd.shape[0]), upper=False)
    self.alpha = torch.trtrs(torch.trtrs(Y, self.L, upper=False)[0], self.L.t(), upper=True)[0]
    return self

def _kl_lowrankmultivariatenormal_multivariatenormal(p, q):
    if p.event_shape != q.event_shape:
        raise ValueError("KL-divergence between two (Low Rank) Multivariate Normals with "
                         "different event shapes cannot be computed")
    term1 = (2 * q._unbroadcasted_scale_tril.diagonal(dim1=-2, dim2=-1).log().sum(-1) -
             _batch_lowrank_logdet(p._unbroadcasted_cov_factor,
                                   p._unbroadcasted_cov_diag,
                                   p._capacitance_tril))
    term3 = _batch_mahalanobis(q._unbroadcasted_scale_tril, (q.loc - p.loc))
    # Expands term2 according to
    # inv(qcov) @ pcov = inv(q_tril @ q_tril.T) @ (pW @ pW.T + pD)
    combined_batch_shape = torch._C._infer_size(q._unbroadcasted_scale_tril.shape[:-2],
                                                p._unbroadcasted_cov_factor.shape[:-2])
    n = p.event_shape[0]
    q_scale_tril = q._unbroadcasted_scale_tril.expand(combined_batch_shape + (n, n))
    p_cov_factor = p._unbroadcasted_cov_factor.expand(combined_batch_shape +
                                                      (n, p.cov_factor.size(-1)))
    p_cov_diag = (torch.diag_embed(p._unbroadcasted_cov_diag.sqrt())
                  .expand(combined_batch_shape + (n, n)))
    term21 = _batch_trace_XXT(torch.trtrs(p_cov_factor, q_scale_tril, upper=False)[0])
    term22 = _batch_trace_XXT(torch.trtrs(p_cov_diag, q_scale_tril, upper=False)[0])
    term2 = term21 + term22
    return 0.5 * (term1 + term2 + term3 - p.event_shape[0])

def get_LL(self, train_inputs, train_outputs):
    # form the necessary kernel matrices
    Knn_diag = torch.exp(self.logsigmaf2)
    train_inputs_col = torch.unsqueeze(train_inputs.transpose(0, 1), 2)
    pseudoin_row = torch.unsqueeze(self.pseudoin.transpose(0, 1), 1)
    pseudoin_col = torch.unsqueeze(self.pseudoin.transpose(0, 1), 2)
    length_factors = (1. / (2. * torch.exp(self.logl2))).reshape(self.input_dim, 1, 1)
    Knm = self.get_K(train_inputs_col, pseudoin_row, length_factors)
    Kmn = Knm.transpose(0, 1)
    Kmm = self.get_K(pseudoin_col, pseudoin_row, length_factors)
    mKmm = torch.max(Kmm)
    L_Kmm = torch.potrf(Kmm + 1e-15*mKmm*torch.eye(self.num_pseudoin, device=device, dtype=torch.double), upper=False)
    L_slash_Kmn = torch.trtrs(Kmn, L_Kmm, upper=False)[0]
    Lambda_diag = torch.zeros(train_outputs.shape[0], 1, device=device, dtype=torch.double)
    diag_values = Lambda_diag + torch.exp(self.logsigman2)
    Qmm = Kmm + Kmn.matmul(Knm/diag_values)
    mQmm = torch.max(Qmm)
    L_Qmm = torch.potrf(Qmm + 1e-15*mQmm*torch.eye(self.num_pseudoin, device=device, dtype=torch.double), upper=False)  # 1e-4 for boston
    L_slash_y = torch.trtrs(Kmn.matmul(train_outputs.view(-1, 1)/diag_values), L_Qmm, upper=False)[0]
    fit = ((train_outputs.view(-1, 1))**2/diag_values).sum() - (L_slash_y**2).sum()
    log_det = 2.*torch.sum(torch.log(torch.diag(L_Qmm))) -\
        2.*torch.sum(torch.log(torch.diag(L_Kmm))) +\
        torch.sum(torch.log(diag_values))
    # get log marginal likelihood
    LL = -0.5*train_outputs.shape[0]*torch.log(2.*np.pi*torch.ones(1, device=device, dtype=torch.double)) - 0.5*log_det - 0.5*fit
    return LL

def _ssor_preconditioner(self, lhs_mat, mat):
    if lhs_mat.ndimension() == 2:
        DL = lhs_mat.tril()
        D = lhs_mat.diag()
        upper_part = (1 / D).expand_as(DL).mul(DL.t())
        Minv_times_mat = torch.trtrs(torch.trtrs(mat, DL, upper=False)[0], upper_part)[0]
    elif lhs_mat.ndimension() == 3:
        if mat.size(0) == 1 and lhs_mat.size(0) != 1:
            mat = mat.expand(*([lhs_mat.size(0)] + list(mat.size())[1:]))
        Minv_times_mat = mat.new(*mat.size())
        for i in range(lhs_mat.size(0)):
            DL = lhs_mat[i].tril()
            D = lhs_mat[i].diag()
            upper_part = (1 / D).expand_as(DL).mul(DL.t())
            Minv_times_mat[i].copy_(torch.trtrs(torch.trtrs(mat[i], DL, upper=False)[0], upper_part)[0])
    else:
        raise RuntimeError('Invalid number of dimensions')
    return Minv_times_mat

def weight_inverse(self):
    """Cost:
        inverse = O(D^3)
    where:
        D = num of features
    """
    lower, upper = self._create_lower_upper()
    identity = torch.eye(self.features, self.features)
    lower_inverse, _ = torch.trtrs(identity, lower, upper=False, unitriangular=True)
    weight_inverse, _ = torch.trtrs(lower_inverse, upper, upper=True, unitriangular=False)
    return weight_inverse

def solve_linear_system(self, A, b, K, delta=0.0):
    I = torch.eye(self.N, self.N, device=self.device)
    A_t_K = torch.mm(A.t(), K)
    A_t_A = torch.mm(A_t_K, A)
    LAM = A_t_A + delta * I
    R = torch.mm(A_t_K, b)
    # Solve using cholesky
    l = torch.cholesky(LAM, upper=False)
    z = torch.trtrs(R, l, transpose=False, upper=False)[0]
    dtheta = torch.trtrs(z, l, transpose=True, upper=False)[0]
    return dtheta.view(self.num_traj_states, self.state_dim)

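# A hypothetical standalone version of the Cholesky-based solve above: the
# regularised normal equations (A^T K A + delta*I) theta = A^T K b are solved
# with one forward and one backward triangular solve instead of an explicit
# inverse. All tensors here are made-up test data (A is taken square for
# simplicity so the delta*I term has matching shape).
import torch

torch.manual_seed(0)
N = 6
A = torch.randn(N, N)
K = torch.eye(N)      # weighting matrix, identity for simplicity
b = torch.randn(N, 1)
delta = 1e-3

LAM = A.t() @ K @ A + delta * torch.eye(N)
R = A.t() @ K @ b
l = torch.cholesky(LAM, upper=False)
z = torch.trtrs(R, l, transpose=False, upper=False)[0]      # solves l z = R
theta = torch.trtrs(z, l, transpose=True, upper=False)[0]   # solves l^T theta = z
print(torch.allclose(LAM @ theta, R, atol=1e-4))
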
def backward(self, grad_output):
    """
    Reference: eqn (10) & (9) in Iain Murray, 2016, arXiv:1602.07527
    """
    L, = self.saved_tensors
    P = torch.tril(torch.mm(L.t(), grad_output))
    P -= P.diag().diag() / 2.
    S = torch.trtrs(torch.trtrs(P + P.t(), L.t(), upper=True)[0].t(), L.t(), upper=True)[0]
    return S / 2.

def Fv(self):
    # All the necessary arguments are instance variables, so no need to pass them
    no_train = self.Xn.shape[0]
    no_inducing = self.Xm.shape[0]
    # Calculate kernel matrices
    Kmm = self.get_K(self.Xm, self.Xm)
    Knm = self.get_K(self.Xn, self.Xm)
    Kmn = Knm.transpose(0, 1)
    # calculate the 'inner matrix' and Cholesky decompose
    M = Kmm + torch.exp(-self.logsigman2) * Kmn @ Knm
    L = torch.potrf(M + torch.mean(torch.diag(M)) * self.jitter_factor *
                    torch.eye(no_inducing).type(torch.double), upper=False)
    # Compute first term (log of Gaussian pdf)
    # constant term
    constant_term = -(no_train / 2) * torch.log(torch.Tensor([2 * np.pi])).type(torch.double)
    # quadratic term - Yn should be a column vector
    LslashKmny = torch.trtrs(Kmn @ self.Yn, L, upper=False)[0]
    quadratic_term = -0.5 * (torch.exp(-self.logsigman2) * self.Yn.transpose(0, 1) @ self.Yn -
                             torch.exp(-2 * self.logsigman2) * LslashKmny.transpose(0, 1) @ LslashKmny)
    # logdet term
    # Cholesky decompose the Kmm
    L_inducing = torch.potrf(Kmm + torch.mean(torch.diag(Kmm)) * self.jitter_factor *
                             torch.eye(no_inducing).type(torch.double), upper=False)
    logdet_term = -0.5 * (2 * torch.sum(torch.log(torch.diag(L))) -
                          2 * torch.sum(torch.log(torch.diag(L_inducing))) +
                          no_train * self.logsigman2)
    log_gaussian_term = constant_term + logdet_term + quadratic_term
    # Compute the second term (trace regulariser)
    B = torch.trtrs(Kmn, L_inducing, upper=False)[0]
    trace_term = -0.5 * torch.exp(-self.logsigman2) * (no_train * torch.exp(self.logsigmaf2) - torch.sum(B**2))
    return log_gaussian_term + trace_term

def cho_solve_AXB(a, cho_C, b):
    """Compute tensor $a C^{-1} b$ from cholesky factor.
    ----
    Parameters:
        a: (M x N) tensor
        cho_C: (N x N) lower triangular tensor where cho_C cho_C^T = C
        b: (N x L) tensor
    ----
    Outputs:
        a C^{-1} b
    """
    left, _ = torch.trtrs(a.t(), cho_C, upper=False)
    right, _ = torch.trtrs(b, cho_C, upper=False)
    return torch.mm(left.t(), right)

def linearised_laplace_direct_cholesky(self, L, test_inputs, optimizer=None):
    # do a numerically stable version of the algorithm
    if self.learned_noise_var == True:
        noise_variance = self.get_noise_var(self.noise_var_param)
    else:
        noise_variance = self.noise_variance
    # get list of test gradients
    no_test = test_inputs.size()[0]
    G = torch.cuda.DoubleTensor(self.no_params, no_test).fill_(0)
    for i in range(no_test):
        # clear gradients
        optimizer.zero_grad()
        # get gradient of output wrt single test input
        x = test_inputs[i]
        x = torch.unsqueeze(x, 0)  # this may not be necessary if x is multidimensional
        gradient = self.get_gradient(x)
        # store in G
        G[:, i] = gradient
    # backsolve for all columns
    LslashG = torch.trtrs(G, L, upper=False)[0]
    # batch dot product
    predictive_var = noise_variance + torch.sum(LslashG**2, 0)
    return predictive_var.detach()

def _kl_multivariatenormal_lowrankmultivariatenormal(p, q):
    if p.event_shape != q.event_shape:
        raise ValueError("KL-divergence between two (Low Rank) Multivariate Normals with "
                         "different event shapes cannot be computed")
    term1 = (_batch_lowrank_logdet(q._unbroadcasted_cov_factor,
                                   q._unbroadcasted_cov_diag,
                                   q._capacitance_tril) -
             2 * p._unbroadcasted_scale_tril.diagonal(dim1=-2, dim2=-1).log().sum(-1))
    term3 = _batch_lowrank_mahalanobis(q._unbroadcasted_cov_factor,
                                       q._unbroadcasted_cov_diag,
                                       q.loc - p.loc,
                                       q._capacitance_tril)
    # Expands term2 according to
    # inv(qcov) @ pcov = [inv(qD) - inv(qD) @ qW @ inv(qC) @ qW.T @ inv(qD)] @ p_tril @ p_tril.T
    #                  = [inv(qD) - A.T @ A] @ p_tril @ p_tril.T
    qWt_qDinv = (q._unbroadcasted_cov_factor.transpose(-1, -2) /
                 q._unbroadcasted_cov_diag.unsqueeze(-2))
    A = torch.trtrs(qWt_qDinv, q._capacitance_tril, upper=False)[0]
    term21 = _batch_trace_XXT(p._unbroadcasted_scale_tril *
                              q._unbroadcasted_cov_diag.rsqrt().unsqueeze(-1))
    term22 = _batch_trace_XXT(A.matmul(p._unbroadcasted_scale_tril))
    term2 = term21 - term22
    return 0.5 * (term1 + term2 + term3 - p.event_shape[0])

def statdist(v):
    v = v.pop()
    with timing("statdist"):
        n = v.shape[0]
        nanguardt(v, "t_generator")
        with timing("statdist::lu_factor_torch"):
            _, v = torch.gesv(torch.ones([n, 1], dtype=torch.float32).to(device), v)
            del _
        nanguardt(v, "lu")
        # The last row contains 0's only.
        with timing("statdist::slices"):
            left = v[:-1, :-1]
            right = -v[:-1, -1]
            del v
        # Solves system `left * x = right`. Assumes that `left` is
        # upper-triangular (ignores lower triangle).
        # print("left shape:", left.shape, "right shape:", right.shape)
        # with timing("statdist::pytorch readback 1"):
        with timing("pytorch version"):
            res, _ = torch.trtrs(right.reshape(right.shape + (-1,)), left)
            del _
        nanguardt(res, "res")
        res = res.view(-1)
        res = torch.cat((res, torch.ones(1, device=device)))
        return nanguardt((n / torch.sum(res)), "n/sum") * res

def predict(self, K_xd, K_xx):
    y = K_xd @ self.alpha
    v = torch.trtrs(K_xd.t(), self.L, upper=False)[0]
    var = (K_xx - v.t() @ v).diagonal()
    return y, var

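# A small end-to-end sketch of the fit/predict pattern above (exact GP
# regression via a Cholesky factor plus triangular solves), on made-up toy
# data. `rbf_kernel`, the data and the lengthscale are illustrative
# assumptions, not part of the original class.
import math
import torch

def rbf_kernel(a, b, lengthscale=0.5):
    # squared-exponential kernel between two sets of 1-D inputs
    d2 = (a.view(-1, 1) - b.view(1, -1)) ** 2
    return torch.exp(-0.5 * d2 / lengthscale ** 2)

torch.manual_seed(0)
x_train = torch.linspace(0, 1, 8).double()
y_train = torch.sin(2 * math.pi * x_train).view(-1, 1)
x_test = torch.linspace(0, 1, 5).double()

K_dd = rbf_kernel(x_train, x_train)
K_xd = rbf_kernel(x_test, x_train)
K_xx = rbf_kernel(x_test, x_test)

# same computation as fit(): Cholesky of (K_dd + eps*I), then two triangular solves
L = torch.potrf(K_dd + 1e-6 * torch.eye(8, dtype=torch.double), upper=False)
alpha = torch.trtrs(torch.trtrs(y_train, L, upper=False)[0], L.t(), upper=True)[0]

# same computation as predict(): posterior mean and pointwise variance
mean = K_xd @ alpha
v = torch.trtrs(K_xd.t(), L, upper=False)[0]
var = (K_xx - v.t() @ v).diagonal()
print(mean.view(-1), var)
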
def backward(ctx, grad_output):
    jitter = 1.0e-8  # do i really need this?
    z, epsilon, L = ctx.saved_tensors
    dim = L.shape[0]
    g = grad_output
    loc_grad = sum_leftmost(grad_output, -1)
    identity = eye_like(g, dim)
    R_inv = torch.trtrs(identity, L.t(), transpose=False, upper=True)[0]
    z_ja = z.unsqueeze(-1)
    g_R_inv = torch.matmul(g, R_inv).unsqueeze(-2)
    epsilon_jb = epsilon.unsqueeze(-2)
    g_ja = g.unsqueeze(-1)
    diff_L_ab = 0.5 * sum_leftmost(g_ja * epsilon_jb + g_R_inv * z_ja, -2)
    Sigma_inv = torch.mm(R_inv, R_inv.t())
    V, D, _ = torch.svd(Sigma_inv + jitter)
    D_outer = D.unsqueeze(-1) + D.unsqueeze(0)
    expand_tuple = tuple([-1] * (z.dim() - 1) + [dim, dim])
    z_tilde = identity * torch.matmul(z, V).unsqueeze(-1).expand(*expand_tuple)
    g_tilde = identity * torch.matmul(g, V).unsqueeze(-1).expand(*expand_tuple)
    Y = sum_leftmost(torch.matmul(z_tilde, torch.matmul(1.0 / D_outer, g_tilde)), -2)
    Y = torch.mm(V, torch.mm(Y, V.t()))
    Y = Y + Y.t()
    Tr_xi_Y = torch.mm(torch.mm(Sigma_inv, Y), R_inv) - torch.mm(Y, torch.mm(Sigma_inv, R_inv))
    diff_L_ab += 0.5 * Tr_xi_Y
    L_grad = torch.tril(diff_L_ab)
    return loc_grad, L_grad, None

def chol_problem(final_state, target_cost_func):
    """
    Gets a quadratic model on the target cost as
        h(y) = h(x) + <∇h(x), y-x> + 0.5 <y-x, H (y-x)>
             = const + 0.5 || L^T (y-x) + L^(-1) ∇h(x) ||^2
    where, denoting ∇^2 h(x) = U D U^T, we define H = U |D| U^T (absolute values
    of the eigenvalues are taken) and H = LL^T (Cholesky decomposition of H)
    :param final_state: (torch.Tensor) last state on which the approximation of the cost is taken (x above)
    :param target_cost_func: (torch.nn.Module) Cost on the last state
    :return: chol_hess: (torch.Tensor) L above
             chol_hess_inv_grad: (torch.Tensor) L^(-1) ∇h(x) above
    """
    aux = deepcopy(final_state.data)
    aux.requires_grad = True
    target_cost = target_cost_func(aux)
    grad = torch.autograd.grad(target_cost, aux, create_graph=True)[0]
    hess = auto_jac(grad, aux)
    (lam, U) = torch.eig(hess, eigenvectors=True)
    lam = torch.abs(lam[:, 0])
    hess = torch.mm(U, torch.mm(torch.diag(lam), U.t()))
    chol_hess = torch.cholesky(hess, upper=False)
    chol_hess_inv_grad = torch.trtrs(grad, chol_hess, upper=False)[0].view(-1)
    return chol_hess, chol_hess_inv_grad

def train_locator_model(self, model_XTX, model_XTY, model=None):
    if model is None:
        model = torch.potrs(model_XTY, torch.potrf(model_XTX))
    else:
        for _ in range(30):
            model, _ = torch.trtrs(model_XTY - torch.mm(torch.triu(model_XTX, diagonal=1), model),
                                   torch.tril(model_XTX, diagonal=0), upper=False)
    return model

def joint_posterior_predictive(self, test_inputs, noise=False):
    # assume test_inputs is a numpy array
    # get the mean and covariance of the joint Gaussian posterior over the test outputs
    test_inputs = torch.Tensor(test_inputs).type(torch.double)
    no_test = test_inputs.shape[0]
    no_inducing = self.Xm.shape[0]
    # Calculate kernel matrices
    Kxx = self.get_K(test_inputs, test_inputs)
    Kmx = self.get_K(self.Xm, test_inputs)
    Kmm = self.get_K(self.Xm, self.Xm)
    Knm = self.get_K(self.Xn, self.Xm)
    Kmn = Knm.transpose(0, 1)
    # calculate the 'inner matrix' and Cholesky decompose
    M = Kmm + torch.exp(-self.logsigman2) * Kmn @ Knm
    L = torch.potrf(M + torch.mean(torch.diag(M)) * self.jitter_factor *
                    torch.eye(no_inducing).type(torch.double), upper=False)
    # Cholesky decompose the Kmm
    L_inducing = torch.potrf(Kmm + torch.mean(torch.diag(Kmm)) * self.jitter_factor *
                             torch.eye(no_inducing).type(torch.double), upper=False)
    # backsolve
    LindslashKmx = torch.trtrs(Kmx, L_inducing, upper=False)[0]
    LslashKmx = torch.trtrs(Kmx, L, upper=False)[0]
    cov = Kxx - LindslashKmx.transpose(0, 1) @ LindslashKmx + LslashKmx.transpose(0, 1) @ LslashKmx
    if noise == True:
        # add observation noise
        cov = cov + torch.exp(self.logsigman2) * torch.eye(no_test).type(torch.double)
    # calculate the predictive mean by backsolving
    LslashKmny = torch.trtrs(Kmn @ self.Yn, L, upper=False)[0]
    mean = torch.exp(-self.logsigman2) * LslashKmx.transpose(0, 1) @ LslashKmny
    return mean, cov

def joint_posterior_predictive(self, train_inputs, train_outputs, test_inputs, noise=False):
    # form the necessary kernel matrices
    Knn_diag = torch.exp(self.logsigmaf2)
    train_inputs_col = torch.unsqueeze(train_inputs.transpose(0, 1), 2)
    pseudoin_row = torch.unsqueeze(self.pseudoin.transpose(0, 1), 1)
    pseudoin_col = torch.unsqueeze(self.pseudoin.transpose(0, 1), 2)
    length_factors = (1. / (2. * torch.exp(self.logl2))).reshape(self.input_dim, 1, 1)
    Knm = self.get_K(train_inputs_col, pseudoin_row, length_factors)
    Kmn = Knm.transpose(0, 1)
    Kmm = self.get_K(pseudoin_col, pseudoin_row, length_factors)
    mKmm = torch.max(Kmm)
    L_Kmm = torch.potrf(Kmm + 1e-15*mKmm*torch.eye(self.num_pseudoin, device=device, dtype=torch.double), upper=False)
    L_slash_Kmn = torch.trtrs(Kmn, L_Kmm, upper=False)[0]
    Lambda_diag = torch.zeros(train_outputs.shape[0], 1, device=device, dtype=torch.double)
    diag_values = Lambda_diag + torch.exp(self.logsigman2)
    Qmm = Kmm + Kmn.matmul(Knm/diag_values)
    mQmm = torch.max(Qmm)
    L_Qmm = torch.potrf(Qmm + 1e-15*mQmm*torch.eye(self.num_pseudoin, device=device, dtype=torch.double), upper=False)  # 1e-4 for boston
    L_slash_y = torch.trtrs(Kmn.matmul(train_outputs.view(-1, 1)/diag_values), L_Qmm, upper=False)[0]
    no_test = test_inputs.size()[0]
    # get cross covariance between test and train points, Ktn
    test_inputs_col = torch.unsqueeze(test_inputs.transpose(0, 1), 2)
    test_inputs_row = torch.unsqueeze(test_inputs.transpose(0, 1), 1)
    Ktm = self.get_K(test_inputs_col, pseudoin_row, length_factors)
    Kmt = Ktm.transpose(0, 1)
    # get predictive mean
    LQslashKnt = torch.trtrs(Kmt, L_Qmm, upper=False)[0]
    LKslashKnt = torch.trtrs(Kmt, L_Kmm, upper=False)[0]
    pred_mean = LQslashKnt.transpose(0, 1) @ L_slash_y
    # get predictive covariance
    Ktt = self.get_K(test_inputs_col, test_inputs_row, length_factors)
    if noise:
        # add observation noise
        pred_cov = Ktt + torch.exp(self.logsigman2) * torch.eye(no_test, device=device, dtype=torch.double) +\
            LQslashKnt.transpose(0, 1) @ LQslashKnt -\
            LKslashKnt.transpose(0, 1) @ LKslashKnt
    else:
        pred_cov = Ktt + LQslashKnt.transpose(0, 1) @ LQslashKnt -\
            LKslashKnt.transpose(0, 1) @ LKslashKnt + 1e-6 * torch.eye(no_test, device=device, dtype=torch.double)
    return pred_mean, pred_cov

def _batch_trtrs_lower(bb, bA):
    """
    Applies `torch.trtrs` for batches of matrices. `bb` and `bA` should have
    the same batch shape.
    """
    flat_b = bb.reshape((-1,) + bb.shape[-2:])
    flat_A = bA.reshape((-1,) + bA.shape[-2:])
    flat_X = torch.stack([torch.trtrs(b, A, upper=False)[0] for b, A in zip(flat_b, flat_A)])
    return flat_X.reshape(bb.shape)

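# A quick illustrative check of the batch helper above: the stacked result
# should match a direct torch.trtrs solve on any single batch element.
# Shapes and values below are arbitrary test data.
import torch

torch.manual_seed(0)
bA = torch.stack([torch.randn(4, 4).tril() + 2 * torch.eye(4) for _ in range(6)]).reshape(2, 3, 4, 4)
bb = torch.randn(2, 3, 4, 2)
bX = _batch_trtrs_lower(bb, bA)
ref = torch.trtrs(bb[1, 2], bA[1, 2], upper=False)[0]
print(torch.allclose(bX[1, 2], ref))
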
def cho_solve(cho_C, b):
    """Compute tensor $C^{-1} b$ from cholesky factor.
    ----
    Parameters:
        cho_C: (N x N) lower triangular tensor where cho_C cho_C^T = C
        b: (N x L) tensor
    ----
    Outputs:
        C^{-1} b
    ----
    Note: Gradient of potrs is not supported yet in pytorch 0.4.1
        # return torch.potrs(b, cho_C, upper=False)
    """
    tmp, _ = torch.trtrs(b, cho_C, upper=False)
    tmp2, _ = torch.trtrs(tmp, cho_C.t(), upper=True)
    return tmp2

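# Minimal illustrative use of cho_solve above: with C = cho_C cho_C^T, the two
# triangular solves should agree with multiplying by the explicit inverse.
# The matrices are random test data.
import torch

torch.manual_seed(0)
M = torch.randn(5, 5)
C = M @ M.t() + 5 * torch.eye(5)      # a positive-definite matrix
cho_C = torch.potrf(C, upper=False)   # lower Cholesky factor
b = torch.randn(5, 2)
print(torch.allclose(cho_solve(cho_C, b), torch.inverse(C) @ b, atol=1e-5))
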
def compute_sorted_nearest_neighbors(global_arrays, xbar):
    # x1 - x2
    ##### Xsub = (compute_expected_responses_globals.X_tensor - xbar).transpose(0, 1)
    # x_tensor = globals()[global_arrays_class_name].X_tensor
    Xsub = (global_arrays.X_tensor - xbar).transpose(0, 1)
    ##### lower_diag = compute_expected_responses_globals.hyperparameters_object.upper_diag.clone().transpose(0, 1)
    hyperparameters_obj = global_arrays.hyperparameters_object
    lower_diag = hyperparameters_obj.upper_diag.clone().transpose(0, 1)
    Z = torch.trtrs(Xsub, lower_diag, upper=False)[0].transpose(0, 1)
    # mahalanobis_distances: mahalanobis distance of each X vector to xbar
    # L2 norm -- note: the square root is not necessary, since we only care about sorting,
    # not the actual number, but since speed of this call is not a bottleneck, this is fine
    mahalanobis_distances = torch.norm(Z, p=2, dim=1)
    # SORT the data based on distance to xbar
    mahalanobis_distances_sorted, sorted_indices = torch.sort(mahalanobis_distances, 0)
    ##### Y_sorted = compute_expected_responses_globals.Y_tensor[sorted_indices]
    Y_sorted = global_arrays.Y_tensor[sorted_indices]  # now local scope
    # adjust k to avoid eliminating equi-distant points
    ##### k = compute_expected_responses_globals.hyperparameters_object.k
    k = global_arrays.hyperparameters_object.k
    inclusive_distance_boundary = mahalanobis_distances_sorted[k - 1] + 1e-7
    # cast to int because of weird incompatibility between zero-dim tensor and int in pytorch 0.4.0
    inclusive_k = int(np.searchsorted(mahalanobis_distances_sorted, inclusive_distance_boundary, side='right'))
    # get indices of nearest neighbors
    inclusive_k_nearest_neighbor_indices = np.arange(inclusive_k)
    '''
    # This is a template for applying a non-naive smoother to weigh nearest-neighbor points.
    # The code was functionally tested and can be used as is, except the for loop, for which a form of
    # broadcasting should be found, if possible for the smoother in question (for speed)
    weights = mahalanobis_distances[inclusive_k_nearest_neighbor_indices] / hyperparameters_object.bandwidth
    for i in inclusive_k_nearest_neighbor_indices:
        weights[i] = hyperparameters_object.smoother(weights[i])
    weights = weights / sum(weights)
    # unsqueeze(1)/view(inclusive_k, 1) for broadcast multiplication to work as expected;
    # double() needed because the smoother tested (naive) spits out a float (1.0) value instead of a double.
    # double() likely won't be needed if/when this actually needs to be used,
    # since the smoother will likely divide/multiply an existing double() and therefore return a double
    weights = weights.view(inclusive_k, 1).double()
    # E[Y|xbar], i.e. weighted/"smoothed" average of the Y[i,:] corresponding to the nearest inclusive_k X
    expected_response[j] = torch.sum(weights * Y_tensor[inclusive_k_nearest_neighbor_indices].view(
        inclusive_k, num_assets), 0)
    '''
    return Y_sorted[inclusive_k_nearest_neighbor_indices]

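# Hypothetical check of the whitening step used above: with S = L L^T (L lower
# triangular), solving L Z' = (X - xbar)^T and taking the norm of each solved
# column gives the Mahalanobis distance, since ||z_i||^2 = (x_i - xbar)^T S^{-1} (x_i - xbar).
# S, X and xbar below are made-up test data.
import torch

torch.manual_seed(0)
X = torch.randn(10, 3).double()
xbar = torch.zeros(3, dtype=torch.double)
S = torch.tensor([[2.0, 0.3, 0.0],
                  [0.3, 1.0, 0.1],
                  [0.0, 0.1, 0.5]], dtype=torch.double)
lower_diag = torch.potrf(S, upper=False)

Xsub = (X - xbar).transpose(0, 1)
Z = torch.trtrs(Xsub, lower_diag, upper=False)[0].transpose(0, 1)
mahalanobis_distances = torch.norm(Z, p=2, dim=1)

# reference via the explicit inverse of S
reference = ((X - xbar) @ torch.inverse(S) * (X - xbar)).sum(dim=1).sqrt()
print(torch.allclose(mahalanobis_distances, reference, atol=1e-10))
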
def loss(self, X, y, jitter, val=None):
    K = self.kernel(X, X)
    inds = list(range(len(K)))
    K[[inds], [inds]] += self.sn + jitter
    L = torch.potrf(K, upper=False)
    alpha = torch.trtrs(y, L, upper=False)[0]
    alpha = torch.trtrs(alpha, L.t(), upper=True)[0]
    loss = self.loss_func(L, alpha, y)
    if self.prior is not None:
        loss -= self.prior(self.sn)
    if val is not None:
        X_val, y_val = val
        k_star = self.kernel(X, X_val)
        mu = k_star.t() @ alpha
        mse = nn.MSELoss()(mu, y_val)
        return loss, mse
    else:
        return loss

def get_LL(self, train_inputs, train_outputs):
    # form the kernel matrix Knn
    Knn = self.get_K(train_inputs, train_inputs)
    # cholesky decompose (lower triangular decomposition)
    L = torch.potrf(Knn + torch.exp(self.logsigman2) * torch.eye(train_inputs.shape[0]) +
                    self.jitter * torch.eye(Knn.size()[0]), upper=False)
    Lslashy = torch.trtrs(train_outputs, L, upper=False)[0]
    alpha = torch.trtrs(Lslashy, torch.transpose(L, 0, 1))[0]
    # get log marginal likelihood
    LL = -0.5 * torch.dot(train_outputs, torch.squeeze(alpha)) - \
        torch.sum(torch.log(torch.diag(L))) - \
        (train_inputs.shape[0] / 2) * torch.log(torch.Tensor([2 * 3.1415926536]))
    return LL

def posterior(self, Xtest):
    # assumes stationary kernel
    with torch.no_grad():
        if isinstance(self.y, Sparse1DTensor):
            ix = self.get_batch.ix
            Ks = self.kernel(self.X[ix], Xtest)
            L = self.get_cov(ix)
            alpha = torch.trtrs(Ks, L, upper=False)[0]
            fmean = torch.matmul(torch.t(alpha),
                                 torch.trtrs(self.y.v.squeeze(), L, upper=False)[0])
        else:
            Ks = self.kernel(self.X, Xtest)
            L = self.get_cov()
            alpha = torch.trtrs(Ks, L, upper=False)[0]
            fmean = torch.matmul(torch.t(alpha),
                                 torch.trtrs(self.y, L, upper=False)[0])
        fvar = transform_forward(self.kernel.variance) - (alpha**2).sum(0)
        return fmean, fvar.reshape((-1, 1))