def pre_factor_kkt(Q, G, A):
    """Perform all one-time factorizations and cache relevant matrix products."""
    nineq, nz, neq, _ = get_sizes(G, A)

    # S = [ A Q^{-1} A^T   A Q^{-1} G^T          ]
    #     [ G Q^{-1} A^T   G Q^{-1} G^T + D^{-1} ]

    U_Q = torch.potrf(Q)

    # Partial Cholesky factorization of the S matrix.
    U_S = torch.zeros(neq + nineq, neq + nineq).type_as(Q)

    G_invQ_GT = torch.mm(G, torch.potrs(G.t(), U_Q))
    R = G_invQ_GT
    if neq > 0:
        invQ_AT = torch.potrs(A.t(), U_Q)
        A_invQ_AT = torch.mm(A, invQ_AT)
        G_invQ_AT = torch.mm(G, invQ_AT)

        # TODO: torch.potrf sometimes says the matrix is not PSD but
        # numpy does? I filed an issue at
        # https://github.com/pytorch/pytorch/issues/199
        try:
            U11 = torch.potrf(A_invQ_AT)
        except RuntimeError:
            U11 = torch.Tensor(np.linalg.cholesky(
                A_invQ_AT.cpu().numpy())).type_as(A_invQ_AT)

        # TODO: torch.trtrs is currently not implemented on the GPU
        # and we are using gesv as a workaround.
        U12 = torch.gesv(G_invQ_AT.t(), U11.t())[0]

        U_S[:neq, :neq] = U11
        U_S[:neq, neq:] = U12
        R -= torch.mm(U12.t(), U12)

    return U_Q, U_S, R

def GP_fit_posterior(self, mjd, mag, err, P, end=1.0, jitter=1e-5):
    r"""
    Expects a time series sampled at instants *mjd* (t) with values *mag* (m)
    and associated errors *err* (s).

    Returns the posterior mean and a factorized covariance matrix of the GP
    evaluated at the regression instants x:

    .. math::

        \mu = K_{xt} \left(K_{tt} + \sigma^2 I + \mathrm{diag}(s^2)\right)^{-1} m,

        \Sigma = K_{xx} - K_{xt} \left(K_{tt} + \sigma^2 I + \mathrm{diag}(s^2)\right)^{-1} K_{xt}^T,

    where :math:`\sigma^2` is the variance of the noise.
    """
    # Kernel matrices
    non_trainable_kparams = {'period': 1.0}
    reg_points = torch.unsqueeze(
        torch.linspace(start=0.0, end=1.0 - 1.0/self.n_pivots, steps=self.n_pivots), dim=0)
    mjd = torch.unsqueeze(mjd, dim=0)
    Ktt = self.stationary_kernel(mjd, mjd, non_trainable_kparams)
    Ktt += torch.diag(err**2) + torch.exp(self.gp_logvar_likelihood)*torch.eye(mjd.shape[1])
    Ktx = self.stationary_kernel(mjd, reg_points, non_trainable_kparams)
    Kxx = self.stationary_kernel(reg_points, reg_points, non_trainable_kparams)
    Ltt = torch.potrf(Ktt, upper=False)  # lower-triangular Cholesky factor
    # Posterior mean and covariance
    tmp1 = torch.t(torch.trtrs(Ktx, Ltt, upper=False)[0])
    tmp2 = torch.trtrs(torch.unsqueeze(mag, dim=1), Ltt, upper=False)[0]
    mu = torch.t(torch.mm(tmp1, tmp2))
    S = Kxx - torch.mm(tmp1, torch.t(tmp1))
    # + torch.exp(self.gp_logvar_likelihood)*torch.eye(self.n_pivots)
    R = torch.potrf(S + jitter*torch.eye(self.n_pivots), upper=True)
    return mu, R, reg_points

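# --- Hedged usage sketch (not part of the original source) ------------------
# Minimal standalone illustration of the Cholesky / triangular-solve pattern
# used by GP_fit_posterior above, assuming an older PyTorch (<= 1.0) where
# torch.potrf and torch.trtrs are still available. The _rbf kernel and the toy
# data are hypothetical stand-ins for the class's stationary_kernel and inputs.
import math
import torch

def _rbf(a, b, lengthscale=0.2):
    # Squared-exponential kernel on 1-D inputs of shapes (n,) and (m,).
    d2 = (a.unsqueeze(1) - b.unsqueeze(0)) ** 2
    return torch.exp(-0.5 * d2 / lengthscale ** 2)

t = torch.linspace(0, 1, 20)              # observation times
m = torch.sin(2 * math.pi * t)            # observed values
s = 0.05 * torch.ones_like(t)             # per-point noise standard deviations
x = torch.linspace(0, 1, 5)               # prediction points

Ktt = _rbf(t, t) + torch.diag(s ** 2) + 1e-6 * torch.eye(len(t))
Ktx = _rbf(t, x)                          # (20, 5) cross-covariance
Ltt = torch.potrf(Ktt, upper=False)       # lower Cholesky factor
tmp1 = torch.trtrs(Ktx, Ltt, upper=False)[0].t()         # K_xt L^{-T}
tmp2 = torch.trtrs(m.unsqueeze(1), Ltt, upper=False)[0]  # L^{-1} m
mu = tmp1 @ tmp2                          # posterior mean = K_xt K_tt^{-1} m
# -----------------------------------------------------------------------------
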
def predict_f(self, Xnew, full_cov=False):
    """
    Compute the mean and variance of the latent function at some new points
    Xnew. For a derivation of the terms in here, see the associated SGPR
    notebook.
    """
    num_inducing = self.Z.size(0)
    err = self.Y - self.mean_function(self.X)
    Kuf = self.kern.K(self.Z.get(), self.X)
    jitter = Variable(torch.eye(num_inducing, out=self.Z.data.new())) * self.jitter_level
    Kuu = self.kern.K(self.Z.get()) + jitter
    Kus = self.kern.K(self.Z.get(), Xnew)
    sigma = self.likelihood.variance.get()**0.5
    L = torch.potrf(Kuu, upper=False)

    A = torch.gesv(Kuf, L)[0] / sigma  # could use a triangular solve here and below
    B = torch.matmul(A, A.t()) + Variable(torch.eye(num_inducing, out=A.data.new()))
    LB = torch.potrf(B, upper=False)
    Aerr = torch.matmul(A, err)
    c = torch.gesv(Aerr, LB)[0] / sigma
    tmp1, _ = torch.gesv(Kus, L)
    tmp2, _ = torch.gesv(tmp1, LB)
    mean = torch.matmul(tmp2.t(), c)
    if full_cov:
        var = self.kern.K(Xnew) + torch.matmul(tmp2.t(), tmp2) - torch.matmul(tmp1.t(), tmp1)
        var = var.unsqueeze(2).expand(-1, -1, self.Y.size(1))
    else:
        var = self.kern.Kdiag(Xnew) + (tmp2**2).sum(0) - (tmp1**2).sum(0)
        var = var.unsqueeze(1).expand(-1, self.Y.size(1))
    return mean + self.mean_function(Xnew), var

def get_LL(self, train_inputs, train_outputs):
    # Form the necessary kernel matrices.
    Knn_diag = torch.exp(self.logsigmaf2)
    train_inputs_col = torch.unsqueeze(train_inputs.transpose(0, 1), 2)
    pseudoin_row = torch.unsqueeze(self.pseudoin.transpose(0, 1), 1)
    pseudoin_col = torch.unsqueeze(self.pseudoin.transpose(0, 1), 2)
    length_factors = (1. / (2. * torch.exp(self.logl2))).reshape(self.input_dim, 1, 1)

    Knm = self.get_K(train_inputs_col, pseudoin_row, length_factors)
    Kmn = Knm.transpose(0, 1)
    Kmm = self.get_K(pseudoin_col, pseudoin_row, length_factors)

    mKmm = torch.max(Kmm)
    L_Kmm = torch.potrf(Kmm + 1e-15*mKmm*torch.eye(self.num_pseudoin, device=device, dtype=torch.double),
                        upper=False)
    L_slash_Kmn = torch.trtrs(Kmn, L_Kmm, upper=False)[0]

    Lambda_diag = torch.zeros(train_outputs.shape[0], 1, device=device, dtype=torch.double)
    diag_values = Lambda_diag + torch.exp(self.logsigman2)

    Qmm = Kmm + Kmn.matmul(Knm/diag_values)
    mQmm = torch.max(Qmm)
    L_Qmm = torch.potrf(Qmm + 1e-15*mQmm*torch.eye(self.num_pseudoin, device=device, dtype=torch.double),
                        upper=False)  # 1e-4 for boston

    L_slash_y = torch.trtrs(Kmn.matmul(train_outputs.view(-1, 1)/diag_values), L_Qmm, upper=False)[0]
    fit = ((train_outputs.view(-1, 1))**2/diag_values).sum() - (L_slash_y**2).sum()
    log_det = 2.*torch.sum(torch.log(torch.diag(L_Qmm))) -\
        2.*torch.sum(torch.log(torch.diag(L_Kmm))) +\
        torch.sum(torch.log(diag_values))

    # Get the log marginal likelihood.
    LL = -0.5*train_outputs.shape[0]*torch.log(2.*np.pi*torch.ones(1, device=device, dtype=torch.double)) \
        - 0.5*log_det - 0.5*fit
    return LL

def factCore(V, reduce_flag=False):
    r"""Computes :math:`K` such that :math:`I_n + VKV^\top` is a square-root
    for :math:`I_n + VV^\top`.

    Arguments:
        V (Tensor): a low-rank matrix of size [n x k]
    """
    try:
        if reduce_flag:
            V = reduceRank(V)
        I_k = torch.eye(V.shape[1], dtype=V.dtype, device=V.device)
        L = torch.potrf(V.t() @ V, upper=False)
        M = torch.potrf(I_k + L.t() @ L, upper=False)
        Linv = torch.inverse(L)
        K = Linv.t() @ (M - I_k) @ Linv
    except RuntimeError as err:
        if reduce_flag:
            raise
        if str(err).startswith(NOT_FULL_RANK_ERR_MSG):
            warnings.warn(
                "The factor matrix is not full-rank. Torchutil will attempt to "
                "remove unused dimensions. This might impact performance."
            )
            return factCore(V, reduce_flag=True)
        else:
            raise
    return K, V

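# --- Hedged sketch (not part of the original source) ------------------------
# Standalone check of the identity behind factCore: with
#   L L^T = V^T V,   M M^T = I_k + L^T L,   K = L^{-T} (M - I_k) L^{-1},
# the matrix B = I_n + V K V^T satisfies B B^T = I_n + V V^T.
# Assumes an older PyTorch where torch.potrf is available.
import torch

torch.manual_seed(0)
n, k = 7, 3
V = torch.randn(n, k)                   # full column rank with high probability
I_k, I_n = torch.eye(k), torch.eye(n)

L = torch.potrf(V.t() @ V, upper=False)
M = torch.potrf(I_k + L.t() @ L, upper=False)
Linv = torch.inverse(L)
K = Linv.t() @ (M - I_k) @ Linv

B = I_n + V @ K @ V.t()
assert torch.allclose(B @ B.t(), I_n + V @ V.t(), atol=1e-3)
# -----------------------------------------------------------------------------
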
def forward(self, A, B):
    dim = A.size(0)
    logdet = torch.log(
        torch.potrf(A).diag().prod() / (torch.potrf(B).diag().prod() + 0.00001) + 0.00001)
    kl = torch.mm(B.inverse(), A).trace() - dim + logdet
    return 0.5 * kl

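# --- Hedged sketch (not part of the original source) ------------------------
# The forward above relies on log det A = 2 * sum(log(diag(chol(A)))). Summing
# the logs of the Cholesky diagonal (rather than taking the log of the product)
# avoids overflow/underflow for larger matrices. Assumes an older PyTorch with
# torch.potrf.
import torch

def logdet_spd(A):
    # Log-determinant of a symmetric positive-definite matrix via Cholesky.
    return 2.0 * torch.log(torch.potrf(A, upper=True).diag()).sum()

A = torch.eye(4) * 3.0
assert torch.allclose(logdet_spd(A), torch.log(torch.det(A)), atol=1e-5)
# -----------------------------------------------------------------------------
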
def factor_solve_kkt(Q, D, G, A, rx, rs, rz, ry):
    nineq, nz, neq, _ = get_sizes(G, A)

    if neq > 0:
        H_ = torch.cat([
            torch.cat([Q, torch.zeros(nz, nineq).type_as(Q)], 1),
            torch.cat([torch.zeros(nineq, nz).type_as(Q), D], 1)
        ], 0)
        A_ = torch.cat([
            torch.cat([G, torch.eye(nineq).type_as(Q)], 1),
            torch.cat([A, torch.zeros(neq, nineq).type_as(Q)], 1)
        ], 0)
        g_ = torch.cat([rx, rs], 0)
        h_ = torch.cat([rz, ry], 0)
    else:
        H_ = torch.cat([
            torch.cat([Q, torch.zeros(nz, nineq).type_as(Q)], 1),
            torch.cat([torch.zeros(nineq, nz).type_as(Q), D], 1)
        ], 0)
        A_ = torch.cat([G, torch.eye(nineq).type_as(Q)], 1)
        g_ = torch.cat([rx, rs], 0)
        h_ = rz

    U_H_ = torch.potrf(H_)

    invH_A_ = torch.potrs(A_.t(), U_H_)
    invH_g_ = torch.potrs(g_.view(-1, 1), U_H_).view(-1)

    S_ = torch.mm(A_, invH_A_)
    U_S_ = torch.potrf(S_)
    t_ = torch.mv(A_, invH_g_).view(-1, 1) - h_
    w_ = -torch.potrs(t_, U_S_).view(-1)
    v_ = torch.potrs(-g_.view(-1, 1) - torch.mv(A_.t(), w_), U_H_).view(-1)

    return v_[:nz], v_[nz:], w_[:nineq], w_[nineq:] if neq > 0 else None

def forward(ctx, matrix):
    ctx.save_for_backward(matrix)
    try:
        chol_from_upper = torch.potrf(matrix, True)
        chol_from_lower = torch.potrf(matrix, False)
        return (torch.sum(torch.log(torch.diag(chol_from_upper)), 0, keepdim=True) +
                torch.sum(torch.log(torch.diag(chol_from_lower)), 0, keepdim=True)).view(1, 1)
    except RuntimeError:
        eigvals = torch.symeig(matrix)[0]
        return torch.sum(torch.log(eigvals[eigvals > 0]), 0, keepdim=True)

def Fv(self):
    # All the necessary arguments are instance variables, so no need to pass them.
    no_train = self.Xn.shape[0]
    no_inducing = self.Xm.shape[0]

    # Calculate kernel matrices.
    Kmm = self.get_K(self.Xm, self.Xm)
    Knm = self.get_K(self.Xn, self.Xm)
    Kmn = Knm.transpose(0, 1)

    # Calculate the 'inner matrix' and Cholesky decompose it.
    M = Kmm + torch.exp(-self.logsigman2) * Kmn @ Knm
    L = torch.potrf(M + torch.mean(torch.diag(M)) * self.jitter_factor *
                    torch.eye(no_inducing).type(torch.double), upper=False)

    # Compute the first term (log of a Gaussian pdf).
    # Constant term.
    constant_term = -(no_train / 2) * torch.log(torch.Tensor([2 * np.pi])).type(torch.double)

    # Quadratic term - Yn should be a column vector.
    LslashKmny = torch.trtrs(Kmn @ self.Yn, L, upper=False)[0]
    quadratic_term = -0.5 * (
        torch.exp(-self.logsigman2) * self.Yn.transpose(0, 1) @ self.Yn -
        torch.exp(-2 * self.logsigman2) * LslashKmny.transpose(0, 1) @ LslashKmny)

    # Log-determinant term.
    # Cholesky decompose Kmm.
    L_inducing = torch.potrf(
        Kmm + torch.mean(torch.diag(Kmm)) * self.jitter_factor *
        torch.eye(no_inducing).type(torch.double), upper=False)
    logdet_term = -0.5 * (2 * torch.sum(torch.log(torch.diag(L))) -
                          2 * torch.sum(torch.log(torch.diag(L_inducing))) +
                          no_train * self.logsigman2)

    log_gaussian_term = constant_term + logdet_term + quadratic_term

    # Compute the second term (trace regulariser).
    B = torch.trtrs(Kmn, L_inducing, upper=False)[0]
    trace_term = -0.5 * torch.exp(-self.logsigman2) * (
        no_train * torch.exp(self.logsigmaf2) - torch.sum(B**2))

    return log_gaussian_term + trace_term

def generate_momentum(self, q):
    dV = self.linkedV.getdV_tensor(q)
    msoftabsalpha = self.metric.msoftabsalpha
    gg = torch.dot(dV, dV)
    agg = msoftabsalpha * gg
    # print(gg); print(agg); exit()
    dV = dV * math.sqrt((numpy.cosh(agg) - 1) / gg)
    mH = torch.zeros(len(dV), len(dV))
    for i in range(len(dV)):
        v = dV[i]
        L = 1.
        r = math.sqrt(L * L + v * v)
        c = L / r
        s = v / r
        mH[i, i] = r
        for j in range(len(dV)):
            vprime = dV[j]
            Lprime = mH[i, j]
            dV[j] = c * vprime - s * Lprime
            mH[i, j] = s * vprime + c * Lprime
    mH = mH * math.sqrt(gg / numpy.sinh(agg))
    # print(mH); exit()
    mHL = torch.potrf(mH, upper=False)
    out = point(None, self)
    out.flattened_tensor.copy_(torch.mv(mHL, torch.randn(len(dV))))
    out.load_flatten()
    return out

def fit(self, Y, K_dd, eps=1e-6):
    self.L = torch.potrf(K_dd + eps * torch.eye(K_dd.shape[0]), upper=False)
    self.alpha = torch.trtrs(torch.trtrs(Y, self.L, upper=False)[0],
                             self.L.t(), upper=True)[0]
    return self

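# --- Hedged usage sketch (not part of the original source) ------------------
# After fit(), alpha equals (K_dd + eps*I)^{-1} Y, so a predictive mean only
# needs the cross-kernel: mean = K_xd @ alpha. Assumes an older PyTorch with
# torch.potrf / torch.trtrs; the toy RBF kernel below is a hypothetical stand-in.
import torch

n, m = 8, 3
X = torch.randn(n, 1)
Y = torch.randn(n, 1)
K_dd = torch.exp(-0.5 * (X - X.t()) ** 2)       # toy RBF Gram matrix, (n, n)
L = torch.potrf(K_dd + 1e-6 * torch.eye(n), upper=False)
alpha = torch.trtrs(torch.trtrs(Y, L, upper=False)[0], L.t(), upper=True)[0]

X_new = torch.randn(m, 1)
K_xd = torch.exp(-0.5 * (X_new - X.t()) ** 2)   # cross-kernel, (m, n)
mean = K_xd @ alpha                              # predictive mean, (m, 1)
# -----------------------------------------------------------------------------
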
def one_expected_improvement(self):
    assert self.my_param.shape[1] == self.x_train.shape[1]
    assert self.my_param.shape[0] == 1
    f_max = self.y_train.max()
    out_covar = self.covar_pred()
    out_mean = self.mean_pred()
    L_x = torch.potrf(out_covar, upper=False)
    if self.sampling_type == 'MC':
        Z = torch.normal(torch.ones(self.my_param.shape[0], self.sample_size))
    elif self.sampling_type == 'RQMC':
        z_normals = sobol_sequence(self.sample_size,
                                   self.my_param.shape[0],
                                   iSEED=np.random.randint(10**5),
                                   TRANSFORM=1).transpose()
        Z = torch.tensor(z_normals, dtype=torch.float32, requires_grad=False)
    else:
        raise ValueError('sampling type does not exist')
    min_value, __ = torch.min(out_mean + L_x.mm(Z), dim=0)
    inner_term = torch.max((f_max - min_value), torch.zeros(self.sample_size))
    return inner_term.mean()

def gauss_kl(q_mu, q_sqrt, K):
    """
    Compute the KL divergence from

          q(x) = N(q_mu, q_sqrt^2)
    to
          p(x) = N(0, K)

    We assume multiple independent distributions, given by the columns of
    q_mu and the last dimension of q_sqrt.

    q_mu is a matrix, each column contains a mean.

    q_sqrt is a 3D tensor, each matrix within is a lower triangular
        square-root matrix of the covariance of q.

    K is a positive definite matrix: the covariance of p.
    """
    L = torch.potrf(K, upper=False)
    alpha, _ = torch.gesv(q_mu, L)
    KL = 0.5 * (alpha**2).sum()  # Mahalanobis term.
    num_latent = q_sqrt.size(2)
    KL += num_latent * torch.diag(L).log().sum()  # Prior log-det term.
    KL += -0.5 * numpy.prod(q_sqrt.size()[1:])  # constant term
    Lq = batch_tril(q_sqrt.permute(2, 0, 1))  # force lower triangle
    KL += -batch_diag(Lq).log().sum()  # logdet
    LiLq, _ = torch.gesv(
        Lq.permute(1, 0, 2).contiguous().view(L.size(0), -1), L)  # batch with same LHS
    KL += 0.5 * (LiLq**2).sum()  # Trace term
    return KL

def mvnquad(f, means, covs, H, Din, Dout=()):
    """
    Computes N Gaussian expectation integrals of a single function 'f'
    using Gauss-Hermite quadrature.

    Args:
        f: integrand function. Takes one input of shape ?xD.
        means: NxD
        covs: NxDxD
        H: Number of Gauss-Hermite evaluation points.
        Din: Number of input dimensions. Needs to be known at call-time.
        Dout: Number of output dimensions. Defaults to (). Dout is assumed
            to leave out the item index, i.e. f actually maps (?xD)->(?x*Dout).

    Returns:
        quadratures (N,*Dout)
    """
    xn, wn = mvhermgauss(H, Din)
    N = means.size(0)

    # Transform the evaluation points based on the Gaussian parameters.
    Xt = []
    for c in covs:
        chol_cov = torch.potrf(c, upper=False)  # DxD each
        Xt.append(torch.matmul(chol_cov, xn.t()))
    Xt = torch.stack(Xt, dim=0)  # NxDx(H**D)
    X = 2.0 ** 0.5 * Xt + means.unsqueeze(2)  # NxDx(H**D)
    Xr = X.permute(2, 0, 1).view(-1, Din)  # (H**D*N)xD

    # Perform the quadrature.
    fX = f(Xr).view(*((H ** Din, N,) + Dout))
    wr = (wn * float(np.pi) ** (-Din * 0.5)).view(*((-1,) + (1,) * (1 + len(Dout))))
    return (fX * wr).sum(0)

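# --- Hedged sketch (not part of the original source) ------------------------
# One-dimensional version of the change of variables used in mvnquad:
#   E_{x ~ N(m, s^2)}[f(x)]  ≈  pi^{-1/2} * sum_i w_i * f(m + sqrt(2) * s * x_i)
# with (x_i, w_i) the Gauss-Hermite nodes and weights. mvhermgauss is assumed
# to be the multivariate analogue of numpy's hermgauss.
import numpy as np

xn, wn = np.polynomial.hermite.hermgauss(20)
m, s = 0.3, 0.7
f = lambda x: x ** 2
approx = (wn * f(m + np.sqrt(2.0) * s * xn)).sum() / np.sqrt(np.pi)
exact = m ** 2 + s ** 2                 # E[x^2] = m^2 + s^2 for x ~ N(m, s^2)
assert abs(approx - exact) < 1e-8
# -----------------------------------------------------------------------------
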
def _set_pars(self, jitter):
    Ky = self.kernel(self.X, self.X)
    inds = list(range(len(Ky)))
    Ky[[inds], [inds]] += self.sn + jitter
    self.L = torch.potrf(Ky, upper=False)
    self.alpha = torch.trtrs(self.y, self.L, upper=False)[0]
    self.alpha = torch.trtrs(self.alpha, self.L.t(), upper=True)[0]

def loss(self, batch_size):
    mu, marginals, samples = self.forward(batch_size)
    ico = []
    det = []
    for i in range(self.num_samples):
        m = marginals[i, :, :]
        ico.append(torch.inverse(m).unsqueeze(0))
        det.append(torch.potrf(m).diag().prod()**2)
    ico = torch.cat(ico, 0).repeat(batch_size, 1, 1)
    det = torch.cat(det).unsqueeze(0)
    y = (samples - mu).view(-1, self.z_dim, 1)
    a = torch.matmul(ico, y)
    z = torch.matmul(torch.transpose(y, 1, 2), a)
    z = z.view(-1, self.num_samples)
    logq = -0.5 * z - 0.5 * self.z_dim * np.log(2 * np.pi) - 0.5 * torch.log(det)
    logp = self.p.logprob(samples)
    loss = log_mean_exp(logp - logq)
    loss = -torch.mean(loss)
    return loss

def predict_f(self, Xnew, full_cov=False):
    """
    Xnew is a data matrix of the points at which we want to predict.

    This method computes

        p(F* | Y)

    where F* are points on the GP at Xnew and Y are the noisy observations at X.
    """
    Kx = self.kern.K(self.X, Xnew)
    K = self.kern.K(self.X) + Variable(
        torch.eye(self.X.size(0), out=self.X.data.new())) * self.likelihood.variance.get()
    L = torch.potrf(K, upper=False)
    A, _ = torch.gesv(Kx, L)  # could use triangular solve; note gesv takes B first, then A in AX=B
    V, _ = torch.gesv(self.Y - self.mean_function(self.X), L)  # could use triangular solve
    fmean = torch.mm(A.t(), V) + self.mean_function(Xnew)
    if full_cov:
        fvar = self.kern.K(Xnew) - torch.mm(A.t(), A)
        fvar = fvar.unsqueeze(2).expand(fvar.size(0), fvar.size(1), self.Y.size(1))
    else:
        fvar = self.kern.Kdiag(Xnew) - (A**2).sum(0)
        fvar = fvar.view(-1, 1)
        fvar = fvar.expand(fvar.size(0), self.Y.size(1))
    return fmean, fvar

def generate_momentum_wrap(metric, var_vec=None, Cov=None, V=None, alpha=None):
    # Cov is the covariance of the momentum distribution, NOT the empirical
    # sample covariance; the covariance for the momentum = Cov^-1.
    # Returns a tensor generated from prob(p given q).
    if metric == "unit_e":
        def generate(q):
            return torch.randn(len(q))
    elif metric == "diag_e":
        sd = torch.sqrt(var_vec)
        # inv_sd = 1 / sd
        def generate(q):
            return torch.randn(len(q)) * sd
    elif metric == "dense_e":
        L = torch.potrf(a=Cov, upper=False)
        L_t = L.t()
        # L_inv = torch.inverse(L)
        def generate(q):
            return torch.mv(L_t, torch.randn(len(q)))
    elif metric == "softabs":
        def generate(q):
            lam, Q = eigen(getH(q, V).data)
            temp = torch.mm(Q, torch.diag(torch.sqrt(softabs_map(lam, alpha))))
            out = torch.mv(temp, torch.randn(len(lam)))
            return out
    else:
        raise ValueError("unknown metric: {}".format(metric))
    return generate

def __init__(self, x_dim, h_dim, t_dim):
    super(VAE_BKDG, self).__init__()

    self.x_dim = x_dim
    self.h_dim = h_dim
    self.t_dim = t_dim
    d_dim = int(x_dim/t_dim)
    self.d_dim = d_dim
    l_dim = int(d_dim * (d_dim+1)/2)
    self.l_dim = l_dim
    z_dim = t_dim * l_dim
    self.z_dim = z_dim

    # feature
    self.fc0 = nn.Linear(x_dim, h_dim)
    self.fc1 = nn.Linear(h_dim, h_dim)
    # encode
    self.fc21 = nn.Linear(h_dim, z_dim)
    self.fc22 = nn.Linear(h_dim, int(t_dim*(t_dim+1)/2))
    # transform
    self.fc2 = nn.Linear(z_dim, h_dim)
    self.fc3 = nn.Linear(h_dim, h_dim)
    # decode
    self.fc41 = nn.Linear(h_dim, x_dim)
    self.fc42 = nn.Linear(h_dim, x_dim)

    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()
    self.tanh = nn.Tanh()

    # GP kernel over the time grid
    t = torch.linspace(0, 2, steps=t_dim+1)
    t = t[1:]
    self.K = Variable(torch.exp(-torch.pow(t.unsqueeze(1) - t.unsqueeze(0), 2)/2/2)
                      + 1e-4*torch.eye(t_dim))
    self.Kh = torch.potrf(self.K)
    self.iK = torch.potri(self.Kh)

def set_metric(self, input_var):
    # Input: either the flattened empirical covariance for dense_e or
    # the flattened variance tensor for diag_e.
    if self.name == "diag_e":
        try:
            # none of the variances are negative
            assert not sum(input_var < 0) > 0
            # none of the variances are too small or too large
            assert not sum(input_var < 1e-8) > 0 and not sum(input_var > 1e8) > 0
            self._flattened_var.copy_(input_var)
            # self._flattened_sd.copy_(torch.sqrt(self._flattened_var))
            self._load_flatten()
        except AssertionError:
            raise ValueError("negative var or extreme var values")
    elif self.name == "dense_e":
        try:
            temp_cov_inv = torch.inverse(input_var)
            temp_cov_L = torch.potrf(input_var, upper=False)
            # self._flattened_cov.copy_(input_var)
            self._flattened_cov_L.copy_(temp_cov_L)
            self._flattened_cov_inv.copy_(temp_cov_inv)
        except RuntimeError:
            raise ValueError("not decomposable")
    else:
        raise ValueError(
            "should not use this function unless the metrics are diag_e or dense_e")

def pd_to_vec(A):
    """Convert a positive-definite matrix A to a vector l of entries from its
    Cholesky factor.

    Diagonal entries are logged so they occupy the full real line, and still
    map back to positive values.
    """
    L = torch.potrf(A, upper=False)
    return trilpd_to_vec(L)

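# --- Hedged sketch (not part of the original source) ------------------------
# Self-contained illustration of the parameterization used by pd_to_vec: keep
# the lower Cholesky factor, log its diagonal, and flatten the lower triangle,
# so any real vector maps back to a valid SPD matrix. trilpd_to_vec itself is
# not shown in this file; the helpers below are assumed equivalents.
import torch

def spd_to_unconstrained(A):
    n = A.size(0)
    idx = torch.arange(n).long()
    tril_mask = torch.tril(torch.ones(n, n)) == 1
    L = torch.potrf(A, upper=False)
    L[idx, idx] = L.diag().log()                    # log the diagonal
    return L[tril_mask]                             # flatten the lower triangle

def unconstrained_to_spd(v, n):
    idx = torch.arange(n).long()
    tril_mask = torch.tril(torch.ones(n, n)) == 1
    L = torch.zeros(n, n)
    L[tril_mask] = v
    L[idx, idx] = L.diag().exp()                    # positive diagonal
    return L @ L.t()

A = torch.tensor([[2.0, 0.5], [0.5, 1.0]])
v = spd_to_unconstrained(A)
assert torch.allclose(unconstrained_to_spd(v, 2), A, atol=1e-5)
# -----------------------------------------------------------------------------
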
def gauss_kl_diag(q_mu, q_sqrt, K):
    """
    Compute the KL divergence from

          q(x) = N(q_mu, q_sqrt^2)
    to
          p(x) = N(0, K)

    We assume multiple independent distributions, given by the columns of
    q_mu and q_sqrt.

    q_mu is a matrix, each column contains a mean.

    q_sqrt is a matrix, each column represents the diagonal of a square-root
        matrix of the covariance of q.

    K is a positive definite matrix: the covariance of p.
    """
    L = torch.potrf(K, upper=False)
    alpha, _ = torch.gesv(q_mu, L)
    KL = 0.5 * (alpha**2).sum()  # Mahalanobis term.
    num_latent = q_sqrt.size(1)
    KL += num_latent * torch.diag(L).log().sum()  # Prior log-det term.
    KL += -0.5 * q_sqrt.numel()  # constant term
    KL += -q_sqrt.log().sum()  # Log-det of q-cov
    K_inv = torch.potrs(Variable(torch.eye(L.size(0), out=L.data.new())), L, upper=False)
    KL += 0.5 * (torch.diag(K_inv).unsqueeze(1) * q_sqrt**2).sum()  # Trace term.
    return KL

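# --- Hedged sketch (not part of the original source) ------------------------
# Closed-form check of the structure used in gauss_kl_diag above: for
#   q = N(mu, diag(s^2)) and p = N(0, K),
#   KL(q || p) = 0.5 * [ mu^T K^{-1} mu + tr(K^{-1} diag(s^2)) - d
#                        + log det K - sum(log s^2) ].
# Assumes an older PyTorch with torch.potrf / torch.gesv / torch.potrs; the
# data below are hypothetical (a single latent column, no Variable wrapper).
import torch

torch.manual_seed(0)
d = 4
q_mu = torch.randn(d, 1)
q_sqrt = torch.rand(d, 1) + 0.5
B = torch.randn(d, d)
K = B @ B.t() + d * torch.eye(d)

L = torch.potrf(K, upper=False)
alpha, _ = torch.gesv(q_mu, L)
K_inv = torch.potrs(torch.eye(d), L, upper=False)

kl = 0.5 * (alpha ** 2).sum()                     # Mahalanobis term
kl += torch.diag(L).log().sum()                   # prior log-det (num_latent = 1)
kl += -0.5 * d                                    # constant term
kl += -q_sqrt.log().sum()                         # log-det of q covariance
kl += 0.5 * (torch.diag(K_inv).unsqueeze(1) * q_sqrt ** 2).sum()  # trace term

direct = 0.5 * (q_mu.t() @ torch.inverse(K) @ q_mu
                + (torch.diag(torch.inverse(K)).unsqueeze(1) * q_sqrt ** 2).sum()
                - d + torch.log(torch.det(K)) - (q_sqrt ** 2).log().sum())
assert torch.allclose(kl, direct.squeeze(), atol=1e-4)
# -----------------------------------------------------------------------------
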
def chol_orthogonalize(vector_matrix):
    VV = vector_matrix @ vector_matrix.t() + 0.01 * to.eye(vector_matrix.shape[0])
    R = to.potrf(VV, upper=True)
    U = vector_matrix.t() @ to.inverse(R)
    return U

def __init__(self, x_dim, h_dim, z_dim):
    super(VAE, self).__init__()

    self.x_dim = x_dim  # ND
    self.h_dim = h_dim
    self.z_dim = z_dim  # ND^*
    self.t_dim = np.int(x_dim / (2 * z_dim / x_dim - 1))  # N

    # feature
    self.fc0 = nn.Linear(x_dim, h_dim)
    # encode
    self.fc21 = nn.Linear(h_dim, z_dim)
    self.fc22 = nn.Linear(h_dim, np.int(self.t_dim * (self.t_dim + 1) / 2))
    # self.fc23 = nn.Linear(h_dim, z_dim)
    # transform
    self.fc3 = nn.Linear(z_dim, h_dim)
    # decode
    self.fc41 = nn.Linear(h_dim, x_dim)
    self.fc42 = nn.Linear(h_dim, x_dim)

    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()
    self.tanh = nn.Tanh()

    # problem-specific parameters
    self.D = np.int(self.z_dim / self.x_dim * 2 - 1)
    self.N = np.int(self.x_dim / self.D)

    # GP kernel
    t = torch.linspace(0, 2, steps=self.N + 1)
    t = t[1:]
    self.K = Variable(
        torch.exp(-torch.pow(t.unsqueeze(1) - t.unsqueeze(0), 2) / 2 / 2)
        + 1e-4 * torch.eye(self.N))
    self.Kh = torch.potrf(self.K)
    # self.iK = Variable(torch.inverse(self.K.data))
    self.iK = torch.potri(self.Kh)

def forward(self, A):
    """Cholesky decomposition with jittering.

    Add jitter to matrix A if A is not positive definite, increasing the
    amount of jitter with the number of tries. This function uses the LAPACK
    routine::

        torch.potrf(A, upper=False) -> Tensor

    i.e. only the lower factorization A = LL' is used.
    """
    success = False
    max_tries = 10
    i = 0
    while i < max_tries and not success:
        i += 1
        try:
            L = torch.potrf(A, upper=False)
            success = True
        except RuntimeError as e:
            if e.args[0].startswith('Lapack Error in potrf'):
                print('Warning: Cholesky error for the %d time' % i)
                A += A.diag().mean(0).expand(A.size(0),).diag() * 1e-6 * \
                    pow(10, i - 1)
            if i == max_tries:
                raise e
    self.save_for_backward(L)
    return L

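# --- Hedged usage sketch (not part of the original source) ------------------
# Example of the situation the jittering above handles: a rank-deficient Gram
# matrix that plain torch.potrf typically rejects, but which factorizes after a
# small boost to the diagonal proportional to its mean. For brevity a single
# larger jitter (1e-3 * mean diagonal) is applied here, whereas the forward
# above starts at 1e-6 and grows it tenfold per retry. Assumes an older PyTorch
# where torch.potrf is available.
import torch

X = torch.randn(6, 2)
K = X @ X.t()                        # rank 2, so not strictly positive definite
try:
    torch.potrf(K, upper=False)      # typically fails with a Lapack error
except RuntimeError:
    pass
L = torch.potrf(K + 1e-3 * K.diag().mean() * torch.eye(6), upper=False)
# -----------------------------------------------------------------------------
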
def nystrom(Q, anorm):
    r"""
    Use the Nystrom method to obtain approximations to the eigenvalues and
    eigenvectors of A (shifting A on the subspace spanned by the columns of Q
    in order to make the shifted A be positive definite).
    """
    def svd_thin_matrix(A):
        r"""Efficient implementation of SVD on an [N x D] matrix, D >> N."""
        (e, V) = torch.symeig(A @ A.t(), eigenvectors=True)
        Sigma = torch.sqrt(e)
        SigInv = 1 / Sigma
        SigInv[torch.isnan(SigInv)] = 0
        U = A.t() @ (V * SigInv)
        return U, Sigma, V

    anorm = .1e-6 * anorm * math.sqrt(1. * n)
    E = f(Q) + anorm * Q
    R = Q.t() @ E
    R = (R + R.t()) / 2
    R = torch.potrf(R, upper=False)  # Cholesky
    (tmp, _) = torch.gesv(E.t(), R)  # Solve
    V, d, _ = svd_thin_matrix(tmp)
    d = d * d - anorm
    return d, V

def step(self):
    for l in self.linear_layers:
        # Mini-batch updates:
        #   theta_(t+1) = theta_t
        #       + 2 * (gamma * I_hat + N * grad_avg(theta_t; X_t))^-1
        #           * (grad(log p(theta_t)) + N * grad_avg(theta_t) + eta_t)
        #   with eta_t ~ N(0, 4 * B / eta_t)
        # According to Ahn et al. (2012): B \propto N * I_hat
        # Probably a scale problem here!!!
        # if self.t < 10:
        #     I_hat_inv = torch.eye(self.I_hat[l].size(0))
        # else:
        eps = 1e-8 * 10**-(self.t // 10)
        B = self.I_hat[l]
        mat = self.gamma * self.I_hat[l] + 4. * B / self.epsilon
        mat_inv = torch.inverse(mat.add(torch.eye(self.I_hat[l].size(0))))
        # mat_inv = torch.inverse(mat)

        # Cholesky factor of matrix B
        B_ch = torch.potrf(B.add(eps, torch.eye(self.I_hat[l].size(0))), upper=True)
        # B_ch = torch.potrf(B, upper=False)
        noise = (self.noise_factor * B_ch).mm(torch.randn_like(self.grad_mean[l]))

        # Update in parameter space
        update = 2. * (mat_inv).mm((self.grad_mean[l]).add_(
            self.lambda_ / self.N, l.weight.data).add_(noise))
        l.weight.data.add_(-update)

    self.t += 1

def test_interpolated_toeplitz_gp_marginal_log_likelihood_forward():
    x = Variable(torch.linspace(0, 1, 5))
    y = torch.randn(5)
    noise = torch.Tensor([1e-4])

    rbf_covar = RBFKernel()
    rbf_covar.initialize(log_lengthscale=-4)
    covar_module = GridInterpolationKernel(rbf_covar)
    covar_module.initialize_interpolation_grid(10, grid_bounds=(0, 1))
    covar_x = covar_module.forward(x.unsqueeze(1), x.unsqueeze(1))

    c = covar_x.c.data
    T = utils.toeplitz.sym_toeplitz(c)

    W_left = index_coef_to_sparse(covar_x.J_left, covar_x.C_left, len(c))
    W_right = index_coef_to_sparse(covar_x.J_right, covar_x.C_right, len(c))

    W_left_dense = W_left.to_dense()
    W_right_dense = W_right.to_dense()
    WTW = W_left_dense.matmul(T.matmul(W_right_dense.t())) + torch.eye(len(x)) * 1e-4

    quad_form_actual = y.dot(WTW.inverse().matmul(y))
    chol_T = torch.potrf(WTW)
    log_det_actual = chol_T.diag().log().sum() * 2

    actual = -0.5 * (log_det_actual + quad_form_actual + math.log(2 * math.pi) * len(y))

    res = InterpolatedToeplitzGPMarginalLogLikelihood(W_left, W_right, num_samples=1000)(
        Variable(c), Variable(y), Variable(noise)).data
    assert all(torch.abs((res - actual) / actual) < 0.05)

def generate_momentum_wrap(metric, var_vec=None, Cov=None, V=None, alpha=None):
    # Returns a tensor.
    if metric == "unit_e":
        def generate(q):
            return torch.randn(len(q))
    elif metric == "diag_e":
        sd = torch.sqrt(var_vec)
        inv_sd = 1 / sd
        def generate(q):
            return torch.randn(len(q)) * inv_sd
    elif metric == "dense_e":
        L = torch.potrf(Cov, upper=False)
        L_inv = torch.inverse(L)
        def generate(q):
            return torch.mv(L_inv, torch.randn(len(q)))
    elif metric == "softabs":
        def generate(q):
            lam, Q = eigen(getH(q, V).data)
            temp = torch.mm(Q, torch.diag(torch.sqrt(softabs_map(lam, alpha))))
            out = torch.mv(temp, torch.randn(len(lam)))
            return out
    else:
        raise ValueError("unknown metric: {}".format(metric))
    return generate

def train_locator_model(self, model_XTX, model_XTY, model=None):
    if model is None:
        # Direct solve via a Cholesky factorization of X^T X.
        model = torch.potrs(model_XTY, torch.potrf(model_XTX))
    else:
        # Warm start: refine the previous solution with Gauss-Seidel sweeps.
        for _ in range(30):
            model, _ = torch.trtrs(
                model_XTY - torch.mm(torch.triu(model_XTX, diagonal=1), model),
                torch.tril(model_XTX, diagonal=0), upper=False)
    return model

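# --- Hedged sketch (not part of the original source) ------------------------
# The warm-start branch above is a Gauss-Seidel style iteration: split
# XTX = L + U with L lower triangular (including the diagonal) and U strictly
# upper triangular, and repeatedly solve L x_{k+1} = b - U x_k with a
# triangular solve. A minimal standalone check against a direct solve on a
# well-conditioned SPD system, assuming an older PyTorch with torch.trtrs and
# torch.gesv; the toy data below are hypothetical.
import torch

torch.manual_seed(0)
n = 5
A = torch.randn(n, n)
XTX = A @ A.t() + n * torch.eye(n)      # symmetric positive definite
XTY = torch.randn(n, 1)

x = torch.zeros(n, 1)
for _ in range(30):
    rhs = XTY - torch.triu(XTX, diagonal=1) @ x
    x, _ = torch.trtrs(rhs, torch.tril(XTX, diagonal=0), upper=False)

direct, _ = torch.gesv(XTY, XTX)
assert torch.allclose(x, direct, atol=1e-4)
# -----------------------------------------------------------------------------
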
def forward(ctx, a, upper=True):
    ctx.upper = upper
    fact = torch.potrf(a, upper)
    ctx.save_for_backward(fact)
    return fact