def __call__(self, f): """ Compute the following function: E(f) = ||f_l - y_l||^2 + mu f^T L f + mu eps ||f||^2, :param f: Theano tensor Vector of N continuous elements. :return: Theano tensor Energy (cost) of the vector f. """ # Compute the un-normalized graph Laplacian: L = D - W D = T.diag(self.W.sum(axis=0)) L = D - self.W # Compute the label consistency S = T.diag(self.L) El = (f - self.y).T.dot(S.dot(f - self.y)) # Compute the smoothness along the similarity graph I = T.eye(self.L.shape[0]) Es = f.T.dot(L.dot(f)) + self.eps * f.T.dot(I.dot(f)) # Compute the whole cost function E = El + self.mu * Es return E
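# A self-contained sketch (Theano + NumPy only) of the energy described in the
# docstring above, E(f) = ||f_l - y_l||^2 + mu * (f^T L f + eps ||f||^2),
# evaluated on toy data. The names below (W_sym, m_sym, ...) are illustrative
# and are not taken from the class above.
import numpy as np
import theano
import theano.tensor as T

W_sym = T.dmatrix('W')   # similarity matrix
y_sym = T.dvector('y')   # target labels
m_sym = T.dvector('m')   # 1 for labeled examples, 0 otherwise
f_sym = T.dvector('f')   # current label assignment
mu, eps = 0.5, 1e-3

L_sym = T.diag(W_sym.sum(axis=0)) - W_sym                    # un-normalized Laplacian L = D - W
El = (f_sym - y_sym).dot(T.diag(m_sym).dot(f_sym - y_sym))   # label consistency
Es = f_sym.dot(L_sym.dot(f_sym)) + eps * f_sym.dot(f_sym)    # smoothness + ridge
energy = theano.function([f_sym, y_sym, m_sym, W_sym], El + mu * Es)

W = np.array([[0., 1., 0.],
              [1., 0., 1.],
              [0., 1., 0.]])
print(energy([1., 0.5, 0.], [1., 0., 0.], [1., 0., 1.], W))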
def L_op(self, inputs, outputs, gradients): # Modified from theano/tensor/slinalg.py # No handling for on_error = 'nan' dz = gradients[0] chol_x = outputs[0] # this is for nan mode # # ok = ~tensor.any(tensor.isnan(chol_x)) # chol_x = tensor.switch(ok, chol_x, 1) # dz = tensor.switch(ok, dz, 1) # deal with upper triangular by converting to lower triangular if not self.lower: chol_x = chol_x.T dz = dz.T def tril_and_halve_diagonal(mtx): """Extracts lower triangle of square matrix and halves diagonal.""" return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.) def conjugate_solve_triangular(outer, inner): """Computes L^{-T} P L^{-1} for lower-triangular L.""" return gpu_solve_upper_triangular( outer.T, gpu_solve_upper_triangular(outer.T, inner.T).T) s = conjugate_solve_triangular( chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz))) if self.lower: grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s)) else: grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s)) return [grad]
def SampleKsi(d, u, mu, eps): # icml14SBP(20) dn = 1.0/d uDnu = T.sum(u*u*dn) coeff = ( 1-1.0/T.sqrt(1.0+uDnu) ) / (uDnu+SMALLNUM) u = u.reshape((u.shape[0],1)) R = T.diag(T.sqrt(dn)) - coeff*T.dot( T.dot(T.diag(dn),T.dot(u,u.T)), T.diag(T.sqrt(dn)) ) return mu + T.dot(R,eps)
def retr(self, X, Z, t=None):
    U, S, V = X
    Up, M, Vp = Z
    if t is None:
        t = 1.0
    Qu, Ru = tensor.nlinalg.qr(Up)

    # we need rq decomposition here
    Qv, Rv = tensor.nlinalg.qr(Vp[::-1].T)
    Rv = Rv.T[::-1]
    Rv = Rv[:, ::-1]
    Qv = Qv.T[::-1]

    # now we have rq decomposition (Rv @ Qv = Z.Vp)
    #Rv, Qv = rq(Z.Vp, mode='economic')

    zero_block = tensor.zeros((Ru.shape[0], Rv.shape[1]))
    block_mat = tensor.stack(
        (
            tensor.stack((S + t * M, t * Rv), 1).reshape((Rv.shape[0], -1)),
            tensor.stack((t * Ru, zero_block), 1).reshape((Ru.shape[0], -1))
        )
    ).reshape((-1, Ru.shape[1] + Rv.shape[1]))

    Ut, St, Vt = tensor.nlinalg.svd(block_mat, full_matrices=False)

    U_res = tensor.stack((U, Qu), 1).reshape((Qu.shape[0], -1)).dot(Ut[:, :self._k])
    V_res = Vt[:self._k, :].dot(tensor.stack((V, Qv), 0).reshape((-1, Qv.shape[1])))
    # add some machine epsilon to get a slightly perturbed element of the manifold
    # even if we have some zeros in S
    S_res = tensor.diag(St[:self._k]) + tensor.diag(np.spacing(1) * tensor.ones(self._k))
    return (U_res, S_res, V_res)
def retr(self, X, Z, t=None):
    if t is None:
        t = 1.0
    Qu, Ru = tensor.nlinalg.qr(Z.Up)

    # we need rq decomposition here
    Qv, Rv = tensor.nlinalg.qr(Z.Vp[::-1].T)
    Rv = Rv.T[::-1]
    Rv = Rv[:, ::-1]
    Qv = Qv.T[::-1]

    # now we have rq decomposition (Rv @ Qv = Z.Vp)
    #Rv, Qv = rq(Z.Vp, mode='economic')

    zero_block = tensor.zeros((Ru.shape[0], Rv.shape[1]))
    block_mat = tensor.stack(
        (
            tensor.stack((X.S + t * Z.M, t * Rv), 1).reshape((Rv.shape[0], -1)),
            tensor.stack((t * Ru, zero_block), 1).reshape((Ru.shape[0], -1))
        )
    ).reshape((-1, Ru.shape[1] + Rv.shape[1]))

    Ut, St, Vt = tensor.nlinalg.svd(block_mat, full_matrices=False)

    U = tensor.stack((X.U, Qu), 1).reshape((Qu.shape[0], -1)).dot(Ut[:, :self._k])
    V = Vt[:self._k, :].dot(tensor.stack((X.V, Qv), 0).reshape((-1, Qv.shape[1])))
    # add some machine epsilon to get a slightly perturbed element of the manifold
    # even if we have some zeros in S
    S = tensor.diag(St[:self._k]) + tensor.diag(np.spacing(1) * tensor.ones(self._k))
    return ManifoldElementShared.from_vars((U, S, V), shape=(self._m, self._n), r=self._k)
def ehess2rhess(self, X, egrad, ehess, H): # Euclidean part rhess = self.proj(X, ehess) Sinv = tensor.diag(1.0 / tensor.diag(X.S)) # Curvature part T = self.apply_ambient(egrad, H.Vp.T).dot(Sinv) rhess.Up += (T - X.U.dot(X.U.T.dot(T))) T = self.apply_ambient_transpose(egrad, H.Up).dot(Sinv) rhess.Vp += (T - X.V.T.dot(X.V.dot(T))).T return rhess
def __call__(self, A, b, inference=False):
    # Jacobi iteration: split A into its diagonal D and remainder R = A - D,
    # then repeatedly apply x <- D^{-1} (b - R x) for a fixed number of steps.
    dA = T.diagonal(A)
    D = T.diag(dA)
    R = A - D
    iD = T.diag(1.0 / dA)
    x = T.zeros_like(b)
    for i in range(self.iterations):
        x = iD.dot(b - R.dot(x))
    return x
def _global_error(self, targetM, i, lastM): mask = T.neq(self._y[self._set[:, 1]], self._y[self._set[:, 2]]) f = T.nnet.sigmoid # T.tanh g = lambda x, y: x*(1-y) #lambda x: T.maximum(x, 0) # g(lst_prediction - cur_prediction) # f(T.diag(lossil - lossij)) if i == 0: # pull_error for global 0 pull_error = 0. ivectors = self._stackx[:, i, :][self._neighborpairs[:, 0]] jvectors = self._stackx[:, i, :][self._neighborpairs[:, 1]] diffv = ivectors - jvectors pull_error = linalg.trace(diffv.dot(targetM).dot(diffv.T)) else: ivectors = self._stackx[:, i, :][self._neighborpairs[:, 0]] jvectors = self._stackx[:, i, :][self._neighborpairs[:, 1]] diffv1 = ivectors - jvectors distMcur = diffv1.dot(targetM).dot(diffv1.T) # ivectors = self._stackx[:, i-1, :][self._neighborpairs[:, 0]] # jvectors = self._stackx[:, i-1, :][self._neighborpairs[:, 1]] # diffv2 = ivectors - jvectors # distMlast = diffv2.dot(lastM).dot(diffv2.T) pull_error = linalg.trace(T.maximum(distMcur, 0)) push_error = 0.0 ivectors = self._stackx[:, i, :][self._set[:, 0]] jvectors = self._stackx[:, i, :][self._set[:, 1]] lvectors = self._stackx[:, i, :][self._set[:, 2]] diffij = ivectors - jvectors diffil = ivectors - lvectors lossij = diffij.dot(targetM).dot(diffij.T) lossil = diffil.dot(targetM).dot(diffil.T) #cur_prediction = T.diag(lossij - lossil) cur_prediction = f(T.diag(lossil - lossij)) ivectors = self._stackx[:, i-1, :][self._set[:, 0]] jvectors = self._stackx[:, i-1, :][self._set[:, 1]] lvectors = self._stackx[:, i-1, :][self._set[:, 2]] diffij = ivectors - jvectors diffil = ivectors - lvectors if i == 0: lossij = diffij.dot(diffij.T) lossil = diffil.dot(diffil.T) else: lossij = diffij.dot(lastM).dot(diffij.T) lossil = diffil.dot(lastM).dot(diffil.T) lst_prediction = f(T.diag(lossil - lossij)) push_error = T.sum(mask*(g(lst_prediction, cur_prediction))) return pull_error, push_error
def from_partial_old(self, X, dX): eps = 1e-10#np.spacing(1) U, S, V = X dU, dS, dV = dX S = tensor.diag(S) S_pinv = tensor.switch(tensor.gt(abs(S), eps), 1.0 / S, 0.0) S_pinv = tensor.diag(S_pinv) ZV = dU.dot(S_pinv) UtZV = dS ZtU = S_pinv.dot(dV) Zproj = (ZV - U.dot(UtZV), UtZV, ZtU - (UtZV.dot(V))) return Zproj
def diagCholInvLogDet_fromDiag(diag_vec, name): diag_mat = T.diag(diag_vec.flatten()) inv = T.diag(1.0 / diag_vec.flatten()) chol = T.diag(T.sqrt(diag_vec.flatten())) logDet = T.sum(T.log(diag_vec.flatten())) # scalar diag_mat.name = name chol.name = "c" + name inv.name = "i" + name logDet.name = "logDet" + name return (diag_mat, chol, inv, logDet)
def diagCholInvLogDet_fromLogDiag(logdiag, name): diag = T.diag(T.exp(logdiag.flatten())) inv = T.diag(T.exp(-logdiag.flatten())) chol = T.diag(T.exp(0.5 * logdiag.flatten())) logDet = T.sum(logdiag) # scalar diag.name = name chol.name = "c" + name inv.name = "i" + name logDet.name = "logDet" + name return (diag, chol, inv, logDet)
def grad(self, inputs, gradients): """ Cholesky decomposition reverse-mode gradient update. Symbolic expression for reverse-mode Cholesky gradient taken from [0]_ References ---------- .. [0] I. Murray, "Differentiation of the Cholesky decomposition", http://arxiv.org/abs/1602.07527 """ x = inputs[0] dz = gradients[0] chol_x = self(x) # Replace the cholesky decomposition with 1 if there are nans # or solve_upper_triangular will throw a ValueError. if self.on_error == 'nan': ok = ~tensor.any(tensor.isnan(chol_x)) chol_x = tensor.switch(ok, chol_x, 1) dz = tensor.switch(ok, dz, 1) # deal with upper triangular by converting to lower triangular if not self.lower: chol_x = chol_x.T dz = dz.T def tril_and_halve_diagonal(mtx): """Extracts lower triangle of square matrix and halves diagonal.""" return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.) def conjugate_solve_triangular(outer, inner): """Computes L^{-T} P L^{-1} for lower-triangular L.""" return solve_upper_triangular( outer.T, solve_upper_triangular(outer.T, inner.T).T) s = conjugate_solve_triangular( chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz))) if self.lower: grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s)) else: grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s)) if self.on_error == 'nan': return [tensor.switch(ok, grad, np.nan)] else: return [grad]
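# A quick numerical sanity check of the reverse-mode Cholesky gradient above:
# theano.gradient.verify_grad compares the symbolic gradient against finite
# differences. This is only a sketch; the SPD test matrix is arbitrary and the
# only assumption is that Theano itself is installed.
import numpy as np
import theano
import theano.tensor as tensor
from theano.tensor.slinalg import Cholesky

rng = np.random.RandomState(0)
B = rng.randn(4, 4)
spd = B.dot(B.T) + 4 * np.eye(4)   # a well-conditioned SPD test matrix

# verify_grad perturbs the input and compares symbolic vs. numeric gradients
theano.gradient.verify_grad(lambda m: tensor.sum(Cholesky(lower=True)(m)),
                            [spd], rng=rng)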
def recurrence(x_t, h_tm1, c_tm1): i_t = TT.nnet.sigmoid(TT.dot(x_t, W_xi) + TT.dot(h_tm1, W_hi) + TT.dot(c_tm1, TT.diag(W_ci)) + b_i) f_t = TT.nnet.sigmoid(TT.dot(x_t, W_xf) + TT.dot(h_tm1, W_hf) + TT.dot(c_tm1, TT.diag(W_cf)) + b_f) c_t = f_t * c_tm1 + i_t * TT.tanh(TT.dot(x_t, W_xc) + TT.dot(h_tm1, W_hc) + b_c) o_t = TT.nnet.sigmoid(TT.dot(x_t, W_xo) + TT.dot(h_tm1, W_ho) + TT.dot(c_t, TT.diag(W_co)) + b_o) h_t = o_t * TT.tanh(c_t) return h_t, c_t
def log_p_y_I_zA(self):
    sum_y_outers = T.sum(self.Y**2)
    sum_z_IBP_mean_phi_y = T.sum( T.dot( (T.dot(self.phi_IBP, self.Y.T)).T, self.z_IBP_mean ) )
    # sum_z_IBP_mean_phi_outer = T.tril(T.dot(z_IBP_mean.T, z_IBP_mean)) * T.tril()
    # sum_z_IBP_mean_phi_Phi   = T.sum( T.dot(z_IBP_mean.T, (self.Phi_traces+T.sum(self.phi_IBP**2, 1)) ) )
    sum_2ndOrder_term = T.sum( T.dot(self.z_IBP_samp.T,
                                     T.dot(T.dot(self.phi_IBP, self.phi_IBP.T)
                                           + T.diag(T.diag(self.get_tensor_traces_scan(self.Phi_IBP))),
                                           self.z_IBP_samp)) )
    # Gaussian normaliser: log(2*pi*sigma_y^2) = log2pi + 2*log(sigma_y)
    term = -0.5*self.D*self.B*(log2pi + 2*T.log(self.sigma_y)) \
        - 0.5*(self.sigma_y**-2)*(sum_y_outers - 2*sum_z_IBP_mean_phi_y
                                  + sum_2ndOrder_term)
    return term
def get_output_for(self, input, **kwargs): xin_shape = input.shape if input.ndim > 2: # if the input has more than two dimensions, flatten it into a # batch of feature vectors. input = input.flatten(2) activation = T.zeros((input.shape[0], self.shape1[1] * self.shape2[1])) s = T.diag(T.sqrt(T.diag(self.S))) u = self.U.dot(s) w = s.dot(self.V) for i in range(self.manifold._k): activation += apply_mat_to_kron(input, u[:, i].reshape((self.shape1[::-1])).T, w[i, :].reshape((self.shape2[::-1])).T) return activation
def _compile_func(): beta = T.vector('beta') b = T.scalar('b') X = T.matrix('X') y = T.vector('y') C = T.scalar('C') params = [beta, b, X, y, C] cost = 0.5 * (T.dot(beta, beta) + b * b) + C * T.sum( T.nnet.softplus( -T.dot(T.diag(y), T.dot(X, beta) + b) ) ) # Function computing in one go the cost, its gradient # with regard to beta and with regard to the bias. cost_grad = theano.function(params,[ cost, T.grad(cost, beta), T.grad(cost, b) ]) # Function for computing element-wise sigmoid, used for # prediction. log_predict = theano.function( [beta, b, X], T.nnet.sigmoid(b + T.dot(X, beta)), on_unused_input='warn' ) return (cost_grad, log_predict)
def propagate(f, l, R, mu, eps): # The similarity matrix W is a linear combination of the slices in R W = T.tensordot(R, mu, axes=1) # The following indices correspond to labeled and unlabeled examples labeled = T.eq(l, 1).nonzero() unlabeled = T.eq(l, 0).nonzero() # Calculating the graph Laplacian of W D = T.diag(W.sum(axis=0)) L = D - W # Computing L_UU (the Laplacian over unlabeled examples) L_UU = L[unlabeled][:, unlabeled][:, 0, :] # Computing the inverse of the (regularized) Laplacian iA = (L_UU + epsI)^-1 epsI = eps * T.eye(L_UU.shape[0]) rL_UU = L_UU + epsI iA = nlinalg.matrix_inverse(rL_UU) # Computing W_UL (the similarity matrix between unlabeled and labeled examples) W_UL = W[unlabeled][:, labeled][:, 0, :] f_L = f[labeled] # f* = (L_UU + epsI)^-1 W_UL f_L f_star = iA.dot(W_UL.dot(f_L)) return f_star
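# Usage sketch for the propagate() graph builder above (assumed to be in scope):
# compile it once, then run label propagation on a toy 3-node graph with two
# similarity slices stacked along the last axis. Values and shapes are
# illustrative only.
import numpy as np
import theano
import theano.tensor as T

f_sym, l_sym, mu_sym = T.dvector('f'), T.ivector('l'), T.dvector('mu')
R_sym = T.dtensor3('R')
f_star = propagate(f_sym, l_sym, R_sym, mu_sym, eps=1e-6)
infer = theano.function([f_sym, l_sym, R_sym, mu_sym], f_star)

# two toy similarity slices, mixed with weights mu
R = np.stack([np.ones((3, 3)) - np.eye(3), np.eye(3)], axis=-1)
print(infer([1., 0., 0.], np.array([1, 1, 0], dtype='int32'), R, [0.7, 0.3]))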
def dot(self, other):
    if isinstance(other, ManifoldElement):
        mid = self.S.dot(self.V.dot(other.U)).dot(other.S)
        U, S, V = tensor.nlinalg.svd(mid, full_matrices=False)
        # combine the factors: (self.U @ U) @ diag(S) @ (V @ other.V)
        return ManifoldElement(self.U.dot(U), tensor.diag(S), V.dot(other.V))
    else:
        raise ValueError('dot must be performed on ManifoldElements.')
def censor_updates(updates): w = updates[1][0] updated_w = updates[1][1] constrained_w = T.dot(updated_w, T.diag(1 / T.sqrt(T.sum(updated_w ** 2, axis=0)))) new_update = [updates[0], (w, constrained_w)] return new_update
def check_jacobian_det(transform, domain, constructor=tt.dscalar, test=0, make_comparable=None, elemwise=False): y = constructor('y') y.tag.test_value = test x = transform.backward(y) if make_comparable: x = make_comparable(x) if not elemwise: jac = tt.log(tt.nlinalg.det(jacobian(x, [y]))) else: jac = tt.log(tt.abs_(tt.diag(jacobian(x, [y])))) # ljd = log jacobian det actual_ljd = theano.function([y], jac) computed_ljd = theano.function([y], tt.as_tensor_variable( transform.jacobian_det(y)), on_unused_input='ignore') for yval in domain.vals: close_to( actual_ljd(yval), computed_ljd(yval), tol)
def likelihood(f, l, R, mu, eps, sigma2, lambda_1=1e-4): # The similarity matrix W is a linear combination of the slices in R W = T.tensordot(R, mu, axes=1) # The following indices correspond to labeled and unlabeled examples labeled = T.eq(l, 1).nonzero() # Calculating the graph Laplacian of W D = T.diag(W.sum(axis=0)) L = D - W # The Covariance (or Kernel) matrix is the inverse of the (regularized) Laplacian epsI = eps * T.eye(L.shape[0]) rL = L + epsI Sigma = nlinalg.matrix_inverse(rL) # The marginal density of labeled examples uses Sigma_LL as covariance (sub-)matrix Sigma_LL = Sigma[labeled][:, labeled][:, 0, :] # We also consider additive Gaussian noise with variance sigma2 K_L = Sigma_LL + (sigma2 * T.eye(Sigma_LL.shape[0])) # Calculating the inverse and the determinant of K_L iK_L = nlinalg.matrix_inverse(K_L) dK_L = nlinalg.det(K_L) f_L = f[labeled] # The (L1-regularized) log-likelihood is given by the summation of the following four terms term_A = - (1 / 2) * f_L.dot(iK_L.dot(f_L)) term_B = - (1 / 2) * T.log(dK_L) term_C = - (1 / 2) * T.log(2 * np.pi) term_D = - lambda_1 * T.sum(abs(mu)) return term_A + term_B + term_C + term_D
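# Hedged sketch: because the log-likelihood above is differentiable in mu, the
# slice weights can be tuned by gradient ascent with T.grad. Assumes the
# likelihood() builder above is in scope; the step size and toy data are
# illustrative only.
import numpy as np
import theano
import theano.tensor as T

f_sym, l_sym, mu_sym = T.dvector('f'), T.ivector('l'), T.dvector('mu')
R_sym = T.dtensor3('R')
ll = likelihood(f_sym, l_sym, R_sym, mu_sym, eps=1e-6, sigma2=1e-2)
grad_mu = T.grad(ll, mu_sym)
step = theano.function([f_sym, l_sym, R_sym, mu_sym], [ll, grad_mu])

# one ascent step on toy data
R = np.stack([np.ones((3, 3)) - np.eye(3), np.eye(3)], axis=-1)
mu = np.array([0.5, 0.5])
value, g = step([1., 0., 1.], np.array([1, 0, 1], dtype='int32'), R, mu)
mu = mu + 1e-2 * g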
def _build_marginal_likelihood_logp(self, y, X, Xu, sigma): sigma2 = tt.square(sigma) Kuu = self.cov_func(Xu) Kuf = self.cov_func(Xu, X) Luu = cholesky(stabilize(Kuu)) A = solve_lower(Luu, Kuf) Qffd = tt.sum(A * A, 0) if self.approx == "FITC": Kffd = self.cov_func(X, diag=True) Lamd = tt.clip(Kffd - Qffd, 0.0, np.inf) + sigma2 trace = 0.0 elif self.approx == "VFE": Lamd = tt.ones_like(Qffd) * sigma2 trace = ((1.0 / (2.0 * sigma2)) * (tt.sum(self.cov_func(X, diag=True)) - tt.sum(tt.sum(A * A, 0)))) else: # DTC Lamd = tt.ones_like(Qffd) * sigma2 trace = 0.0 A_l = A / Lamd L_B = cholesky(tt.eye(Xu.shape[0]) + tt.dot(A_l, tt.transpose(A))) r = y - self.mean_func(X) r_l = r / Lamd c = solve_lower(L_B, tt.dot(A, r_l)) constant = 0.5 * X.shape[0] * tt.log(2.0 * np.pi) logdet = 0.5 * tt.sum(tt.log(Lamd)) + tt.sum(tt.log(tt.diag(L_B))) quadratic = 0.5 * (tt.dot(r, r_l) - tt.dot(c, c)) return -1.0 * (constant + logdet + quadratic + trace)
def mvnorm_logpdf(x, mu = None, Li = None): """ Parameters ++++++++++ mu - mean of MVN, if not given assume zero mean Li - inverse of lower cholesky """ import autograd.numpy as T dim = Li.shape[0] Ki = np.dot(Li.T, Li) #determinant is just multiplication of diagonal elements of cholesky logdet = 2*T.log(1./T.diag(Li)).sum() lpdf_const = -0.5 * (dim * T.log(2 * np.pi) + logdet) if mu is None: d = T.reshape(x, (dim, 1)) else: d = (x - mu.reshape((1 ,dim))).T Ki_d = T.dot(Ki, d) #vector res_pdf = (lpdf_const - 0.5 * diag_dot(d.T, Ki_d)).T if res_pdf.size == 1: res_pdf = res_pdf[0] return res_pdf
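# The same "log-determinant from the Cholesky diagonal" trick, written as a
# standalone Theano graph and checked against scipy. Names are illustrative and
# independent of the autograd-based helper above.
import numpy as np
import theano
import theano.tensor as T
from scipy.stats import multivariate_normal

x_s, mu_s, Li_s = T.dvector('x'), T.dvector('mu'), T.dmatrix('Li')  # Li = inv(chol(K))
dim = Li_s.shape[0]
Ki = Li_s.T.dot(Li_s)                          # K^{-1} = Li^T Li
logdet = 2 * T.sum(T.log(1.0 / T.diag(Li_s)))  # log|K| from the Cholesky diagonal
d = x_s - mu_s
logpdf = -0.5 * (dim * T.log(2 * np.pi) + logdet + d.dot(Ki.dot(d)))
f = theano.function([x_s, mu_s, Li_s], logpdf)

K = np.array([[2.0, 0.3], [0.3, 1.0]])
Li = np.linalg.inv(np.linalg.cholesky(K))
print(f([0.2, -0.1], [0.0, 0.0], Li),
      multivariate_normal(mean=[0., 0.], cov=K).logpdf([0.2, -0.1]))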
def log_likelihood(self): Users = self.U[:, :-1] Middle = self.S Items = self.V[:-1, :] UserBiases = self.U[:, -1].reshape((-1, 1)) ItemBiases = self.V[-1, :].reshape((-1, 1)) A = T.dot(T.dot(self.U[:, :-1], self.S[:-1, :-1]), self.V[:-1, :]) A = T.inc_subtensor(A[:, :], UserBiases * T.sqrt(self.S[-1, -1])) A = T.inc_subtensor(A[:, :], ItemBiases.T * T.sqrt(self.S[-1, -1])) B = A * self.counts loglik = T.sum(B) A = T.exp(A) A += 1 A = T.log(A) A = (self.counts + 1) * A loglik -= T.sum(A) # L2 regularization loglik -= 0.5 * self.reg_param * T.sum(T.square(T.diag(self.S)[:-1])) # Return negation of LogLikelihood cause we will minimize cost return -loglik
def log_mvn(self, y, mean, beta):
    # Diagonal noise: y is an N x D data matrix; return the average Gaussian
    # log-likelihood of y given the mean and the per-dimension precisions beta.
    N = y.shape[0]
    D = y.shape[1]
    LL, updates = theano.scan(
        fn=lambda a: -0.5 * D * T.sum(T.log(2 * np.pi * (1 / T.diag(beta))))
                     - 0.5 * T.sum(T.dot(beta, (y - a)**2)),
        sequences=[mean])
    return T.mean(LL)
def likelihood_domain(self,target,Xlabel): self.beta = T.exp(self.ls) betaI=T.diag(T.dot(Xlabel,self.beta)) Covariance = betaI LL = self.log_mvn(target, self.output, Covariance)# - 0.5*T.sum(T.dot(betaI,Ktilda)) return LL
def SPD_Project(mat):
    # force symmetric
    mat = (mat+mat.T)/2.0
    # clip negative eigenvalues to zero, i.e. project onto the PSD cone
    eig, eigv = linalg.eig(mat)
    eig = T.maximum(eig, 0)
    eig = T.diag(eig)
    return eigv.dot(eig).dot(eigv.T)
def l2ls_learn_basis_dual(X, S, c): tX = T.matrix('X') tS = T.matrix('S') tc = T.scalar('c') tlambdas = T.vector('lambdas') tXST = T.dot(tX, tS.T) tSSTetc = la.matrix_inverse(T.dot(tS, tS.T) + T.diag(tlambdas)) objective = -(T.dot(tX, tX.T).trace() - reduce(T.dot, [tXST, tSSTetc, tXST.T]).trace() - tc*tlambdas.sum()) objective_fn = theano.function([tlambdas], objective, givens={tX: X, tS: S, tc: c}) objective_grad_fn = theano.function([tlambdas], T.grad(objective, tlambdas), givens={tX: X, tS: S, tc: c}) initial_lambdas = 10*np.abs(np.random.random((S.shape[0], 1))) output = scipy.optimize.fmin_cg(f=objective_fn, fprime=objective_grad_fn, x0=initial_lambdas, maxiter=100, full_output=True) logging.debug("optimizer stats %s" % (output[1:],)) logging.debug("optimizer lambdas %s" % output[0]) lambdas = output[0] B = np.dot(np.linalg.inv(np.dot(S, S.T) + np.diag(lambdas)), np.dot(S, X.T)).T return B
def _theano_project_sd(self, mat): # force symmetric mat = (mat+mat.T)/2.0 eig, eigv = linalg.eig(mat) eig = T.maximum(eig, 0) eig = T.diag(eig) return eigv.dot(eig).dot(eigv.T)
def forward_batch_step(x_t, H_mask, H_tm1): H = TT.dot(W_rec,H_tm1) + W_in[:,x_t] H_t = TT.nnet.sigmoid(H) Y_t = TT.nnet.softmax(TT.transpose(TT.dot(W_out, H_t))) Y_t = -TT.log2(Y_t) Y_t = TT.dot(TT.transpose(Y_t), TT.diag(H_mask)) return [H_t, Y_t]
def make_model(cls): with pm.Model() as model: sd_mu = np.array([1, 2, 3, 4, 5]) sd_dist = pm.Lognormal.dist(mu=sd_mu, sigma=sd_mu / 10., shape=5) chol_packed = pm.LKJCholeskyCov('chol_packed', eta=3, n=5, sd_dist=sd_dist) chol = pm.expand_packed_triangular(5, chol_packed, lower=True) cov = tt.dot(chol, chol.T) stds = tt.sqrt(tt.diag(cov)) pm.Deterministic('log_stds', tt.log(stds)) corr = cov / stds[None, :] / stds[:, None] corr_entries_unit = (corr[np.tril_indices(5, -1)] + 1) / 2 pm.Deterministic('corr_entries_unit', corr_entries_unit) return model
def merge_factors(self, X, Z=None, diag=False): factor_list = [] for factor in self.factor_list: if isinstance(factor, Covariance): factor_list.append(factor(X, Z, diag)) elif hasattr(factor, "ndim"): if diag: factor_list.append(tt.diag(factor)) else: factor_list.append(factor) else: factor_list.append(factor) return factor_list
def lanczos(linear_op, z, m, batch_size): s = z.norm(2, axis=1) v = z / s.dimshuffle(0, 'x') alpha = [] beta = [] V = [] V.append(v) v_curr = v b = None v_prev = None for j in xrange(m): if j == 0: r = linear_op(v_curr) else: r = linear_op(v_curr) - b.dimshuffle(0, 'x') * v_prev a = T.batched_dot(v_curr, r) r = r - a.dimshuffle(0, 'x') * v_curr b = r.norm(2, axis=1) v_prev = v_curr v_curr = r / b.dimshuffle(0, 'x') alpha.append(a) if j < m - 1: V.append(v_curr) beta.append(b) Az_list = [] for idx in xrange(batch_size): alpha_diag = T.diag(T.stacklists([a_[idx] for a_ in alpha])) beta_diag = T.diag(T.stacklists([b_[idx] for b_ in beta] + [0])) M = alpha_diag + T.roll(beta_diag, 1, 0) + T.roll(beta_diag, 1, 1) V_matrix = T.stacklists([v_[idx] for v_ in V]).T approx_sqrt = s[idx] * V_matrix.dot(theano_sqrtm(M)[:, 0]) Az_list.append(approx_sqrt) Azs = T.stacklists(Az_list) return Azs
def get_att(X, index): """ Input attention, single sentence. Args: X: tensor, shape=[n, embed_dim] index: int, target index Return: tensor, shape=[n, embed_dim] """ result, update = theano.scan(lambda v, u: T.dot(v, T.transpose(u)), sequences=X, non_sequences=X[index]) result_soft = T.nnet.softmax(result) A = T.diag(T.flatten(result_soft)) # n×n return T.dot(A, X) # [n, embed_dim]
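# Usage sketch for get_att() above (assumed to be in scope): weight each token
# embedding by its softmax-normalised similarity to the target token. Shapes
# and values are illustrative only.
import numpy as np
import theano
import theano.tensor as T

X_sym = T.dmatrix('X')                        # [n, embed_dim]
attended = theano.function([X_sym], get_att(X_sym, index=1))
X = np.random.RandomState(0).randn(4, 3)      # 4 tokens, 3-dim embeddings
print(attended(X).shape)                      # (4, 3)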
def grad(self, inputs, gradients): """ Cholesky decomposition reverse-mode gradient update. Symbolic expression for reverse-mode Cholesky gradient taken from [0]_ References ---------- .. [0] I. Murray, "Differentiation of the Cholesky decomposition", http://arxiv.org/abs/1602.07527 """ x = inputs[0] dz = gradients[0] chol_x = self(x) # deal with upper triangular by converting to lower triangular if not self.lower: chol_x = chol_x.T dz = dz.T def tril_and_halve_diagonal(mtx): """Extracts lower triangle of square matrix and halves diagonal.""" return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.) def conjugate_solve_triangular(outer, inner): """Computes L^{-T} P L^{-1} for lower-triangular L.""" return solve_upper_triangular( outer.T, solve_upper_triangular(outer.T, inner.T).T) s = conjugate_solve_triangular( chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz))) if self.lower: return [tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))] else: return [tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))]
def _create_theano_likelihood_graph(data, t, ind_t, n_time, n_inducing_time, approx='FITC'):
    """
    Here we use theano to compile a computational graph defining our discrepancy
    likelihood. Note that it just compiles this graph as a C program which will be
    called repeatedly by pints, so all the variables defined here are simply
    placeholders.
    """
    rho = tt.dscalar('rho')
    ker_sigma = tt.dscalar('ker_sigma')
    sigma = tt.dscalar('sigma')

    time = theano.tensor.as_tensor_variable(t)
    inducing_time = theano.tensor.as_tensor_variable(ind_t)
    y = theano.tensor.as_tensor_variable(data)
    current = tt.dvector('current')

    cov_func = RbfKernel(rho, ker_sigma)

    sigma2 = tt.square(sigma)
    Kuu = cov_func(inducing_time)
    Kuf = cov_func(inducing_time, time)
    Luu = cholesky(stabilize(Kuu))
    A = solve_lower(Luu, Kuf)
    Qffd = tt.sum(A * A, 0)
    if approx == 'FITC':
        Kffd = cov_func(time, diag=True)
        Lamd = tt.clip(Kffd - Qffd, 0.0, np.inf) + sigma2
        trace = 0.0
    elif approx == 'VFE':
        Lamd = tt.ones_like(Qffd) * sigma2
        trace = ((1.0 / (2.0 * sigma2)) *
                 (tt.sum(cov_func(time, diag=True)) -
                  tt.sum(tt.sum(A * A, 0))))
    else:  # DTC
        Lamd = tt.ones_like(Qffd) * sigma2
        trace = 0.0
    A_l = A / Lamd
    L_B = cholesky(tt.eye(n_inducing_time) + tt.dot(A_l, tt.transpose(A)))
    r = y - current
    r_l = r / Lamd
    c = solve_lower(L_B, tt.dot(A, r_l))
    constant = 0.5 * n_time * tt.log(2.0 * np.pi)
    logdet = 0.5 * tt.sum(tt.log(Lamd)) + tt.sum(tt.log(tt.diag(L_B)))
    quadratic = 0.5 * (tt.dot(r, r_l) - tt.dot(c, c))
    ll = -1.0 * (constant + logdet + quadratic + trace)
    return theano.function([current, rho, ker_sigma, sigma], ll, on_unused_input='ignore')
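# Usage sketch (assuming the builder above and its RbfKernel / cholesky /
# solve_lower helpers are importable): compile the graph once, then evaluate the
# discrepancy log-likelihood repeatedly for different hyper-parameters. The toy
# traces below are illustrative only.
import numpy as np

t = np.linspace(0.0, 1.0, 200)            # observation times
ind_t = t[::20]                           # inducing points
data = np.sin(2 * np.pi * t)              # toy "measured" trace
model_current = np.sin(2 * np.pi * t)     # toy model prediction

loglik = _create_theano_likelihood_graph(data, t, ind_t, len(t), len(ind_t),
                                         approx='FITC')
print(loglik(model_current, 0.1, 1.0, 0.05))   # (current, rho, ker_sigma, sigma)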
def get_model(self, lengthscale_trf, lengthscale_p_trf, sn_trf, sf_trf, S, MU, SIGMA_trf, U, b, X, Q, D, N, M): EPhi, EPhiTPhi = self.get_EPhi(lengthscale_trf, lengthscale_p_trf, sf_trf, S, MU, SIGMA_trf, U, b, N, M) EPhiTPhi_reg = self.reg_EPhi(lengthscale_trf, lengthscale_p_trf, sf_trf, S, MU, SIGMA_trf, U, b, N, M, D) K_MM = self.kernel_gauss(U, lengthscale_trf, lengthscale_p_trf, sf_trf) XT_EPhi = X.T.dot(EPhi) opt_A_mean, cholSigInv, cholK_MM, InvK_MM = self.get_opt_A( sn_trf, EPhiTPhi, XT_EPhi, K_MM) LL = -0.5 * (D * ( (N - M) * T.log(sn_trf) + N * np.log(2 * np.pi) - T.sum(2 * T.log( T.diag(cholK_MM))) + T.sum(2 * T.log(T.diag(cholSigInv))) + (N * sf_trf - T.sum(T.diag(InvK_MM.dot(EPhiTPhi_reg)))) / sn_trf) + T.sum(X**2) / sn_trf - T.sum(opt_A_mean.T * XT_EPhi) / sn_trf) KL_X = -0.5 * (T.log(2 * np.pi * SIGMA_trf) + 1).sum() + 0.5 * (np.log( 2 * np.pi)) + 0.5 * (SIGMA_trf + MU**2).sum() return LL, KL_X
def solve(self, X, flux, cho_C, mu, LInv): """ Compute the maximum a posteriori (MAP) prediction for the spherical harmonic coefficients of a map given a flux timeseries. Args: X (matrix): The flux design matrix. flux (array): The flux timeseries. cho_C (scalar/vector/matrix): The lower cholesky factorization of the data covariance. mu (array): The prior mean of the spherical harmonic coefficients. LInv (scalar/vector/matrix): The inverse prior covariance of the spherical harmonic coefficients. Returns: The vector of spherical harmonic coefficients corresponding to the MAP solution and the Cholesky factorization of the corresponding covariance matrix. """ # Compute C^-1 . X if cho_C.ndim == 0: CInvX = X / cho_C**2 elif cho_C.ndim == 1: CInvX = tt.dot(tt.diag(1 / cho_C**2), X) else: CInvX = _cho_solve(cho_C, X) # Compute W = X^T . C^-1 . X + L^-1 W = tt.dot(tt.transpose(X), CInvX) if LInv.ndim == 0: W = tt.inc_subtensor( W[tuple((tt.arange(W.shape[0]), tt.arange(W.shape[0])))], LInv) LInvmu = mu * LInv elif LInv.ndim == 1: W = tt.inc_subtensor( W[tuple((tt.arange(W.shape[0]), tt.arange(W.shape[0])))], LInv) LInvmu = mu * LInv else: W += LInv LInvmu = tt.dot(LInv, mu) # Compute the max like y and its covariance matrix cho_W = sla.cholesky(W) M = _cho_solve(cho_W, tt.transpose(CInvX)) yhat = tt.dot(M, flux) + _cho_solve(cho_W, LInvmu) ycov = _cho_solve(cho_W, tt.eye(cho_W.shape[0])) cho_ycov = sla.cholesky(ycov) return yhat, cho_ycov
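# Dense NumPy cross-check of the MAP formula implemented above, for the simplest
# case of scalar data noise and a scalar prior precision:
# yhat = (X^T C^-1 X + L^-1)^-1 (X^T C^-1 flux + L^-1 mu). Purely illustrative;
# the names here are not taken from the class.
import numpy as np

rng = np.random.RandomState(1)
X = rng.randn(50, 4)
flux = rng.randn(50)
mu = np.zeros(4)
sigma, linv = 0.1, 2.0                     # scalar cho_C and scalar LInv

W = X.T.dot(X) / sigma**2 + linv * np.eye(4)
yhat = np.linalg.solve(W, X.T.dot(flux) / sigma**2 + linv * mu)
print(yhat)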
def GrabProbs(classProbs, target, gRange=None): if classProbs.ndim > 2: classProbs = classProbs.reshape((classProbs.shape[0] * classProbs.shape[1], classProbs.shape[2])) else: classProbs = classProbs if target.ndim > 1: tflat = target.flatten() else: tflat = target ### Hack for Theano, much faster than [x, y] indexing ### avoids a copy onto the GPU return T.diag(classProbs.T[tflat])
def construct_likelihood(self): lower_idxs = np.tril_indices(self.data.shape[-1], k=-1) L = pm.expand_packed_triangular(self.ndim, self.model['packed_L']) Sigma = pm.Deterministic('Sigma', L.dot(L.T)) std = tt.sqrt(tt.diag(Sigma)) corr = Sigma / tt.outer(std, std) pm.Deterministic('corr_coeffs', corr[lower_idxs]) if self.data_covariances is None: pm.MvNormal('like', mu=self.model['mu'], chol=L, observed=self.residual_data) else: like = _multivariate_normal_convolution_likelihood(Sigma, self.model['mu'], self.residual_data, self.data_covariances) pm.Potential('like', like)
def ksd_eval(X0, h0, score_q, **model_params): X = sharedX(X0) h = sharedX(h0) Sqx = score_q(X, **model_params) H = sqr_dist(X, X) h = T.sqrt(h / 2.) V = H.flatten() # median distance h = T.switch( T.eq((V.shape[0] % 2), 0), # if even vector T.mean(T.sort(V)[((V.shape[0] / 2) - 1):((V.shape[0] / 2) + 1)]), # if odd vector T.sort(V)[V.shape[0] // 2]) # compute the rbf kernel Kxy = T.exp(-H / h**2 / 2.) Sqxdy = -(T.dot(Sqx, X.T) - T.tile( T.sum(Sqx * X, axis=1).dimshuffle(0, 'x'), (1, X.shape[0]))) / (h**2) dxSqy = T.transpose(Sqxdy) dxdy = (-H / (h**4) + X.shape[1].astype(theano.config.floatX) / (h**2)) M = (T.dot(Sqx, Sqx.T) + Sqxdy + dxSqy + dxdy) * Kxy M2 = M - T.diag(T.diag(M)) ksd_u = T.sum(M2) / (X.shape[0] * (X.shape[0] - 1)) ksd_v = T.sum(M) / (X.shape[0]**2) f = theano.function(inputs=[], outputs=[ksd_u, ksd_v]) return f()
def L_op(self, inputs, outputs, gradients): # Modified from theano/tensor/slinalg.py # No handling for on_error = 'nan' dz = gradients[0] chol_x = outputs[0] # this is for nan mode # # ok = ~tensor.any(tensor.isnan(chol_x)) # chol_x = tensor.switch(ok, chol_x, 1) # dz = tensor.switch(ok, dz, 1) # deal with upper triangular by converting to lower triangular if not self.lower: chol_x = chol_x.T dz = dz.T def tril_and_halve_diagonal(mtx): """Extracts lower triangle of square matrix and halves diagonal.""" return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.0) def conjugate_solve_triangular(outer, inner): """Computes L^{-T} P L^{-1} for lower-triangular L.""" return gpu_solve_upper_triangular( outer.T, gpu_solve_upper_triangular(outer.T, inner.T).T ) s = conjugate_solve_triangular( chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)) ) if self.lower: grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s)) else: grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s)) return [grad]
def compileFun(model_name, dataset_name, pooling_mode): print "model_name: ", model_name print "dataset_name: ", dataset_name print "pooling_mode: ", pooling_mode print "Started!" rng = numpy.random.RandomState(23455) sentenceWordCount = T.ivector("sentenceWordCount") corpus = T.matrix("corpus") # docLabel = T.ivector('docLabel') # for list-type data layer0 = DocEmbeddingNNOneDoc(corpus, sentenceWordCount, rng, wordEmbeddingDim=200, \ sentenceLayerNodesNum=100, \ sentenceLayerNodesSize=[5, 200], \ docLayerNodesNum=100, \ docLayerNodesSize=[3, 100], pooling_mode=pooling_mode) layer1_output_num = 100 layer1 = HiddenLayer( rng, input=layer0.output, n_in=layer0.outputDimension, n_out=layer1_output_num, activation=T.tanh ) layer2 = LogisticRegression(input=layer1.output, n_in=100, n_out=2) cost = layer2.negative_log_likelihood(1 - layer2.y_pred) # calculate sentence sentence_score sentence_grads = T.grad(cost, layer0.sentenceResults) sentence_score = T.diag(T.dot(sentence_grads, T.transpose(layer0.sentenceResults))) # construct the parameter array. params = layer2.params + layer1.params + layer0.params # Load the parameters last time, optionally. model_path = "data/" + dataset_name + "/" + model_name + "/" + pooling_mode + ".model" loadParamsVal(model_path, params) print "Compiling computing graph." output_model = theano.function( [corpus, sentenceWordCount], [layer2.y_pred, sentence_score] ) print "Compiled." return output_model
def __init__(self, rng, embedding, idx_context, gamma, dim_emb, dict_size):
    """ Initialize the parameters of the logistic regression

    :param rng: numpy random number generator used to initialize W
    :param embedding: tensor of input word embeddings
    :param idx_context: indices of the context words whose rows of W are used
    :param gamma: scaling factor applied inside the sigmoid
    :param dim_emb: dimensionality of the embeddings
    :param dict_size: size of the vocabulary
    """
    # weights W: matrix of shape (dict_size, dim_emb); the uniform init below is
    # zeroed out when the shared variable is created (W_values * 0.)
    W_values = numpy.asarray(
        rng.uniform(
            low=-numpy.sqrt(6. / (dict_size + dim_emb)),
            high=numpy.sqrt(6. / (dict_size + dim_emb)),
            size=(dict_size, dim_emb)
        ),
        dtype=theano.config.floatX
    )

    self.W = theano.shared(
        W_values * 0.,
        # value=numpy.zeros(
        #     (dict_size, dim_emb),
        #     dtype=theano.config.floatX
        # ),
        name='SoftmaxW',
        borrow=True
    )

    self.params = [self.W]

    self.prediction = T.nnet.sigmoid(
        gamma * T.diag(T.dot(embedding, self.W[idx_context].T))
    )
    self.regul = T.mean(T.diag(T.dot(embedding, embedding.T)
                               + T.dot(self.W[idx_context], self.W[idx_context].T)))
    self.negLogLikelihood = -T.mean(T.log(self.prediction))
def __init__(self, input_dim, W=None, kappa=None, B=None, active_dims=None): super(Coregion, self).__init__(input_dim, active_dims) if len(self.active_dims) != 1: raise ValueError('Coregion requires exactly one dimension to be active') make_B = W is not None or kappa is not None if make_B and B is not None: raise ValueError('Exactly one of (W, kappa) and B must be provided to Coregion') if make_B: self.W = tt.as_tensor_variable(W) self.kappa = tt.as_tensor_variable(kappa) self.B = tt.dot(self.W, self.W.T) + tt.diag(self.kappa) elif B is not None: self.B = tt.as_tensor_variable(B) else: raise ValueError('Exactly one of (W, kappa) and B must be provided to Coregion')
def psd(self, omega): ar, cr, ac, bc, cc, dc = self.term.coefficients omega = tt.reshape(omega, tt.concatenate([omega.shape, [1]]), ndim=omega.ndim + 1) w2 = omega**2 w02 = cc**2 + dc**2 power = tt.sum(ar * cr / (cr**2 + w2), axis=-1) power += tt.sum( ((ac * cc + bc * dc) * w02 + (ac * cc - bc * dc) * w2) / (w2 * w2 + 2.0 * (cc**2 - dc**2) * w2 + w02 * w02), axis=-1, ) psd = np.sqrt(2.0 / np.pi) * power return psd[:, None] * tt.diag(self.R)
def step(self, mask, input_term, forget_term, output_term, cell_term, h_pre, c_pre): input_term += T.dot(h_pre, self.lstm_input_wh) + T.dot(c_pre, T.diag(self.lstm_input_wc)) forget_term += T.dot(h_pre, self.lstm_forget_wh) + T.dot(c_pre, T.diag(self.lstm_forget_wc)) input_term += self.lstm_input_b forget_term += self.lstm_forget_b input_gate = T.nnet.sigmoid(input_term) forget_gate = T.nnet.sigmoid(forget_term) cell_term += T.dot(h_pre, self.lstm_cell_wh) cell_term += self.lstm_cell_b c = forget_gate * c_pre + input_gate * T.tanh(cell_term) output_term += T.dot(h_pre, self.lstm_output_wh) + T.dot(c, T.diag(self.lstm_output_wc)) output_term += self.lstm_output_b output_gate = T.nnet.sigmoid(output_term) h = output_gate * T.tanh(c) return h, c
def MMD_kenel_Xonly(self, gamma, Label, Knn, Weight):
    Dn = Label.shape[1]
    DD1 = T.tile(Label.T, (Dn, 1, 1))
    tttt = DD1[:, :, :, None] * DD1.transpose((1, 0, 2))[:, :, None, :]
    Hh = T.sum(T.sum(tttt * Knn[None, None, :, :], -1), -1)
    Hh = Hh * Weight
    GH = T.tile(T.diag(Hh), (Dn, 1))
    # Up to here we have the D x D MMD distances; next, push them through an RBF kernel.
    new = T.exp(-(GH.T + GH - 2 * Hh) / (2 * gamma**2))
    KK = tttt * new[:, :, None, None]
    # KK1 = T.where(T.eq(KK, 0), 1, KK)
    # (This would be for Z: X has only one nonzero label per point, but Z has several.
    #  We want the weighted product over all of them, and any zero would wipe the
    #  product out, so zeros would be replaced by ones here.)
    KK2 = T.sum(T.sum(KK, 0), 0)
    Kmmd_rbf = KK2 * Knn  # multiply by the RBF kernel
    return Kmmd_rbf
def cholInvLogDet(A, dim, jitter, fast=False): A_jitter = A + jitter * T.eye(dim) cA = myCholesky()(A_jitter) cA.name = 'c' + A.name if fast: (iA, logDetA) = invLogDet(cA) else: iA = nlinalg.matrix_inverse(A_jitter) #logDetA = T.log( nlinalg.Det()(A_jitter) ) logDetA = 2.0 * T.sum(T.log(T.abs_(T.diag(cA)))) iA.name = 'i' + A.name logDetA.name = 'logDet' + A.name return (cA, iA, logDetA)
def evaluateLogDensity(self, X, Y): Ypred = theano.clone(self.rate, replace={self.Xsamp: X}) resY = Y - Ypred resX = X[1:] - T.dot(X[:(X.shape[0] - 1)], self.A.T) resX0 = X[0] - self.x0 LogDensity = -(0.5 * T.dot(resY.T, resY) * T.diag(self.Rinv)).sum() - ( 0.5 * T.dot(resX.T, resX) * self.Lambda).sum() - 0.5 * T.dot( T.dot(resX0, self.Lambda0), resX0.T) LogDensity += 0.5 * (T.log( self.Rinv)).sum() * Y.shape[0] + 0.5 * T.log(Tla.det( self.Lambda)) * (Y.shape[0] - 1) + 0.5 * T.log( Tla.det(self.Lambda0)) - 0.5 * ( self.xDim + self.yDim) * np.log(2 * np.pi) * Y.shape[0] return LogDensity
def get_celerite_matrices(self, x, diag): x = tt.as_tensor_variable(x) ar, cr, ac, bc, cc, dc = self.term.coefficients U = tt.concatenate( ( ar[None, :] + tt.zeros_like(x)[:, None], ac[None, :] * tt.cos(dc[None, :] * x[:, None]) + bc[None, :] * tt.sin(dc[None, :] * x[:, None]), ac[None, :] * tt.sin(dc[None, :] * x[:, None]) - bc[None, :] * tt.cos(dc[None, :] * x[:, None]), ), axis=1, ) V = tt.concatenate( ( tt.zeros_like(ar)[None, :] + tt.ones_like(x)[:, None], tt.cos(dc[None, :] * x[:, None]), tt.sin(dc[None, :] * x[:, None]), ), axis=1, ) if 'alpha' in vars(self): x = tt.reshape( tt.tile(x, (self.alpha.shape[0], 1)).T, (1, x.size * self.alpha.shape[0]))[0] dx = x[1:] - x[:-1] a = diag + (self.alpha**2)[:, None] * (tt.sum(ar) + tt.sum(ac)) a = tt.reshape(a.T, (1, a.size))[0] U = tt.slinalg.kron(U, self.alpha[:, None]) V = tt.slinalg.kron(V, self.alpha[:, None]) c = tt.concatenate((cr, cc, cc)) P = tt.exp(-c[None, :] * dx[:, None]) elif 'R' in vars(self): x = tt.reshape( tt.tile(x, (self.R.shape[0], 1)).T, (1, x.size * self.R.shape[0]))[0] dx = x[1:] - x[:-1] a = diag + tt.diag(self.R)[:, None] * (tt.sum(ar) + tt.sum(ac)) a = tt.reshape(a.T, (1, a.size))[0] U = tt.slinalg.kron(U, self.R) V = tt.slinalg.kron(V, tt.eye(self.R.shape[0])) c = tt.concatenate((cr, cc, cc)) P = tt.exp(-c[None, :] * dx[:, None]) P = tt.tile(P, (1, self.R.shape[0])) return a, U, V, P
def nlml(Y, hyp, X, X_sp, EyeM): # TODO allow for different pseudo inputs for each dimension # initialise the (before compilation) kernel function hyps = [hyp[:idims+1], hyp[idims+1]] kernel_func = partial(cov.Sum, hyps, self.covs) sf2 = hyp[idims]**2 sn2 = hyp[idims+1]**2 N = X.shape[0].astype(theano.config.floatX) ridge = 1e-6 Kmm = kernel_func(X_sp) + ridge*EyeM Kmn = kernel_func(X_sp, X) Lmm = cholesky(Kmm) rhs = tt.concatenate([EyeM, Kmn], axis=1) sol = solve_lower_triangular(Lmm, rhs) iKmm = solve_upper_triangular(Lmm.T, sol[:, :EyeM.shape[0]]) Lmn = sol[:, EyeM.shape[0]:] diagQnn = (Lmn**2).sum(0) # Gamma = diag(Knn - Qnn) + sn2*I Gamma = sf2 + sn2 - diagQnn Gamma_inv = 1.0/Gamma # these operations are done to avoid inverting Qnn+Gamma) sqrtGamma_inv = tt.sqrt(Gamma_inv) Lmn_ = Lmn*sqrtGamma_inv # Kmn_*Gamma^-.5 Yi = Y*(sqrtGamma_inv) # Gamma^-.5* Y # I + Lmn * Gamma^-1 * Lnm Bmm = tt.eye(Kmm.shape[0]) + (Lmn_).dot(Lmn_.T) Amm = cholesky(Bmm) LAmm = Lmm.dot(Amm) Kmn_dotYi = Kmn.dot(Yi*(sqrtGamma_inv)) rhs = tt.concatenate([EyeM, Kmn_dotYi[:, None]], axis=1) sol = solve_upper_triangular( LAmm.T, solve_lower_triangular(LAmm, rhs)) iBmm = sol[:, :-1] beta_sp = sol[:, -1] log_det_K_sp = tt.sum(tt.log(Gamma)) log_det_K_sp += 2*tt.sum(tt.log(tt.diag(Amm))) loss_sp = Yi.dot(Yi) - Kmn_dotYi.dot(beta_sp) loss_sp += log_det_K_sp + N*np.log(2*np.pi) loss_sp *= 0.5 return loss_sp, iKmm, Lmm, Amm, iBmm, beta_sp
def _create_theano_likelihood_graph_voltage(data, X, ind_X, n_X, n_inducing_X, approx='FITC'):#<-----New rho = tt.dvector('rho') ker_sigma = tt.dscalar('ker_sigma') sigma = tt.dscalar('sigma') time_V = theano.tensor.as_tensor_variable(X) inducing_time_V = theano.tensor.as_tensor_variable(ind_X) y = theano.tensor.as_tensor_variable(data) current = tt.dvector('current') cov_func = RbfKernel(rho, ker_sigma) sigma2 = tt.square(sigma) Kuu = cov_func(inducing_time_V) Kuf = cov_func(inducing_time_V, time_V) Luu = cholesky(stabilize(Kuu)) A = solve_lower(Luu, Kuf) Qffd = tt.sum(A * A, 0) if approx == 'FITC': Kffd = cov_func(time_V, diag=True) Lamd = tt.clip(Kffd - Qffd, 0.0, np.inf) + sigma2 trace = 0.0 elif approx == 'VFE': Lamd = tt.ones_like(Qffd) * sigma2 trace = ((1.0 / (2.0 * sigma2)) * (tt.sum(cov_func(time_V, diag=True)) - tt.sum(tt.sum(A * A, 0)))) else: # DTC Lamd = tt.ones_like(Qffd) * sigma2 trace = 0.0 A_l = A / Lamd L_B = cholesky(tt.eye(n_inducing_X) + tt.dot(A_l, tt.transpose(A))) r = y - current r_l = r / Lamd c = solve_lower(L_B, tt.dot(A, r_l)) constant = 0.5 * n_X * tt.log(2.0 * np.pi) logdet = 0.5 * tt.sum(tt.log(Lamd)) + tt.sum(tt.log(tt.diag(L_B))) quadratic = 0.5 * (tt.dot(r, r_l) - tt.dot(c, c)) ll = -1.0 * (constant + logdet + quadratic + trace) return theano.function([current,rho,ker_sigma,sigma],ll,on_unused_input='ignore')
def predict(self, mx, Sx=None, **kwargs): # by default, sample internal params (e.g. dropout masks) # at every evaluation kwargs['iid_per_eval'] = kwargs.get('iid_per_eval', True) kwargs['whiten_inputs'] = kwargs.get('whiten_inputs', True) kwargs['whiten_outputs'] = kwargs.get('whiten_outputs', True) kwargs['deterministic'] = kwargs.get('deterministic', False) if Sx is not None: # generate random samples from input (assuming gaussian # distributed inputs) # standard uniform samples (one sample per network sample) z_std = self.m_rng.normal((self.n_samples, self.D)) # scale and center particles Lx = tt.slinalg.cholesky(Sx) x = mx + z_std.dot(Lx.T) else: x = mx[None, :] if mx.ndim == 1 else mx # we are going to apply the saturation function # after whitening the outputs return_samples = kwargs.get('return_samples', True) kwargs['return_samples'] = True y, sn = super(NNPolicy, self).predict(x, None, **kwargs) if callable(self.sat_func): y = self.sat_func(y) if return_samples: return y, sn else: n = tt.cast(y.shape[0], dtype=theano.config.floatX) # empirical mean M = y.mean(axis=0) # empirical covariance deltay = y - M S = deltay.T.dot(deltay) / (n - 1) # noise S += tt.diag((sn**2).mean(axis=0)) # empirical input output covariance if Sx is not None: deltax = x - x.mean(0) C = deltax.T.dot(deltay) / (n - 1) else: C = tt.zeros((self.D, self.E)) return [M, S, C]
def get_model(self, lengthscale_trf, lengthscale_p_trf, sn_trf, sf_trf, MU_S, SIGMA_S_trf, MU, SIGMA_trf, U, b, X, y, MEAN_MAP, Q, D, D_cum_sum, layers, order, non_rec, N, M): X_inputs, SIGMA_inputs = self.update(layers, order, MU, SIGMA_trf, X, Q, D, D_cum_sum, N, non_rec) LL = 0 for i in range(0, layers + 1): EEPhi, EEPhiTPhi = self.get_EPhi( lengthscale_trf[D_cum_sum[i]:D_cum_sum[i + 1]], lengthscale_p_trf[D_cum_sum[i]:D_cum_sum[i + 1]], sf_trf[i], MU_S[:, D_cum_sum[i]:D_cum_sum[i + 1]], SIGMA_S_trf[:, D_cum_sum[i]:D_cum_sum[i + 1]], X_inputs[:, D_cum_sum[i]:D_cum_sum[i + 1]], SIGMA_inputs[:, D_cum_sum[i]:D_cum_sum[i + 1]], U[:, D_cum_sum[i]:D_cum_sum[i + 1]], b[:, i], N, M, i, D[i], order, non_rec) if i == layers: z = y[order:] SIGMA_trf_LL = 0 else: if layers > 1: z = MU[order:, i] - X.dot(MEAN_MAP) SIGMA_trf_LL = SIGMA_trf[order:, i] else: z = MU[order:] - X.dot(MEAN_MAP) SIGMA_trf_LL = SIGMA_trf[order:] zT_EEPhi = z.T.dot(EEPhi) opt_A_mean, cholSigInv = self.get_opt_A(sn_trf[i], EEPhiTPhi, zT_EEPhi) LL = LL - 0.5 * (N - M) * T.log(sn_trf[i]) - 0.5 * N * np.log( 2 * np.pi) - 0.5 * T.sum( 2 * T.log(T.diag(cholSigInv))) - 0.5 * T.sum( SIGMA_trf_LL) / sn_trf[i] - 0.5 * T.sum( z**2) / sn_trf[i] + 0.5 * T.sum( opt_A_mean.T * zT_EEPhi) / sn_trf[i] KL_S = 0.5 * (SIGMA_S_trf + MU_S**2 - T.log(SIGMA_S_trf) - 1).sum() KL_X = -0.5 * (T.log(2 * np.pi * SIGMA_trf) + 1).sum() + 0.5 * layers * order * (np.log( 2 * np.pi)) + 0.5 * (SIGMA_trf[1:order, ] + MU[1:order, ]**2).sum() KL = KL_S + KL_X return LL, KL
def get_model(self, lengthscale_trf, lengthscale_p_trf, sn_trf, sf_trf, S, MU, SIGMA_trf, U, b, X, Q, D, N, M): EPhi, EPhiTPhi = self.get_EPhi(lengthscale_trf, lengthscale_p_trf, sf_trf, S, MU, SIGMA_trf, U, b, N, M) XT_EPhi = X.T.dot(EPhi) opt_A_mean, cholSigInv = self.get_opt_A(sn_trf, EPhiTPhi, XT_EPhi) LL = -0.5 * (D * ( (N - M) * T.log(sn_trf) + N * np.log(2 * np.pi) + T.sum(2 * T.log(T.diag(cholSigInv)))) + T.sum(X**2) / sn_trf - T.sum(opt_A_mean.T * XT_EPhi) / sn_trf) KL_X = -0.5 * (T.log(2 * np.pi * SIGMA_trf) + 1).sum() + 0.5 * (np.log( 2 * np.pi)) + 0.5 * (SIGMA_trf + MU**2).sum() return LL, KL_X
def bpr_max_reg(self, pred_mat, y, y_pos): loss = 0.5 softmax_scores = self.softmax_neg(pred_mat.T).T loss_part = -T.log( T.sum(T.nnet.sigmoid(T.diag(pred_mat.T) - pred_mat) * softmax_scores, axis=0) + 1e-24) reg_part = loss * T.sum((pred_mat**2) * softmax_scores, axis=0) reg_part2 = (-self.regularization * (self.S[y_pos]**2).sum(axis=1) - self.regularization * (self.I[y_pos]**2).sum(axis=1) - self.regularization * (self.I1[y_pos]**2).sum(axis=1) - self.regularization * (self.I2[y_pos]**2).sum(axis=1) - self.regularization * (self.BI[y_pos]**2) - self.regularization * (self.BS[y_pos]**2) - self.regularization * (self.H[y_pos]**2)) return T.cast(T.mean(loss_part + reg_part - reg_part2), self.floatX)
def MMD_kenel_ZX(self, Xlabel, Kmn, Weight):
    Dn = self.Zlabel_T.shape[1]
    DDX = T.tile(Xlabel.T, (Dn, 1, 1))
    DDZ = T.tile(self.Zlabel_T.T, (Dn, 1, 1))
    tttt = DDZ[:, :, :, None] * DDX.transpose((1, 0, 2))[:, :, None, :]  # e.g. Dn x Dn x N_Z x N
    Hh = T.sum(T.sum(tttt * Kmn[None, None, :, :], -1), -1)
    Hh = Hh * Weight
    GH = T.tile(T.diag(Hh), (Dn, 1))
    # Up to here we have the D x D MMD distances; next, push them through an RBF kernel.
    new = T.exp(-(GH.T + GH - 2 * Hh) / (2 * self.gamma**2))
    KK = tttt * new[:, :, None, None]
    # KK1 = T.where(T.eq(KK, 0), 1, KK)
    # (This would be for Z: X has only one nonzero label per point, but Z has several.
    #  We want the weighted product over all of them, and any zero would wipe the
    #  product out, so zeros would be replaced by ones here.)
    KK2 = T.sum(T.sum(KK, 0), 0)
    Kmmd_rbf = KK2 * Kmn  # multiply by the RBF kernel
    return Kmmd_rbf
def low_rank_matrix_approximation_theano(A, k, norm_ord, minstepsize=1e-9): manifold, solver = _bootstrap_problem(A, k, minstepsize) U, S, V = [T.matrix(sym) for sym in ['U', 'S', 'V']] if norm_ord == 'fro': cost = T.sum((U.dot(S).dot(V) - A)**2) elif norm_ord == 'spectral': cost = (U.dot(S).dot(V) - A).norm(2) elif norm_ord == 'abs': cost = (U.dot(S).dot(V) - A).norm(1) else: mat = U.dot(S).dot(V) - A cost = T.diag(mat.T.dot(mat)).norm( L=norm_ord ) #T.sum(T.nlinalg.svd(U.dot(S).dot(V) - A, full_matrices=False)[1]) problem = Problem(man=manifold, theano_cost=cost, theano_arg=[U, S, V]) return solver.solve(problem)
def gSin(m, v, i=None, e=None):
    D = m.shape[0]
    if i is None:
        i = tt.arange(D)
    if e is None:
        e = tt.ones((D, ))
    elif isinstance(e, list):
        e = tt.as_tensor_variable(np.array(e)).flatten()
    elif isinstance(e, np.ndarray):
        e = tt.as_tensor_variable(e).flatten()

    # compute the output mean
    mi = m[i]
    vi = v[i, :][:, i]
    vii = v[i, i]
    exp_vii_h = tt.exp(-vii / 2)
    M = exp_vii_h * tt.sin(mi)

    # output covariance
    vii_c = vii.dimshuffle(0, 'x')
    vii_r = vii.dimshuffle('x', 0)
    lq = -0.5 * (vii_c + vii_r)
    q = tt.exp(lq)
    exp_lq_p_vi = tt.exp(lq + vi)
    exp_lq_m_vi = tt.exp(lq - vi)
    mi_c = mi.dimshuffle(0, 'x')
    mi_r = mi.dimshuffle('x', 0)
    U1 = (exp_lq_p_vi - q) * (tt.cos(mi_c - mi_r))
    U2 = (exp_lq_m_vi - q) * (tt.cos(mi_c + mi_r))
    V = 0.5 * (U1 - U2)

    # inv input covariance dot input output covariance
    C = tt.diag(exp_vii_h * tt.cos(mi))

    # account for the effect of scaling the output
    M = e * M
    V = tt.outer(e, e) * V
    C = e * C
    retvars = [M, V, C]
    return retvars