def get_opt_A(self, sn_trf, EPhiTPhi, XT_EPhi, K_MM):
    # Invert Sig^-1 = EPhiTPhi + sn*I and K_MM via their Cholesky factors.
    cholSigInv = sT.cholesky(EPhiTPhi + sn_trf * T.identity_like(K_MM))
    cholK_MM = sT.cholesky(K_MM + 1e-6 * T.identity_like(K_MM))  # jitter for stability
    invCholSigInv = sT.matrix_inverse(cholSigInv)
    invCholK_MM = sT.matrix_inverse(cholK_MM)
    InvSig = invCholSigInv.T.dot(invCholSigInv)
    InvK_MM = invCholK_MM.T.dot(invCholK_MM)
    Sig_EPhiT_X = InvSig.dot(XT_EPhi.T)
    return Sig_EPhiT_X, cholSigInv, cholK_MM, InvK_MM
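The get_opt_A variants in this section all invert a symmetric positive-definite matrix through its Cholesky factor rather than directly: if A = L L^T, then inv(A) = inv(L)^T inv(L). A minimal NumPy sketch of that identity (a standalone check, not part of any class here):

import numpy as np

# A small SPD matrix of the same form as EPhiTPhi + sn * I above.
rng = np.random.default_rng(0)
Phi = rng.standard_normal((50, 5))
A = Phi.T @ Phi + 0.1 * np.eye(5)

L = np.linalg.cholesky(A)   # lower triangular, A = L @ L.T
L_inv = np.linalg.inv(L)
A_inv = L_inv.T @ L_inv     # inv(A) = inv(L).T @ inv(L)

assert np.allclose(A_inv, np.linalg.inv(A))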
def __init__(self, mu, sigma, random_state=None):
    super(MultivariateNormal, self).__init__(mu=mu, sigma=sigma)
    # XXX: The SDP-ness of sigma should be checked upon changes

    # ndim
    self.ndim_ = self.mu.shape[0]
    self.make_(self.ndim_, "ndim_func_", args=[])

    # pdf
    L = linalg.cholesky(self.sigma)
    sigma_det = linalg.det(self.sigma)  # XXX: compute from L instead
    sigma_inv = linalg.matrix_inverse(self.sigma)  # XXX: idem
    self.pdf_ = (
        (1. / T.sqrt((2. * np.pi) ** self.ndim_ * T.abs_(sigma_det))) *
        T.exp(-0.5 * T.sum(T.mul(T.dot(self.X - self.mu, sigma_inv),
                                 self.X - self.mu),
                           axis=1))).ravel()
    self.make_(self.pdf_, "pdf")

    # -log pdf
    self.nll_ = -T.log(self.pdf_)  # XXX: for sure this can be better
    self.make_(self.nll_, "nll")

    # self.rvs_
    self.make_(T.dot(L, self.X.T).T + self.mu, "rvs_func_")
def get_opt_A(self, sn_trf, EPhiTPhi, XT_EPhi):
    SigInv = EPhiTPhi + (sn_trf + 1e-6) * T.identity_like(EPhiTPhi)
    cholSigInv = sT.cholesky(SigInv)
    invCholSigInv = sT.matrix_inverse(cholSigInv)
    InvSig = invCholSigInv.T.dot(invCholSigInv)
    Sig_EPhiT_X = InvSig.dot(XT_EPhi.T)
    return Sig_EPhiT_X, cholSigInv
def __init__(self, mu, sigma, random_state=None):
    super(MultivariateNormal, self).__init__(mu=mu, sigma=sigma,
                                             random_state=random_state,
                                             optimizer=None)
    # XXX: The SDP-ness of sigma should be checked upon changes

    # ndim
    self.ndim_ = self.mu.shape[0]
    self.make_(self.ndim_, "ndim_func_", args=[])

    # pdf
    L = linalg.cholesky(self.sigma)
    sigma_det = linalg.det(self.sigma)  # XXX: compute from L instead
    sigma_inv = linalg.matrix_inverse(self.sigma)  # XXX: idem
    self.pdf_ = (
        (1. / T.sqrt((2. * np.pi) ** self.ndim_ * T.abs_(sigma_det))) *
        T.exp(-0.5 * T.sum(T.mul(T.dot(self.X - self.mu, sigma_inv),
                                 self.X - self.mu),
                           axis=1))).ravel()
    self.make_(self.pdf_, "pdf")

    # -log pdf
    self.nnlf_ = -T.log(self.pdf_)  # XXX: for sure this can be better
    self.make_(self.nnlf_, "nnlf")

    # self.rvs_
    self.make_(T.dot(L, self.X.T).T + self.mu, "rvs_func_")
def get_opt_A(self, tau, EPhiTPhi, YT_EPhi):
    SigInv = EPhiTPhi + (tau**-1 + 1e-4) * T.identity_like(EPhiTPhi)
    cholTauSigInv = tau**0.5 * sT.cholesky(SigInv)
    invCholTauSigInv = sT.matrix_inverse(cholTauSigInv)
    tauInvSig = invCholTauSigInv.T.dot(invCholTauSigInv)
    Sig_EPhiT_Y = tau * tauInvSig.dot(YT_EPhi.T)
    return Sig_EPhiT_Y, tauInvSig, cholTauSigInv
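This variant folds the noise precision tau into the factor itself: scaling the Cholesky factor by sqrt(tau) scales the reconstructed inverse by 1/tau, so tauInvSig equals inv(SigInv)/tau and Sig_EPhiT_Y recovers inv(SigInv) @ YT_EPhi.T. A small NumPy check of that scaling (variable names mirror the function above):

import numpy as np

rng = np.random.default_rng(1)
Phi = rng.standard_normal((40, 4))
EPhiTPhi = Phi.T @ Phi
tau = 2.5

SigInv = EPhiTPhi + (tau**-1 + 1e-4) * np.eye(4)
cholTauSigInv = tau**0.5 * np.linalg.cholesky(SigInv)
invChol = np.linalg.inv(cholTauSigInv)
tauInvSig = invChol.T @ invChol

# Scaling the factor by sqrt(tau) scales the reconstructed inverse by 1/tau.
assert np.allclose(tauInvSig, np.linalg.inv(SigInv) / tau)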
def get_model(self, X, Y, x_test):
    '''
    Gaussian Process Regression model.

    Reference: C.E. Rasmussen, "Gaussian Processes for Machine Learning",
    MIT Press 2006

    Args:
        X: tensor matrix, training data
        Y: tensor matrix, training target
        x_test: tensor matrix, testing data

    Returns:
        K: prior cov matrix
        Ks: prior joint cov matrix
        Kss: prior cov matrix for testing data

        Posterior distribution:
            alpha: alpha = inv(K)*(mu-m)
            sW: vector containing diagonal of sqrt(W)
            L: L = chol(sW*K*sW+eye(n))

        y_test_mu: predictive mean
        y_test_var: predictive variance
        fs2: predictive latent variance

    Note: the cov matrix inverse is computed through Cholesky factorization
    https://makarandtapaswi.wordpress.com/2011/07/08/cholesky-decomposition-for-matrix-inversion/
    '''
    # Compute GP prior distribution: mean and covariance matrices (eq 2.13, 2.14)
    K = self.covFunc(X, X, 'K')  # prior cov
    #m = T.mean(Y)*T.ones_like(Y)  # prior mean
    m = self.mean * T.ones_like(Y)  # prior mean

    # Compute GP joint prior distribution between training and test (eq 2.18)
    Ks = self.covFunc(X, x_test, 'Ks')
    # Note: Kss is the self-test covariance matrix (mode='self_test').
    Kss = self.covFunc(x_test, x_test, 'Kss', mode='self_test')

    # Compute posterior distribution with noise: L, alpha, sW, and log_likelihood.
    sn2 = T.exp(2 * self.sigma_n)  # noise variance of likGauss
    L = sT.cholesky(K / sn2 + T.identity_like(K))
    sl = sn2
    alpha = T.dot(sT.matrix_inverse(L.T),
                  T.dot(sT.matrix_inverse(L), (Y - m))) / sl
    sW = T.ones_like(T.sum(K, axis=1)).reshape((K.shape[0], 1)) / T.sqrt(sl)
    log_likelihood = T.sum(-0.5 * (T.dot((Y - m).T, alpha)) -
                           T.sum(T.log(T.diag(L))) -
                           X.shape[0] / 2 * T.log(2. * np.pi * sl))

    # Compute predictive distribution using the computed posterior distribution.
    fmu = m + T.dot(Ks.T, alpha)  # predictive mean, eq 2.25
    V = T.dot(sT.matrix_inverse(L),
              T.extra_ops.repeat(sW, x_test.shape[0], axis=1) * Ks)
    fs2 = Kss - (T.sum(V * V, axis=0)).reshape((1, V.shape[1])).T  # predictive variance, eq 2.26
    fs2 = T.maximum(fs2, 0)  # remove negative variance noise
    #fs2 = T.sum(fs2, axis=1)  # in case x has multiple dimensions

    y_test_mu = fmu
    y_test_var = fs2 + sn2

    return K, Ks, Kss, y_test_mu, y_test_var, log_likelihood, L, alpha, V, fs2, sW
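A plain-NumPy companion to the graph above may clarify the triangular-solve route that the matrix_inverse(L) calls implement. This sketch follows eqs. 2.25-2.26 of Rasmussen & Williams; gp_predict and its arguments (kss as the diagonal of Kss, mean as the scalar prior mean) are illustrative names, not part of the class:

import numpy as np
from scipy.linalg import cholesky, solve_triangular

def gp_predict(K, Ks, kss, y, mean, sn2):
    # K + sn2*I = sn2 * L @ L.T with L = chol(K/sn2 + I), as above.
    n = K.shape[0]
    L = cholesky(K / sn2 + np.eye(n), lower=True)
    # alpha = inv(K + sn2*I) @ (y - mean), via two triangular solves
    alpha = solve_triangular(L.T, solve_triangular(L, y - mean, lower=True)) / sn2
    mu = mean + Ks.T @ alpha                           # eq. 2.25
    V = solve_triangular(L, Ks, lower=True) / np.sqrt(sn2)
    fs2 = np.maximum(kss - np.sum(V**2, axis=0), 0.0)  # eq. 2.26, clipped at 0
    return mu, fs2 + sn2                               # noisy predictive variance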
def build_theano_models(self, algo, algo_params):
    epsilon = 1e-6
    kl = lambda mu, sig: sig + mu**2 - TT.log(sig)
    X, y = TT.dmatrices('X', 'y')
    params = TT.dvector('params')
    a, b, c, l_F, F, l_FC, FC = self.unpack_params(params)
    sig2_n, sig_f = TT.exp(2 * a), TT.exp(b)
    # Random-feature map: Phi = sig_f * sqrt(2/M) * [cos(FF), sin(FF)]
    l_FF = TT.dot(X, l_F) + l_FC
    FF = TT.concatenate((l_FF, TT.dot(X, F) + FC), 1)
    Phi = TT.concatenate((TT.cos(FF), TT.sin(FF)), 1)
    Phi = sig_f * TT.sqrt(2. / self.M) * Phi
    noise = TT.log(1 + TT.exp(c))
    # Posterior weights via the Cholesky factor of A = Phi^T Phi + (sig2_n + eps) * I
    PhiTPhi = TT.dot(Phi.T, Phi)
    A = PhiTPhi + (sig2_n + epsilon) * TT.identity_like(PhiTPhi)
    L = Tlin.cholesky(A)
    Li = Tlin.matrix_inverse(L)
    PhiTy = Phi.T.dot(y)
    beta = TT.dot(Li, PhiTy)
    alpha = TT.dot(Li.T, beta)
    mu_f = TT.dot(Phi, alpha)
    var_f = (TT.dot(Phi, Li.T)**2).sum(1)[:, None]
    dsp = noise * (var_f + 1)
    mu_l = TT.sum(TT.mean(l_F, axis=1))
    sig_l = TT.sum(TT.std(l_F, axis=1))
    mu_w = TT.sum(TT.mean(F, axis=1))
    sig_w = TT.sum(TT.std(F, axis=1))
    # Gauss-Hermite quadrature for the expected negative log-likelihood
    hermgauss = np.polynomial.hermite.hermgauss(30)
    herm_x = Ts(hermgauss[0])[None, None, :]
    herm_w = Ts(hermgauss[1] / np.sqrt(np.pi))[None, None, :]
    herm_f = TT.sqrt(2 * var_f[:, :, None]) * herm_x + mu_f[:, :, None]
    nlk = (0.5 * herm_f**2. - y[:, :, None] * herm_f) / dsp[:, :, None] + 0.5 * (
        TT.log(2 * np.pi * dsp[:, :, None]) + y[:, :, None]**2 / dsp[:, :, None])
    enll = herm_w * nlk
    nlml = 2 * TT.log(TT.diagonal(L)).sum() + 2 * enll.sum() + 1. / sig2_n * (
        (y**2).sum() - (beta**2).sum()) + 2 * (X.shape[0] - self.M) * a
    penalty = (kl(mu_w, sig_w) * self.M + kl(mu_l, sig_l) * self.S) / (self.S + self.M)
    cost = (nlml + penalty) / X.shape[0]
    grads = TT.grad(cost, params)
    updates = getattr(OPT, algo)(self.params, grads, **algo_params)
    updates = getattr(OPT, 'apply_nesterov_momentum')(updates, momentum=0.9)
    train_inputs = [X, y]
    train_outputs = [cost, alpha, Li]
    self.train_func = Tf(train_inputs, train_outputs,
                         givens=[(params, self.params)])
    self.train_iter_func = Tf(train_inputs, train_outputs,
                              givens=[(params, self.params)], updates=updates)
    # Prediction graph: reuses alpha and Li computed at training time.
    Xs, Li, alpha = TT.dmatrices('Xs', 'Li', 'alpha')
    l_FFs = TT.dot(Xs, l_F) + l_FC
    FFs = TT.concatenate((l_FFs, TT.dot(Xs, F) + FC), 1)
    Phis = TT.concatenate((TT.cos(FFs), TT.sin(FFs)), 1)
    Phis = sig_f * TT.sqrt(2. / self.M) * Phis
    mu_pred = TT.dot(Phis, alpha)
    std_pred = (noise * (1 + (TT.dot(Phis, Li.T)**2).sum(1)))**0.5
    pred_inputs = [Xs, alpha, Li]
    pred_outputs = [mu_pred, std_pred]
    self.pred_func = Tf(pred_inputs, pred_outputs,
                        givens=[(params, self.params)])
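The herm_x/herm_w block above is Gauss-Hermite quadrature: for f ~ N(mu, var), E[g(f)] is approximated by sum_i (w_i / sqrt(pi)) * g(sqrt(2*var) * x_i + mu), where x_i, w_i = hermgauss(n). A tiny NumPy check of this rule on a function with a known expectation:

import numpy as np

mu, var = 0.3, 1.7
x, w = np.polynomial.hermite.hermgauss(30)
f = np.sqrt(2.0 * var) * x + mu              # quadrature nodes mapped to N(mu, var)
approx = np.sum(w / np.sqrt(np.pi) * f**2)   # E[f^2] under N(mu, var)
assert np.isclose(approx, var + mu**2)       # analytic second moment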
def KLD_U(self, m, L_scaled, Kmm, KmmInv):
    # KL divergence between N(u|m, S) and N(u|0, Kmm), with S = L * L.T
    # (pass in the Cholesky factor as L_scaled).
    M = m.shape[0]
    D = m.shape[1]
    #KmmInv = sT.matrix_inverse(Kmm)
    KL_U = D * (T.sum(KmmInv.T * L_scaled.dot(L_scaled.T)) - M
                - 2.0 * T.sum(T.log(T.diag(L_scaled)))
                + 2.0 * T.sum(T.log(T.diag(sT.cholesky(Kmm)))))
    KL_U += T.sum(T.dot(KmmInv, m) * m)
    return 0.5 * KL_U
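KLD_U is the closed-form Gaussian KL divergence KL(N(m_d, S) || N(0, Kmm)) summed over the D columns of m: 0.5 * [D * (tr(Kmm^-1 S) - M - log det S + log det Kmm) + sum_d m_d^T Kmm^-1 m_d], with the log-determinants read off the Cholesky diagonals. A NumPy sketch checking that shortcut against slogdet (all names here are illustrative):

import numpy as np

rng = np.random.default_rng(2)
M, D = 4, 3
A = rng.standard_normal((M, M))
Kmm = A @ A.T + M * np.eye(M)                     # prior covariance
L_scaled = np.tril(rng.standard_normal((M, M)), -1) + np.diag(np.exp(rng.standard_normal(M)))
S = L_scaled @ L_scaled.T                         # posterior covariance
m = rng.standard_normal((M, D))                   # posterior means, one column per output
KmmInv = np.linalg.inv(Kmm)

# Cholesky-diagonal shortcut, as in KLD_U:
kl = 0.5 * (D * (np.trace(KmmInv @ S) - M
                 - 2.0 * np.log(np.diag(L_scaled)).sum()
                 + 2.0 * np.log(np.diag(np.linalg.cholesky(Kmm))).sum())
            + np.sum((KmmInv @ m) * m))

# Same KL with the log-determinants computed directly:
kl_ref = 0.5 * (D * (np.trace(KmmInv @ S) - M
                     - np.linalg.slogdet(S)[1]
                     + np.linalg.slogdet(Kmm)[1])
                + np.sum((KmmInv @ m) * m))
assert np.allclose(kl, kl_ref)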
def compile_theano_funcs(self, opt_algo, opt_params, dropout):
    self.compiled_funcs = {}
    # Compile train & optimization functions
    eps = 1e-5
    params = Tt.vector('params')
    X, Y = Tt.matrix('X'), Tt.matrix('Y')
    sig2, F, M, V = self.feature_maps(X, params)
    EPhi = F[-1]
    EPhiPhiT = Tt.dot(EPhi, Tt.transpose(EPhi))
    A = EPhiPhiT + (sig2 + eps) * Tt.identity_like(EPhiPhiT)
    L = Tlin.cholesky(A)
    Linv = Tlin.matrix_inverse(L)
    YPhiT = Tt.dot(Y, Tt.transpose(EPhi))
    beta = Tt.dot(YPhiT, Tt.transpose(Linv))
    alpha = Tt.dot(beta, Linv)
    mu_F = Tt.dot(alpha, EPhi)
    GOF = .5 / sig2 * Tt.sum(Tt.sum(Tt.dot(Y, (Y - mu_F).T)))
    REG = Tt.sum(Tt.log(Tt.diagonal(L))) + (self.N - self.D[-2]) / 2 * Tt.log(sig2)
    REG *= self.D[-1]
    KL = 0
    for h in range(self.H):
        KL += Tt.sum(Tt.sum(M[h]**2) + Tt.sum(V[h] - Tt.log(V[h] + eps)))
        KL -= self.D[h + 1] * self.D[h + 2] // 2
    obj = debug('obj', GOF + REG + KL)
    self.compiled_funcs['debug'] = Tf([X, Y], [obj],
                                      givens=[(params, self.params)])
    grads = Tt.grad(obj, params)
    updates = {self.params: grads}
    updates = getattr(Optimizer, opt_algo)(updates, **opt_params)
    updates = getattr(Optimizer, 'nesterov')(updates, momentum=0.9)
    train_inputs = [X, Y]
    train_outputs = [obj, alpha, Linv, mu_F]
    self.compiled_funcs['opt'] = Tf(train_inputs, train_outputs,
                                    givens=[(params, self.params)],
                                    updates=updates)
    self.compiled_funcs['train'] = Tf(train_inputs, train_outputs,
                                      givens=[(params, self.params)])
    # Compile predict function
    Linv, alpha = Tt.matrix('Linv'), Tt.matrix('alpha')
    Xs = Tt.matrix('Xs')
    sig2, Fs, _, _ = self.feature_maps(Xs, params)
    EPhis = Fs[-1]
    mu_Fs = Tt.dot(alpha, EPhis)
    std_Fs = ((sig2 * (1 + (Tt.dot(Linv, EPhis)**2).sum(0)))**0.5)[:, None]
    pred_inputs = [Xs, alpha, Linv]
    pred_outputs = [mu_Fs, std_Fs]
    self.compiled_funcs['pred'] = Tf(pred_inputs, pred_outputs,
                                     givens=[(params, self.params)])
def _build_graph(self):
    """Sets up the Gaussian process's tensor variables."""
    X = self.X
    Y = self.Y
    x = self.x
    reg = self.reg

    if self._normalize_y:
        Y_mean = T.mean(Y, axis=0)
        Y_variance = T.std(Y, axis=0)  # note: holds the std, used to scale Y
        Y = (Y - Y_mean) / Y_variance

    # Kernel functions.
    K_ss = self._kernel(x, x)
    K_s = self._kernel(x, X)
    K = self._kernel(X, X) + self._sigma_n**2 * T.eye(X.shape[0])

    # Guarantee positive definiteness.
    K = 0.5 * (K + K.T) + reg * T.eye(K.shape[0])

    # Mean and variance functions.
    K_inv = sT.matrix_inverse(K)
    mu = T.dot(K_s, T.dot(K_inv, self.Y))  # non-normalized Y for scale
    var = K_ss - T.dot(K_s, T.dot(K_inv, K_s.T))

    # Compute the standard deviation.
    L = sT.cholesky(K)
    L_k = T.slinalg.solve_lower_triangular(L, K_s.T)
    std = T.sqrt(T.diag(K_ss) - T.sum(L_k**2, axis=0)).reshape((-1, 1))

    # Compute the log likelihood.
    log_likelihood_dims = -0.5 * T.dot(Y.T, T.dot(K_inv, Y)).sum(axis=0)
    log_likelihood_dims -= T.log(T.diag(L)).sum()
    log_likelihood_dims -= L.shape[0] / 2 * T.log(2 * np.pi)
    log_likelihood = log_likelihood_dims.sum(axis=-1)

    self._mu = mu
    self._var = var
    self._std = std
    self._log_likelihood = log_likelihood
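The std computation above avoids the explicit K_inv by a lower-triangular solve: with L = chol(K), diag(K_s K^-1 K_s^T) equals the column-wise sum of (L^-1 K_s^T)**2. A NumPy sketch comparing the two routes on a toy RBF kernel (the kernel and shapes are illustrative):

import numpy as np
from scipy.linalg import cholesky, solve_triangular

rng = np.random.default_rng(3)
X = rng.standard_normal((30, 2))
Xs = rng.standard_normal((5, 2))
rbf = lambda a, b: np.exp(-0.5 * ((a[:, None, :] - b[None, :, :])**2).sum(-1))
K = rbf(X, X) + 0.1 * np.eye(30)   # kernel + noise/jitter, as in _build_graph
K_s = rbf(Xs, X)
K_ss = rbf(Xs, Xs)

L = cholesky(K, lower=True)
L_k = solve_triangular(L, K_s.T, lower=True)   # L^{-1} K_s^T
std = np.sqrt(np.diag(K_ss) - np.sum(L_k**2, axis=0))

# The same standard deviation through the explicit inverse used for `var`:
var = K_ss - K_s @ np.linalg.inv(K) @ K_s.T
assert np.allclose(std, np.sqrt(np.diag(var)))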
def __init__(self, mu, sigma):
    """Constructor.

    Parameters
    ----------
    * `mu` [1d array]:
        The means.

    * `sigma` [2d array]:
        The covariance matrix.
    """
    super(MultivariateNormal, self).__init__(mu=mu, sigma=sigma)
    # XXX: The SDP-ness of sigma should be checked upon changes

    # ndim
    self.ndim_ = self.mu.shape[0]
    self._make(self.ndim_, "ndim_func_", args=[])

    # pdf
    L = linalg.cholesky(self.sigma)
    sigma_det = linalg.det(self.sigma)  # XXX: compute from L instead
    sigma_inv = linalg.matrix_inverse(self.sigma)  # XXX: idem
    self.pdf_ = (
        (1. / T.sqrt((2. * np.pi) ** self.ndim_ * T.abs_(sigma_det))) *
        T.exp(-0.5 * T.sum(T.mul(T.dot(self.X - self.mu, sigma_inv),
                                 self.X - self.mu),
                           axis=1))).ravel()
    self._make(self.pdf_, "pdf")

    # -log pdf
    self.nll_ = -T.log(self.pdf_)  # XXX: for sure this can be better
    self._make(self.nll_, "nll")

    # self.rvs_
    self._make(T.dot(L, self.X.T).T + self.mu, "rvs_func_")
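Since the pdf_ expression appears in several variants in this section, a quick NumPy check of the same formula against scipy.stats may be worth keeping alongside (a standalone sketch, not part of the class):

import numpy as np
from scipy.stats import multivariate_normal

rng = np.random.default_rng(4)
mu = np.array([0.5, -1.0])
A = rng.standard_normal((2, 2))
sigma = A @ A.T + np.eye(2)        # an SPD covariance
X = rng.standard_normal((10, 2))

sigma_det = np.linalg.det(sigma)
sigma_inv = np.linalg.inv(sigma)
# (2*pi)^(-k/2) * det(sigma)^(-1/2) * exp(-0.5 * (x-mu)^T sigma^-1 (x-mu)), k = 2
pdf = (1. / np.sqrt((2. * np.pi)**2 * abs(sigma_det)) *
       np.exp(-0.5 * np.sum((X - mu) @ sigma_inv * (X - mu), axis=1)))

assert np.allclose(pdf, multivariate_normal(mu, sigma).pdf(X))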
def __init__(self, params, sx2=1, linear_model=False, samples=20, use_hat=False):
    ker, self.samples, self.params, self.KmmInv = kernel(), samples, params, {}
    self.use_hat = use_hat

    model_file_name = 'model' + ('_hat' if use_hat else '') \
        + ('_linear' if linear_model else '') + '.save'

    try:
        print 'Trying to load model...'
        with open(model_file_name, 'rb') as file_handle:
            obj = cPickle.load(file_handle)
            self.f, self.g, self.f_Kmm, self.f_KmmInv, self.dKmm_d = obj
            self.update_KmmInv_cache()
            print 'Loaded!'
        return
    except:
        print 'Failed. Creating a new model...'

    Y, Z, m, ls, mu, lL, eps_MK, eps_NQ, eps_NK, KmmInv = T.dmatrices(
        'Y', 'Z', 'm', 'ls', 'mu', 'lL', 'eps_MK', 'eps_NQ', 'eps_NK', 'KmmInv')
    lhyp = T.dvector('lhyp')
    (M, K), N, Q = mu.shape, m.shape[0], Z.shape[1]
    s, sl2, sf2, l = T.exp(ls), T.exp(lhyp[0]), T.exp(lhyp[1]), T.exp(lhyp[2:2 + Q])
    L = T.tril(lL - T.diag(T.diag(lL)) + T.diag(T.exp(T.diag(lL))))

    print 'Setting up cache...'
    Kmm = ker.RBF(sf2, l, Z) if not linear_model else ker.LIN(sl2, Z)
    KmmInv_cache = sT.matrix_inverse(Kmm)
    self.f_Kmm = theano.function([Z, lhyp], Kmm, name='Kmm')
    self.f_KmmInv = theano.function([Z, lhyp], KmmInv_cache, name='KmmInv_cache')
    self.update_KmmInv_cache()
    self.dKmm_d = {
        'Z': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), Z),
                             name='dKmm_dZ'),
        'lhyp': theano.function([Z, lhyp], T.jacobian(Kmm.flatten(), lhyp),
                                name='dKmm_dlhyp')
    }

    print 'Setting up model...'
    if not self.use_hat:
        mu_scaled, L_scaled = sf2**0.5 * mu, sf2**0.5 * L
        X = m + s * eps_NQ
        U = mu_scaled + L_scaled.dot(eps_MK)
        Kmn = ker.RBF(sf2, l, Z, X) if not linear_model else ker.LIN(sl2, Z, X)
        Knn = ker.RBFnn(sf2, l, X) if not linear_model else ker.LINnn(sl2, X)
        A = KmmInv.dot(Kmn)
        B = Knn - T.sum(Kmn * KmmInv.dot(Kmn), 0)
        F = A.T.dot(U) + T.maximum(B, 1e-16)[:, None]**0.5 * eps_NK
        F = T.concatenate((T.zeros((N, 1)), F), axis=1)
        S = T.nnet.softmax(F)
        LS = T.sum(T.log(T.maximum(T.sum(Y * S, 1), 1e-16)))
        if not linear_model:
            KL_U = -0.5 * (T.sum(KmmInv.T * T.sum(mu_scaled[:, None, :] * mu_scaled[None, :, :], 2))
                           + K * (T.sum(KmmInv.T * L_scaled.dot(L_scaled.T)) - M
                                  - 2.0 * T.sum(T.log(T.diag(L_scaled)))
                                  + 2.0 * T.sum(T.log(T.diag(sT.cholesky(Kmm))))))
        else:
            KL_U = 0
        #KL_U = -0.5 * T.sum(T.sum(mu_scaled * KmmInv.dot(mu_scaled), 0) + T.sum(KmmInv * L_scaled.dot(L_scaled.T)) - M
        #    - 2.0*T.sum(T.log(T.diag(L_scaled))) + 2.0*T.sum(T.log(T.diag(sT.cholesky(Kmm))))) if not linear_model else 0
    else:
        # mu_scaled, L_scaled = mu / sf2**0.5, L / sf2**0.5
        mu_scaled, L_scaled = mu / sf2, L / sf2
        X = m + s * eps_NQ
        U = mu_scaled + L_scaled.dot(eps_MK)
        Kmn = ker.RBF(sf2, l, Z, X) if not linear_model else ker.LIN(sl2, Z, X)
        Knn = ker.RBFnn(sf2, l, X) if not linear_model else ker.LINnn(sl2, X)
        B = Knn - T.sum(Kmn * KmmInv.dot(Kmn), 0)
        F = Kmn.T.dot(U) + T.maximum(B, 1e-16)[:, None]**0.5 * eps_NK
        F = T.concatenate((T.zeros((N, 1)), F), axis=1)
        S = T.nnet.softmax(F)
        LS = T.sum(T.log(T.maximum(T.sum(Y * S, 1), 1e-16)))
        if not linear_model:
            KL_U = -0.5 * (T.sum(Kmm.T * T.sum(mu_scaled[:, None, :] * mu_scaled[None, :, :], 2))
                           + K * (T.sum(Kmm.T * L_scaled.dot(L_scaled.T)) - M
                                  - 2.0 * T.sum(T.log(T.diag(L_scaled)))
                                  - 2.0 * T.sum(T.log(T.diag(sT.cholesky(Kmm))))))
        else:
            KL_U = 0

    KL_X_all = -0.5 * T.sum((m**2.0 + s**2.0) / sx2 - 1.0 - 2.0 * ls + T.log(sx2), 1)
    KL_X = T.sum(KL_X_all)

    print 'Compiling...'
    inputs = {'Y': Y, 'Z': Z, 'm': m, 'ls': ls, 'mu': mu, 'lL': lL,
              'lhyp': lhyp, 'KmmInv': KmmInv,
              'eps_MK': eps_MK, 'eps_NQ': eps_NQ, 'eps_NK': eps_NK}
    # Zero-valued sum over all inputs: works around a bug with derivatives
    # w.r.t. inputs that are otherwise not in the graph.
    z = 0.0 * sum([T.sum(v) for v in inputs.values()])
    f = zip(['X', 'U', 'S', 'LS', 'KL_U', 'KL_X', 'KL_X_all'],
            [X, U, S, LS, KL_U, KL_X, KL_X_all])
    self.f = {n: theano.function(inputs.values(), f + z, name=n,
                                 on_unused_input='ignore') for n, f in f}
    g = zip(['LS', 'KL_U', 'KL_X'], [LS, KL_U, KL_X])
    wrt = {'Z': Z, 'm': m, 'ls': ls, 'mu': mu, 'lL': lL, 'lhyp': lhyp,
           'KmmInv': KmmInv}
    self.g = {vn: {gn: theano.function(inputs.values(), T.grad(gv + z, vv),
                                       name='d' + gn + '_d' + vn,
                                       on_unused_input='ignore')
                   for gn, gv in g}
              for vn, vv in wrt.iteritems()}

    with open(model_file_name, 'wb') as file_handle:
        print 'Saving model...'
        sys.setrecursionlimit(2000)
        cPickle.dump([self.f, self.g, self.f_Kmm, self.f_KmmInv, self.dKmm_d],
                     file_handle, protocol=cPickle.HIGHEST_PROTOCOL)
def __init__(self, params, correct, samples=20, batch_size=None):
    ker = kernel()
    self.samples = samples
    self.params = params
    self.batch_size = batch_size

    # File used to persist the model
    model_file_name = 'model2' + '.save'

    # Load a previously built model if one exists
    try:
        print('Trying to load model...')
        with open(model_file_name, 'rb') as file_handle:
            obj = pickle.load(file_handle)
            self.f, self.g, self.ES_US = obj
            print('Loaded!')
        return
    except:
        print('Failed. Creating a new model...')

    X, Y, X_test, m, S_b, Z, eps_NQ, eps_M = \
        T.dmatrices('X', 'Y', 'X_test', 'm', 'S_b', 'Z', 'eps_NQ', 'eps_M')
    mu, Sigma = T.dmatrices('mu', 'Sigma')
    lhyp = T.dvector('lhyp')
    ls = T.dvector('ls')

    N, Q = m.shape
    M = Z.shape[0]
    D = X.shape[1]

    # Constrain variables to positive values via exp
    beta = T.exp(ls[0])
    #beta = T.exp(lhyp[0])
    sf2, l = T.exp(lhyp[0]), T.exp(lhyp[1:1 + Q])
    S = T.exp(S_b)
    Xtilda = m + S * eps_NQ

    print('Setting up cache...')
    Kmm = ker.RBF(sf2, l, Z)
    KmmInv = sT.matrix_inverse(Kmm)
    #KmmDet = theano.sandbox.linalg.det(Kmm)

    from theano.tensor.shared_randomstreams import RandomStreams
    srng = RandomStreams(seed=234)
    rv_u = srng.normal((2, N, Q))
    rv_s = srng.normal((2, N, Q))  # the mean and variance terms need different draws, so they are named separately

    xx_s = m.reshape([1, N, Q]) + S.reshape([1, N, Q]) * rv_s
    xxx_s = xx_s.reshape([2, N, 1, Q])
    zz = Z.reshape([1, 1, M, Q])
    rbf_u = T.exp(-T.sum(((xxx_s - zz)**2) / (2 * l.reshape([1, 1, 1, Q])), -1)) * sf2  # N x M
    A = Kmm + beta * T.sum(T.mean(rbf_u.reshape([2, M, 1, N]) * rbf_u.reshape([2, 1, M, N]), 0), -1)
    Ainv = sT.matrix_inverse(A)
    Sigma_f = T.dot(Kmm, T.dot(Ainv, Kmm))

    xx = m.reshape([1, N, Q]) + S.reshape([1, N, Q]) * rv_u
    xxx = xx.reshape([2, N, 1, Q])
    rbf = T.mean(T.exp(-T.sum(((xxx - zz)**2) / (2 * l.reshape([1, 1, 1, Q])), -1)), 0)  # N x M
    RHS = T.sum(rbf.reshape([M, 1, N]) * X.reshape([1, D, N]), 2)
    mu_f = beta * T.dot(Kmm, T.dot(Ainv, RHS))

    self.ES_US = theano.function([m, S_b, Z, X, lhyp, ls], [mu_f, Sigma_f],
                                 on_unused_input='ignore')

    rv_u_d = srng.normal((N, Q))
    rv_s_d = srng.normal((N, Q))  # again, separate draws for the mean and variance terms
    Xtilda_u = m + S * rv_u_d
    Xtilda_s = m + S * rv_s_d
    Kmn_u = ker.RBF(sf2, l, Z, Xtilda_u)
    Kmn_s = ker.RBF(sf2, l, Z, Xtilda_s)

    print('Modeling...')
    Kmn = ker.RBF(sf2, l, Z, Xtilda)
    Knn = ker.RBF(sf2, l, Xtilda, Xtilda)
    Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))
    Kinterval = T.dot(KmmInv, Kmn)

    # Scale transformation
    Sigma_L = sT.cholesky(Sigma)
    U = mu + Sigma_L.dot(eps_M)
    mean_U = T.dot(Kinterval.T, U)
    Covariance = beta
    LL = (self.log_mvn(X, mean_U, Covariance)
          - 0.5 * beta * T.sum((T.eye(N) * Ktilda))) * correct
    KL_X = -self.KLD_X(m, S) * correct
    KL_U = -self.KLD_U(mu, Sigma_L, Kmm, KmmInv)

    print('Compiling model ...')
    inputs = {'X': X, 'Z': Z, 'm': m, 'S_b': S_b, 'mu': mu, 'Sigma': Sigma,
              'lhyp': lhyp, 'ls': ls, 'eps_M': eps_M, 'eps_NQ': eps_NQ}
    # Zero-valued sum over all inputs: works around a bug with derivatives
    # w.r.t. inputs that are otherwise not in the graph.
    z = 0.0 * sum([T.sum(v) for v in inputs.values()])
    self.f = {n: theano.function(list(inputs.values()), f + z, name=n,
                                 on_unused_input='ignore')
              for n, f in zip(['X', 'U', 'LL', 'KL_U', 'KL_X'],
                              [X, U, LL, KL_U, KL_X])}
    wrt = {'Z': Z, 'm': m, 'S_b': S_b, 'lhyp': lhyp, 'ls': ls}
    self.g = {vn: {gn: theano.function(list(inputs.values()), T.grad(gv + z, vv),
                                       name='d' + gn + '_d' + vn,
                                       on_unused_input='ignore')
                   for gn, gv in zip(['LL', 'KL_U', 'KL_X'],
                                     [LL, KL_U, KL_X])}
              for vn, vv in wrt.items()}

    with open(model_file_name, 'wb') as file_handle:
        print('Saving model...')
        sys.setrecursionlimit(2000)
        pickle.dump([self.f, self.g, self.ES_US], file_handle,
                    protocol=pickle.HIGHEST_PROTOCOL)
def s_variance(K, y, var_y, prior_var, K_new, var_min):
    # Predictive variance of a GP with kernel prior_var * K and noise var_y,
    # floored at var_min.
    rK = psd(prior_var * K + var_y * TT.eye(y.shape[0]))
    L = cholesky(rK)
    v = dots(matrix_inverse(L), prior_var * K_new)
    var_x = TT.maximum(prior_var - (v ** 2).sum(axis=0), var_min)
    return var_x
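psd, dots, and matrix_inverse here are helpers from the surrounding module. A NumPy analogue, using a triangular solve in place of matrix_inverse(L) (which computes the same v more cheaply), might look like this sketch; s_variance_np and its signature are illustrative:

import numpy as np
from scipy.linalg import cholesky, solve_triangular

def s_variance_np(K, var_y, prior_var, K_new, var_min):
    n = K.shape[0]
    rK = prior_var * K + var_y * np.eye(n)
    L = cholesky(rK, lower=True)
    # solve_triangular(L, B, lower=True) == inv(L) @ B, without forming the inverse
    v = solve_triangular(L, prior_var * K_new, lower=True)
    return np.maximum(prior_var - (v**2).sum(axis=0), var_min)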