def set_dataset(self, X_dataset, Y_dataset, X_cov=None, Y_var=None):
    # set dataset
    super(GP, self).set_dataset(X_dataset, Y_dataset)

    # extra operations when setting the dataset (specific to this class)
    if X_cov is not None:
        self.X_cov = X_cov
        self.nigp = S(np.zeros((self.E, self.N)),
                      name="%s>nigp" % (self.name))

    if Y_var is not None:
        if self.Y_var is None:
            self.Y_var = S(Y_var, name='%s>Y_var' % (self.name),
                           borrow=True)
        else:
            self.Y_var.set_value(Y_var, borrow=True)

    if not self.trained:
        # init log hyperparameters and intermediate variables
        self.init_params()

    # we should be saving, since we updated the training dataset
    self.state_changed = True
    if self.N > 0:
        self.ready = True
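
# A minimal sketch of the shared-variable pattern set_dataset relies on
# (theano and numpy only; the GP class itself is not assumed here):
# allocate the shared container once, then refresh it with set_value so
# that any previously compiled function keeps pointing at the same storage.
import numpy as np
import theano

Y_var = None
new_vals = np.ones((10, 1), dtype=theano.config.floatX)
if Y_var is None:
    # first call: create the shared variable
    Y_var = theano.shared(new_vals, name='gp>Y_var', borrow=True)
else:
    # subsequent calls: update the value in place
    Y_var.set_value(new_vals, borrow=True)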
def __pcpy__(self, nnt, **kwd):
    """ shallowly paste parameter values onto another network of the
    exact same topology.

    nnt: the target network onto which parameter values are pasted.
    if None, a new network is created.

    kwd: dictionary of additional keywords.

    return: the target neural network with parameters pasted.

    for a recursive deep copy, use hlp.cp instead.
    """
    if not self.__homo__(nnt):
        raise ValueError('cannot cp parameters to different shapes.')

    # parameters of the target; they are shared tensors
    par = nnt.__parm__()
    dct = nnt.__dict__
    for k, v in self.__parm__().items():
        # get source parameter values
        v = v.get_value()

        # update values in the target
        if k in par:
            par[k].set_value(v)
        elif k in dct:
            # k is a member but not a shared tensor
            raise ValueError('cannot cp to non-shared-tensor.')
        else:
            dct[k] = S(v)  # create a new member if possible
    # done
    return nnt
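
# Self-contained sketch of the pasting logic in __pcpy__, with plain
# dicts of shared tensors standing in for the two networks (illustrative
# only; __homo__ and __parm__ are not reimplemented here).
import numpy as np
import theano

src = {'w': theano.shared(np.ones((3, 2)), name='w')}
dst = {'w': theano.shared(np.zeros((3, 2)), name='w')}
for k, v in src.items():
    # shallow paste: copy values, keep the target's shared containers
    dst[k].set_value(v.get_value())
assert np.allclose(dst['w'].get_value(), 1.0)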
def set_params(self, params, trainable=False):
    '''
    Adds a new parameter to the class instance. Every parameter will be
    stored as a Theano shared variable. This function exists so that we
    do not end up with different compiled functions referencing
    different shared variables in memory, which can be a problem when
    loading pickled compiled theano functions.
    '''
    if isinstance(params, list):
        params = dict(list(zip(self.param_names, params)))
    for pname in list(params.keys()):
        # if the parameter that was passed here is a shared variable
        if isinstance(params[pname], tt.sharedvar.SharedVariable):
            p = params[pname]
            self.__dict__[pname] = p
            if pname not in self.param_names:
                self.param_names.append(pname)
        # if the parameter that was passed here is NOT a shared variable
        else:
            # create shared variable if it doesn't exist
            if pname not in self.__dict__ or self.__dict__[pname] is None:
                p = S(params[pname], name='%s>%s' % (self.name, pname))
                self.__dict__[pname] = p
                if pname not in self.param_names:
                    self.param_names.append(pname)
            # otherwise, update the value of the shared variable
            else:
                p = self.__dict__[pname]
                pv = params[pname].reshape(p.get_value().shape)
                p.set_value(pv)
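
# Sketch of the update branch of set_params (standalone, nothing from
# the class assumed): the incoming array is reshaped to the existing
# shared variable's shape before set_value, so e.g. a flat parameter
# vector coming from an optimizer can refill a matrix parameter in place.
import numpy as np
import theano

p = theano.shared(np.zeros((2, 3)), name='model>W')
flat = np.arange(6.0)  # hypothetical flat values from an optimizer
p.set_value(flat.reshape(p.get_value().shape))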
def get_loss(self, unroll_scan=False, cache_intermediate=True):
    msg = 'Building full GP loss'
    utils.print_with_stamp(msg, self.name)
    idims = self.D
    N = self.X.shape[0].astype(floatX)

    def nlml(Y, hyp, i, X, EyeN, nigp=None, y_var=None):
        # initialise the (before compilation) kernel function
        hyps = (hyp[:idims + 1], hyp[idims + 1])
        kernel_func = partial(cov.Sum, hyps, self.covs)

        # We initialise the kernel matrices (one for each output
        # dimension)
        K = kernel_func(X)
        # add the contribution from the input noise
        if nigp:
            K += tt.diag(nigp[i])
        # add the contribution from the output uncertainty
        # (acts as a weight)
        if y_var:
            K += tt.diag(y_var[i])
        # compute chol(K)
        L = Cholesky()(K)
        # compute K^-1 and (K^-1).dot(y)
        rhs = tt.concatenate([EyeN, Y[:, None]], axis=1)
        sol = solve_upper_triangular(L.T, solve_lower_triangular(L, rhs))
        iK = sol[:, :-1]
        beta = sol[:, -1]

        return iK, L, beta

    nseq = [self.X, tt.eye(self.X.shape[0])]
    if self.nigp:
        nseq.append(self.nigp)
    if self.Y_var:
        nseq.append(self.Y_var.T)

    seq = [self.Y.T, self.hyp, tt.arange(self.X.shape[0])]

    if unroll_scan:
        from lasagne.utils import unroll_scan
        [iK, L, beta] = unroll_scan(nlml, seq, [], nseq, self.E)
        updts = {}
    else:
        (iK, L, beta), updts = theano.scan(
            fn=nlml, sequences=seq, non_sequences=nseq,
            allow_gc=False, strict=True, return_list=True,
            name="%s>logL_scan" % (self.name))

    # And finally, the negative log marginal likelihood
    loss = 0.5 * tt.sum(self.Y.T * beta, 1)
    idx = [theano.tensor.arange(L.shape[i]) for i in [1, 2]]
    loss += tt.sum(tt.log(L[:, idx[0], idx[1]]), 1)
    loss += 0.5 * N * tt.log(2 * np.pi)

    if cache_intermediate:
        # we are going to save the intermediate results in the following
        # shared variables, so we can use them during prediction without
        # having to recompute them
        N, E = self.N, self.E
        if type(self.iK) is not tt.sharedvar.SharedVariable:
            self.iK = S(np.tile(np.eye(N, dtype=floatX), (E, 1, 1)),
                        name="%s>iK" % (self.name))
        if type(self.L) is not tt.sharedvar.SharedVariable:
            self.L = S(np.tile(np.eye(N, dtype=floatX), (E, 1, 1)),
                       name="%s>L" % (self.name))
        if type(self.beta) is not tt.sharedvar.SharedVariable:
            self.beta = S(np.ones((E, N), dtype=floatX),
                          name="%s>beta" % (self.name))
        updts = [(self.iK, iK), (self.L, L), (self.beta, beta)]
    else:
        # save intermediate graphs (in case we require grads wrt params)
        self.iK, self.L, self.beta = iK, L, beta
        updts = None

    # we add some penalty to avoid having parameters that are too large
    if self.snr_penalty is not None:
        penalty_params = {
            'log_snr': np.log(1000, dtype=floatX),
            'log_ls': np.log(100, dtype=floatX),
            'log_std': tt.log(self.X.std(0) * (N / (N - 1.0))),
            'p': 30
        }
        loss += self.snr_penalty(tt.log(self.hyp), **penalty_params)

    inps = []
    self.state_changed = True  # for saving
    return loss.sum(), inps, updts
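
# NumPy sketch of the single-factorization trick used in nlml above:
# stacking the identity next to y lets one Cholesky decomposition yield
# both iK = K^-1 and beta = K^-1 y via two triangular solves.
import numpy as np
from scipy.linalg import cholesky, solve_triangular

n = 5
A = np.random.randn(n, n)
K = A.dot(A.T) + n * np.eye(n)     # SPD stand-in for the kernel matrix
y = np.random.randn(n)

L = cholesky(K, lower=True)        # K = L L^T
rhs = np.concatenate([np.eye(n), y[:, None]], axis=1)
sol = solve_triangular(L.T, solve_triangular(L, rhs, lower=True))
iK, beta = sol[:, :-1], sol[:, -1]
assert np.allclose(iK.dot(y), beta)
assert np.allclose(iK, np.linalg.inv(K))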
def get_loss(self, unroll_scan=False, cache_intermediate=True):
    utils.print_with_stamp('Building Sparse Spectrum loss', self.name)
    idims = self.D

    if self.sr is None:
        self.sr = self.w/(self.hyp[:, :idims])
        self.sr = self.sr.transpose(1, 0, 2)

    # init variables
    N = self.X.shape[0].astype(floatX)
    M = self.sr.shape[1].astype(floatX)
    Mi = 2*self.sr.shape[1]
    EyeM = tt.eye(Mi)
    sf2 = self.hyp[:, idims]**2
    sf2M = (sf2/M).dimshuffle(0, 'x', 'x')
    sn2 = (self.hyp[:, idims+1]**2).dimshuffle(0, 'x', 'x')
    srdotX = self.sr.dot(self.X.T)

    phi_f = tt.concatenate([tt.sin(srdotX), tt.cos(srdotX)], axis=1)
    Phi_f = tt.batched_dot(phi_f, phi_f.transpose(0, 2, 1))
    A = sf2M*Phi_f
    A += (sn2 + 1e-6)*EyeM
    phi_f_dotY = tt.batched_dot(phi_f, self.Y.T)

    def nlml(A, phidotY, EyeM):
        Lmm = Cholesky()(A)
        rhs = tt.concatenate([EyeM, phidotY[:, None]], axis=1)
        sol = solve_upper_triangular(
            Lmm.T, solve_lower_triangular(Lmm, rhs))
        iA = sol[:, :-1]
        beta_ss = sol[:, -1]

        return iA, Lmm, beta_ss

    seq = [A, phi_f_dotY]
    nseq = [EyeM]

    if unroll_scan:
        from lasagne.utils import unroll_scan
        [iA, Lmm, beta_ss] = unroll_scan(nlml, seq, [], nseq, self.E)
        updts = {}
    else:
        (iA, Lmm, beta_ss), updts = theano.scan(
            fn=nlml, sequences=seq, non_sequences=nseq,
            allow_gc=False, return_list=True,
            name='%s>logL_ss' % (self.name))

    # scale beta_ss
    beta_ss *= sf2M[:, :, 0]

    # And finally, the negative log marginal likelihood
    YdotY = tt.sum(self.Y**2, 0)
    Ydotphidotbeta = tt.sum(phi_f_dotY*beta_ss, -1)
    loss_ss = 0.5*(YdotY - Ydotphidotbeta)/sn2
    idx = [theano.tensor.arange(Lmm.shape[i]) for i in [1, 2]]
    loss_ss += tt.sum(tt.log(Lmm[:, idx[0], idx[1]]), 1)
    loss_ss += (0.5*N - M)*tt.log(sn2)
    loss_ss += 0.5*N*np.log(2*np.pi, dtype=floatX)

    if cache_intermediate:
        # we are going to save the intermediate results in the following
        # shared variables, so we can use them during prediction without
        # having to recompute them
        kk = 2*self.n_inducing
        N, E = self.N, self.E
        if type(self.iA) is not tt.sharedvar.SharedVariable:
            self.iA = S(np.tile(np.eye(kk, dtype=floatX), (E, 1, 1)),
                        name="%s>iA" % (self.name))
        if type(self.Lmm) is not tt.sharedvar.SharedVariable:
            self.Lmm = S(np.tile(np.eye(kk, dtype=floatX), (E, 1, 1)),
                         name="%s>Lmm" % (self.name))
        if type(self.beta_ss) is not tt.sharedvar.SharedVariable:
            self.beta_ss = S(np.ones((E, kk), dtype=floatX),
                             name="%s>beta_ss" % (self.name))
        updts = [(self.iA, iA), (self.Lmm, Lmm),
                 (self.beta_ss, beta_ss)]
    else:
        self.iA, self.Lmm, self.beta_ss = iA, Lmm, beta_ss
        updts = None

    # we add some penalty to avoid having parameters that are too large
    if self.snr_penalty is not None:
        penalty_params = {'log_snr': np.log(1000, dtype=floatX),
                          'log_ls': np.log(100, dtype=floatX),
                          'log_std': tt.log(self.X.std(0)*(N/(N-1.0))),
                          'p': 30}
        loss_ss += self.snr_penalty(tt.log(self.hyp), **penalty_params)

    # add a penalty for high frequencies
    freq_penalty = tt.square(self.w).sum(-1).mean(0)
    loss_ss = loss_ss + freq_penalty

    inps = []
    self.state_changed = True  # for saving
    return loss_ss.sum(), inps, updts
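
# NumPy sketch of the sparse-spectrum approximation constructed above:
# for an SE kernel with signal variance sf2 and unit lengthscales, the
# features phi(x) = [sin(sr.x); cos(sr.x)] built from spectral points sr
# give K(X, X) ~= (sf2/M) * phi^T phi.
import numpy as np

M, D = 1000, 3                    # spectral points, input dimension
sf2 = 1.0
w = np.random.randn(M, D)         # frequencies drawn from N(0, I)
sr = w / np.ones(D)               # scale by the (unit) lengthscales
X = np.random.randn(7, D)

srdotX = sr.dot(X.T)                                     # (M, N)
phi = np.concatenate([np.sin(srdotX), np.cos(srdotX)], axis=0)
K_approx = (sf2 / M) * phi.T.dot(phi)
sqd = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
K_exact = sf2 * np.exp(-0.5 * sqd)
print('max abs error:', np.abs(K_approx - K_exact).max())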
parser.add_argument('-b', type=float, default=0.0, dest='b')
parser.add_argument('-N', type=int, default=1000, dest='N')
args = parser.parse_args()
wt = args.w
bt = args.b
iterations = args.N

# generate training data set of four points equispaced in y
ytarg = np.array([[1. / 8, 3. / 8, 5. / 8, 7. / 8]])
xtarg = invf(wt, bt, ytarg)

# initial values for model sigmoid
a = 1.0
b = 0.0
W = S(a)
B = S(b)

# symbolic computations for theano
X = T.matrix()
y = T.vector()
sig = 1 / (1 + T.exp(-T.dot(X, W) - B))
xent = -y * T.log(sig) - (1 - y) * T.log(1 - sig)
cost = xent.mean()
gw, gb = T.grad(cost, [W, B])

# compile theano functions
# NOTE: the original call is truncated here; plain gradient-descent
# updates with an assumed learning rate are used to complete it
lr = 0.1
TRAIN = F(inputs=[X, y], outputs=[W, B],
          updates=[(W, W - lr * gw), (B, B - lr * gb)])
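
# Assumed training loop (the original fragment is cut off right after
# the compile call; the data orientation passed to TRAIN is a guess):
# run gradient descent and compare the fit against the generating
# parameters wt, bt.
for _ in range(iterations):
    TRAIN(xtarg, ytarg.ravel())
print('recovered w=%s b=%s (true w=%s b=%s)'
      % (W.get_value(), B.get_value(), wt, bt))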
def get_loss(self, cache_intermediate=True):
    if self.N < self.n_inducing:
        # initialize the training loss function of the GP class
        return super(SPGP, self).get_loss(
            cache_intermediate=cache_intermediate)
    else:
        utils.print_with_stamp('Building FITC loss', self.name)
        self.should_recompile = False
        odims = self.E
        idims = self.D
        N = self.X.shape[0].astype(theano.config.floatX)

        # initialize the training loss function of the sparse FITC
        # approximation
        def nlml(Y, hyp, X, X_sp, EyeM):
            # TODO allow for different pseudo inputs for each dimension
            # initialise the (before compilation) kernel function
            hyps = [hyp[:idims+1], hyp[idims+1]]
            kernel_func = partial(cov.Sum, hyps, self.covs)

            sf2 = hyp[idims]**2
            sn2 = hyp[idims+1]**2
            N = X.shape[0].astype(theano.config.floatX)

            ridge = 1e-6
            Kmm = kernel_func(X_sp) + ridge*EyeM
            Kmn = kernel_func(X_sp, X)
            Lmm = cholesky(Kmm)
            rhs = tt.concatenate([EyeM, Kmn], axis=1)
            sol = solve_lower_triangular(Lmm, rhs)
            iKmm = solve_upper_triangular(Lmm.T, sol[:, :EyeM.shape[0]])
            Lmn = sol[:, EyeM.shape[0]:]
            diagQnn = (Lmn**2).sum(0)

            # Gamma = diag(Knn - Qnn) + sn2*I
            Gamma = sf2 + sn2 - diagQnn
            Gamma_inv = 1.0/Gamma

            # these operations are done to avoid inverting (Qnn + Gamma)
            sqrtGamma_inv = tt.sqrt(Gamma_inv)
            Lmn_ = Lmn*sqrtGamma_inv       # Lmn*Gamma^-.5
            Yi = Y*(sqrtGamma_inv)         # Gamma^-.5*Y
            # I + Lmn * Gamma^-1 * Lnm
            Bmm = tt.eye(Kmm.shape[0]) + (Lmn_).dot(Lmn_.T)
            Amm = cholesky(Bmm)
            LAmm = Lmm.dot(Amm)

            Kmn_dotYi = Kmn.dot(Yi*(sqrtGamma_inv))
            rhs = tt.concatenate([EyeM, Kmn_dotYi[:, None]], axis=1)
            sol = solve_upper_triangular(
                LAmm.T, solve_lower_triangular(LAmm, rhs))
            iBmm = sol[:, :-1]
            beta_sp = sol[:, -1]

            log_det_K_sp = tt.sum(tt.log(Gamma))
            log_det_K_sp += 2*tt.sum(tt.log(tt.diag(Amm)))

            loss_sp = Yi.dot(Yi) - Kmn_dotYi.dot(beta_sp)
            loss_sp += log_det_K_sp + N*np.log(2*np.pi)
            loss_sp *= 0.5

            return loss_sp, iKmm, Lmm, Amm, iBmm, beta_sp

        r_outs, updts = theano.scan(
            fn=nlml,
            sequences=[self.Y.T, self.hyp],
            non_sequences=[self.X, self.X_sp,
                           tt.eye(self.X_sp.shape[0])],
            allow_gc=False,
            return_list=True)
        (loss_sp, iKmm, Lmm, Amm, iBmm, beta_sp) = r_outs

        if cache_intermediate:
            # we are going to save the intermediate results in the
            # following shared variables, so we can use them during
            # prediction without having to recompute them

            # initialize shared variables
            kk = self.n_inducing
            self.iKmm = S(
                np.tile(np.eye(kk).astype(floatX), (odims, 1, 1)),
                name="%s>iKmm" % (self.name))
            self.Lmm = S(
                np.tile(np.eye(kk).astype(floatX), (odims, 1, 1)),
                name="%s>Lmm" % (self.name))
            self.Amm = S(
                np.tile(np.eye(kk).astype(floatX), (odims, 1, 1)),
                name="%s>Amm" % (self.name))
            self.iBmm = S(
                np.tile(np.eye(kk).astype(floatX), (odims, 1, 1)),
                name="%s>iBmm" % (self.name))
            self.beta_sp = S(
                np.ones((self.E, kk)).astype(floatX),
                name="%s>beta_sp" % (self.name))

            updts = [(self.iKmm, iKmm), (self.Lmm, Lmm),
                     (self.Amm, Amm), (self.iBmm, iBmm),
                     (self.beta_sp, beta_sp)]
        else:
            self.iKmm, self.Lmm, self.Amm = iKmm, Lmm, Amm
            self.iBmm, self.beta_sp = iBmm, beta_sp
            updts = None

        # we add some penalty to avoid having parameters that are too
        # large
        if self.snr_penalty is not None:
            penalty_params = {'log_snr': np.log(1000),
                              'log_ls': np.log(100),
                              'log_std': tt.log(
                                  self.X_sp.std(0)*(N/(N-1.0))),
                              'p': 30}
            loss_sp += self.snr_penalty(self.hyp, **penalty_params)

        inps = []
        self.state_changed = True  # for saving
        return loss_sp.sum(), inps, updts
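
# NumPy check of the determinant identity the FITC nlml above exploits:
# with Qnn = Kmn^T Kmm^-1 Kmn, Lmn = Lmm^-1 Kmn, Lmn_ = Lmn * Gamma^-0.5
# and Bmm = I + Lmn_ Lmn_^T, Sylvester's theorem gives
# log|Qnn + diag(Gamma)| = sum(log Gamma) + 2*sum(log diag(chol(Bmm))).
import numpy as np
from scipy.linalg import cholesky, solve_triangular

M, N = 4, 9
R = np.random.randn(M, M)
Kmm = R.dot(R.T) + M * np.eye(M)             # inducing-point covariance
Kmn = np.random.randn(M, N)
Gamma = 0.5 + np.random.rand(N)              # positive diagonal term

Lmm = cholesky(Kmm, lower=True)
Lmn = solve_triangular(Lmm, Kmn, lower=True)
Lmn_ = Lmn / np.sqrt(Gamma)
Amm = cholesky(np.eye(M) + Lmn_.dot(Lmn_.T), lower=True)

lhs = np.linalg.slogdet(Lmn.T.dot(Lmn) + np.diag(Gamma))[1]
rhs = np.log(Gamma).sum() + 2 * np.log(np.diag(Amm)).sum()
assert np.isclose(lhs, rhs)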