def define_process(self):
    # Prior
    self.prior_freedom = self.freedom()
    self.prior_mean = self.location_space
    self.prior_covariance = self.kernel_f_space * self.prior_freedom
    self.prior_variance = tnl.extract_diag(self.prior_covariance)
    self.prior_std = tt.sqrt(self.prior_variance)
    self.prior_noise = tt.sqrt(tnl.extract_diag(self.kernel_space * self.prior_freedom))
    self.prior_median = self.prior_mean
    sigma = 2
    self.prior_quantile_up = self.prior_mean + sigma * self.prior_std
    self.prior_quantile_down = self.prior_mean - sigma * self.prior_std
    self.prior_noise_up = self.prior_mean + sigma * self.prior_noise
    self.prior_noise_down = self.prior_mean - sigma * self.prior_noise
    self.prior_sampler = self.prior_mean + self.random_scalar * cholesky_robust(self.prior_covariance).dot(self.random_th)

    # Posterior
    self.posterior_freedom = self.prior_freedom + self.inputs.shape[1]
    beta = (self.mapping_outputs - self.location_inputs).T.dot(
        tsl.solve(self.kernel_inputs, self.mapping_outputs - self.location_inputs))
    coeff = (self.prior_freedom + beta - 2) / (self.posterior_freedom - 2)
    self.posterior_mean = self.location_space + self.kernel_f_space_inputs.dot(
        tsl.solve(self.kernel_inputs, self.mapping_outputs - self.location_inputs))
    self.posterior_covariance = coeff * (self.kernel_f.cov(self.space_th) - self.kernel_f_space_inputs.dot(
        tsl.solve(self.kernel_inputs, self.kernel_f_space_inputs.T)))
    self.posterior_variance = tnl.extract_diag(self.posterior_covariance)
    self.posterior_std = tt.sqrt(self.posterior_variance)
    self.posterior_noise = coeff * tt.sqrt(tnl.extract_diag(self.kernel.cov(self.space_th) - self.kernel_f_space_inputs.dot(
        tsl.solve(self.kernel_inputs, self.kernel_f_space_inputs.T))))
    self.posterior_median = self.posterior_mean
    self.posterior_quantile_up = self.posterior_mean + sigma * self.posterior_std
    self.posterior_quantile_down = self.posterior_mean - sigma * self.posterior_std
    self.posterior_noise_up = self.posterior_mean + sigma * self.posterior_noise
    self.posterior_noise_down = self.posterior_mean - sigma * self.posterior_noise
    self.posterior_sampler = self.posterior_mean + self.random_scalar * cholesky_robust(self.posterior_covariance).dot(self.random_th)
def __init__(self, tau2_0=0.1, sigma2_0=0.1, l_0=0.1, eta=0.1, debug=1):
    """
    :type tau2_0: float
    :param tau2_0: starting value for the noise variance (added to the kernel diagonal).

    :type sigma2_0: float
    :param sigma2_0: starting value for the signal variance.

    :type l_0: float
    :param l_0: starting value for the length scale.

    :type eta: float
    :param eta: learning rate.

    :type debug: int
    :param debug: verbosity.
    """
    if debug > 0:
        print("GP initializing...")

    ##################################################
    #### Prepare the -loglik gradient descent
    ## Init the shared vars
    X = T.dmatrix('X')
    f = T.dmatrix('f')
    self.tau2 = theano.shared(tau2_0)
    self.l = theano.shared(l_0)
    self.sigma2 = theano.shared(sigma2_0)

    # Make the covariance matrix
    K = self.covFunc(X, X, self.l)

    # Get a numerically safe decomposition
    L = LA.cholesky(K + self.tau2 * T.identity_like(K))

    # Calculate the weights for each of the training data;
    # predictions are a weighted sum.
    alpha = LA.solve(T.transpose(L), LA.solve(L, f))

    ## Calculate the negative log marginal likelihood
    nloglik = -T.reshape(
        -0.5 * T.dot(T.transpose(f), alpha) - T.sum(T.log(T.diag(L))), [])

    # Get gradients
    grads = [
        T.grad(nloglik, self.tau2),
        T.grad(nloglik, self.l),
        T.grad(nloglik, self.sigma2)
    ]

    # Updates; make sure to keep the parameters positive
    updates = [
        (var, T.max([var - eta * grad, 0.1]))
        for var, grad in zip([self.tau2, self.l, self.sigma2], grads)
    ]
    self._gd = theano.function(inputs=[X, f], updates=updates)

    if debug > 0:
        print("Done")
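# Added illustration (not part of the original class): a plain NumPy/SciPy sketch of
# the negative log marginal likelihood that the Theano graph above builds, using the
# same Cholesky trick, L = chol(K + tau2*I) and alpha = L^T \ (L \ f):
#   nll = 0.5 * f^T alpha + sum(log diag(L)) + (n/2) * log(2*pi)
# The graph above drops the constant (n/2)*log(2*pi) term, which does not change the
# gradients. The function name and its arguments here are hypothetical.
import numpy as np
from scipy.linalg import solve_triangular

def gp_negative_log_likelihood(K, tau2, f):
    """Negative log marginal likelihood of f under N(0, K + tau2 * I)."""
    n = K.shape[0]
    L = np.linalg.cholesky(K + tau2 * np.eye(n))
    # alpha = (K + tau2*I)^{-1} f via two triangular solves
    alpha = solve_triangular(L.T, solve_triangular(L, f, lower=True), lower=False)
    return (0.5 * np.sum(f * alpha)
            + np.sum(np.log(np.diag(L)))
            + 0.5 * n * np.log(2 * np.pi))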
def inv_as_solve(node):
    if not imported_scipy:
        return False
    if isinstance(node.op, (Dot, Dot22)):
        l, r = node.inputs
        if l.owner and l.owner.op == matrix_inverse:
            return [solve(l.owner.inputs[0], r)]
        if r.owner and r.owner.op == matrix_inverse:
            if is_symmetric(r.owner.inputs[0]):
                return [solve(r.owner.inputs[0], l.T).T]
            else:
                return [solve(r.owner.inputs[0].T, l.T).T]
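# Added illustration (not part of the optimizer above): a minimal NumPy sketch of the
# rewrite it performs, replacing matrix_inverse(A).dot(b) with solve(A, b). Solving
# the linear system directly avoids forming the explicit inverse, which is both
# cheaper and better conditioned; the two expressions agree up to rounding.
import numpy as np

rng = np.random.default_rng(0)
A = rng.normal(size=(5, 5)) + 5 * np.eye(5)   # reasonably well-conditioned test matrix
b = rng.normal(size=(5, 3))

x_inv = np.linalg.inv(A).dot(b)    # what the original graph computes
x_solve = np.linalg.solve(A, b)    # what the rewrite produces

assert np.allclose(x_inv, x_solve)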
def subprocess_gp(self, subkernel, cov=False, noise=False):
    k_ni = subkernel.cov(self.space, self.inputs)
    mu = self.mean(self.space) + k_ni.dot(sL.solve(self.cov_inputs, self.inv_outputs - self.mean_inputs))
    if noise:
        k_cov = self.kernel.cov(self.space) - k_ni.dot(sL.solve(self.cov_inputs, k_ni.T))
    else:
        k_cov = self.kernel_f.cov(self.space) - k_ni.dot(sL.solve(self.cov_inputs, k_ni.T))
    var = nL.extract_diag(debug(k_cov, 'k_cov'))
    if cov:
        return mu, var, k_cov
    else:
        return mu, var
def th_define_process(self):
    # print('stochastic_define_process')
    # Basic Tensors
    self.mapping_outputs = tt_to_num(self.f_mapping.inv(self.th_outputs))
    self.mapping_latent = tt_to_num(self.f_mapping(self.th_outputs))
    # self.mapping_scalar = tt_to_num(self.f_mapping.inv(self.th_scalar))

    self.prior_location_space = self.f_location(self.th_space)
    self.prior_location_inputs = self.f_location(self.th_inputs)

    self.prior_kernel_space = tt_to_cov(self.f_kernel_noise.cov(self.th_space))
    self.prior_kernel_inputs = tt_to_cov(self.f_kernel_noise.cov(self.th_inputs))
    self.prior_cholesky_space = cholesky_robust(self.prior_kernel_space)

    self.prior_kernel_f_space = self.f_kernel.cov(self.th_space)
    self.prior_kernel_f_inputs = self.f_kernel.cov(self.th_inputs)
    self.prior_cholesky_f_space = cholesky_robust(self.prior_kernel_f_space)

    self.cross_kernel_space_inputs = tt_to_num(self.f_kernel_noise.cov(self.th_space, self.th_inputs))
    self.cross_kernel_f_space_inputs = tt_to_num(self.f_kernel.cov(self.th_space, self.th_inputs))

    self.posterior_location_space = self.prior_location_space + self.cross_kernel_space_inputs.dot(
        tsl.solve(self.prior_kernel_inputs, self.mapping_outputs - self.prior_location_inputs))
    self.posterior_location_f_space = self.prior_location_space + self.cross_kernel_f_space_inputs.dot(
        tsl.solve(self.prior_kernel_inputs, self.mapping_outputs - self.prior_location_inputs))

    self.posterior_kernel_space = self.prior_kernel_space - self.cross_kernel_space_inputs.dot(
        tsl.solve(self.prior_kernel_inputs, self.cross_kernel_space_inputs.T))
    self.posterior_cholesky_space = cholesky_robust(self.posterior_kernel_space)

    self.posterior_kernel_f_space = self.prior_kernel_f_space - self.cross_kernel_f_space_inputs.dot(
        tsl.solve(self.prior_kernel_inputs, self.cross_kernel_f_space_inputs.T))
    self.posterior_cholesky_f_space = cholesky_robust(self.posterior_kernel_f_space)

    self.prior_kernel_diag_space = tt_to_bounded(tnl.extract_diag(self.prior_kernel_space), zero32)
    self.prior_kernel_diag_f_space = tt_to_bounded(tnl.extract_diag(self.prior_kernel_f_space), zero32)
    self.posterior_kernel_diag_space = tt_to_bounded(tnl.extract_diag(self.posterior_kernel_space), zero32)
    self.posterior_kernel_diag_f_space = tt_to_bounded(tnl.extract_diag(self.posterior_kernel_f_space), zero32)

    self.prior_kernel_sd_space = tt.sqrt(self.prior_kernel_diag_space)
    self.prior_kernel_sd_f_space = tt.sqrt(self.prior_kernel_diag_f_space)
    self.posterior_kernel_sd_space = tt.sqrt(self.posterior_kernel_diag_space)
    self.posterior_kernel_sd_f_space = tt.sqrt(self.posterior_kernel_diag_f_space)

    self.prior_cholesky_diag_space = tnl.alloc_diag(self.prior_kernel_sd_space)
    self.prior_cholesky_diag_f_space = tnl.alloc_diag(self.prior_kernel_sd_f_space)
    self.posterior_cholesky_diag_space = tnl.alloc_diag(self.posterior_kernel_sd_space)
    self.posterior_cholesky_diag_f_space = tnl.alloc_diag(self.posterior_kernel_sd_f_space)
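# Added illustration (not from the original code): a minimal NumPy sketch of the
# Gaussian-process conditioning used above. With prior mean m(.), kernel k(.,.),
# training inputs Xi with observations y, and test points Xs, the posterior is
#   mean = m(Xs) + K_si K_ii^{-1} (y - m(Xi))
#   cov  = K_ss - K_si K_ii^{-1} K_si^T
# computed with solve() against the training kernel rather than an explicit inverse,
# exactly as tsl.solve is used above. The names below are hypothetical.
import numpy as np

def gp_posterior(K_ss, K_si, K_ii, y, m_s, m_i, jitter=1e-8):
    """Return the posterior mean and covariance of a GP at the test points."""
    K_ii = K_ii + jitter * np.eye(K_ii.shape[0])   # numerical safeguard on the diagonal
    alpha = np.linalg.solve(K_ii, y - m_i)         # K_ii^{-1} (y - m(Xi))
    mean = m_s + K_si.dot(alpha)
    cov = K_ss - K_si.dot(np.linalg.solve(K_ii, K_si.T))
    return mean, cov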
def init_train_updates(self):
    network_output = self.variables.network_output
    prediction_func = self.variables.train_prediction_func
    last_error = self.variables.last_error
    error_func = self.variables.error_func
    mu = self.variables.mu

    new_mu = ifelse(
        T.lt(last_error, error_func),
        mu * self.mu_update_factor,
        mu / self.mu_update_factor,
    )

    se_for_each_sample = ((network_output - prediction_func) ** 2).ravel()

    params = parameter_values(self.connection)
    param_vector = T.concatenate([param.flatten() for param in params])

    J = compute_jacobian(se_for_each_sample, params)
    n_params = J.shape[1]

    updated_params = param_vector - slinalg.solve(
        J.T.dot(J) + new_mu * T.eye(n_params),
        J.T.dot(se_for_each_sample))

    updates = [(mu, new_mu)]
    parameter_updates = setup_parameter_updates(params, updated_params)
    updates.extend(parameter_updates)

    return updates
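# Added illustration (not from the original code): a minimal NumPy sketch of the
# Levenberg-Marquardt step computed above with slinalg.solve. Given the Jacobian J
# of the per-sample errors r and a damping factor mu, the parameter update is
#   theta_new = theta - (J^T J + mu * I)^{-1} J^T r,
# which interpolates between a Gauss-Newton step (small mu) and a gradient-descent
# step (large mu). The function name below is hypothetical.
import numpy as np

def levenberg_marquardt_step(theta, J, r, mu):
    """One damped Gauss-Newton update; all arguments are plain NumPy arrays."""
    n_params = J.shape[1]
    step = np.linalg.solve(J.T.dot(J) + mu * np.eye(n_params), J.T.dot(r))
    return theta - step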
def __init__(self, f1, vs1, f2, vs2):
    self.f1 = f1
    self.f2 = f2
    self.vs1 = vs1
    self.vs2 = vs2
    self.sz1 = [shape(v)[0] for v in self.vs1]
    self.sz2 = [shape(v)[0] for v in self.vs2]
    for i in range(1, len(self.sz1)):
        self.sz1[i] += self.sz1[i - 1]
    self.sz1 = [(0 if i == 0 else self.sz1[i - 1], self.sz1[i])
                for i in range(len(self.sz1))]
    for i in range(1, len(self.sz2)):
        self.sz2[i] += self.sz2[i - 1]
    self.sz2 = [(0 if i == 0 else self.sz2[i - 1], self.sz2[i])
                for i in range(len(self.sz2))]

    self.df1 = grad(self.f1, vs1)
    self.new_vs1 = [tt.vector() for v in self.vs1]
    self.func1 = th.function(self.new_vs1, [-self.f1, -self.df1],
                             givens=zip(self.vs1, self.new_vs1))

    def f1_and_df1(x0):
        return self.func1(*[x0[a:b] for a, b in self.sz1])
    self.f1_and_df1 = f1_and_df1

    J = jacobian(grad(f1, vs2), vs1)
    H = hessian(f1, vs1)
    g = grad(f2, vs1)
    self.df2 = -tt.dot(J, ts.solve(H, g)) + grad(f2, vs2)
    self.func2 = th.function([], [-self.f2, -self.df2])

    def f2_and_df2(x0):
        for v, (a, b) in zip(self.vs2, self.sz2):
            v.set_value(x0[a:b])
        self.maximize1()
        return self.func2()
    self.f2_and_df2 = f2_and_df2
def test_solve_dtype(self):
    pytest.importorskip("scipy")

    dtypes = [
        "uint8", "uint16", "uint32", "uint64",
        "int8", "int16", "int32", "int64",
        "float16", "float32", "float64",
    ]

    A_val = np.eye(2)
    b_val = np.ones((2, 1))

    # try all dtype combinations
    for A_dtype, b_dtype in itertools.product(dtypes, dtypes):
        A = tensor.matrix(dtype=A_dtype)
        b = tensor.matrix(dtype=b_dtype)
        x = solve(A, b)
        fn = function([A, b], x)
        x_result = fn(A_val.astype(A_dtype), b_val.astype(b_dtype))

        assert x.dtype == x_result.dtype
def init_train_updates(self):
    network_output = self.variables.network_output
    prediction_func = self.variables.train_prediction_func
    last_error = self.variables.last_error
    error_func = self.variables.error_func
    mu = self.variables.mu

    new_mu = ifelse(
        T.lt(last_error, error_func),
        mu * self.mu_update_factor,
        mu / self.mu_update_factor,
    )

    se_for_each_sample = (
        (network_output - prediction_func) ** 2
    ).ravel()

    params = parameter_values(self.connection)
    param_vector = parameters2vector(self)

    J = compute_jacobian(se_for_each_sample, params)
    n_params = J.shape[1]

    updated_params = param_vector - slinalg.solve(
        J.T.dot(J) + new_mu * T.eye(n_params),
        J.T.dot(se_for_each_sample)
    )

    updates = [(mu, new_mu)]
    parameter_updates = setup_parameter_updates(params, updated_params)
    updates.extend(parameter_updates)

    return updates
def lp_to_phi0_fs(lp, Tobs):
    Tobs2 = Tobs * Tobs
    Tobs3 = Tobs2 * Tobs

    phis = tts.solve(poly_basis_to_legendre_basis, lp)

    phi0 = phis[0]
    f0 = phis[1] / (2.0 * pi * Tobs)
    fdot = phis[2] / (pi * Tobs2)
    fddot = phis[3] / (pi / 3.0 * Tobs3)

    return (phi0, f0, fdot, fddot)
def second_moments(i, j, M2, beta, R, logk_c, logk_r, z_, Sx, *args):
    # This comes from Deisenroth's thesis (Eqs 2.51 - 2.54)
    Rij = R[i, j]
    n2 = logk_c[i] + logk_r[j]
    n2 += utils.maha(z_[i], -z_[j], 0.5 * solve(Rij, Sx))
    Q = tt.exp(n2) / tt.sqrt(det(Rij))

    # Eq 2.55
    m2 = matrix_dot(beta[i], Q, beta[j])
    m2 = theano.ifelse.ifelse(tt.eq(i, j), m2 + 1e-6, m2)
    M2 = tt.set_subtensor(M2[i, j], m2)
    return M2
def test_local_lift_solve():
    A = tensor.fmatrix()
    b = tensor.fmatrix()
    o = slinalg.solve(A, b)
    f_cpu = theano.function([A, b], o)
    f_gpu = theano.function([A, b], o, mode=mode_with_gpu)
    assert not any(isinstance(n.op, slinalg.Solve)
                   for n in f_gpu.maker.fgraph.apply_nodes)
    assert any(isinstance(n.op, GpuCusolverSolve)
               for n in f_gpu.maker.fgraph.apply_nodes)
    A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
    b_val = numpy.random.uniform(-0.4, 0.4, (5, 3)).astype("float32")
    utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
def __init__(self, f1, vs1, f2, vs2):
    # keep references to the objective functions and their variables
    self.f1 = f1
    self.f2 = f2
    self.vs1 = vs1
    self.vs2 = vs2
    # -----
    # convert from tensor variables to a plain array of sizes
    self.sz1 = [shape(v)[0] for v in self.vs1]
    self.sz2 = [shape(v)[0] for v in self.vs2]
    for i in range(1, len(self.sz1)):
        # accumulate each sz1 with the previous one (running sum of sizes)
        self.sz1[i] += self.sz1[i - 1]
    # turn the array into a list of (previous offset, offset) pairs
    self.sz1 = [
        (0 if i == 0 else self.sz1[i - 1], self.sz1[i])
        for i in range(len(self.sz1))
    ]
    for i in range(1, len(self.sz2)):
        # same thing as sz1
        self.sz2[i] += self.sz2[i - 1]
    # same thing as sz1
    self.sz2 = [(0 if i == 0 else self.sz2[i - 1], self.sz2[i])
                for i in range(len(self.sz2))]

    self.df1 = grad(self.f1, vs1)  # IMPORTANT: VERY SLOW
    # back from plain arrays to tensor vectors
    self.new_vs1 = [tt.vector() for v in self.vs1]
    self.func1 = th.function(
        self.new_vs1, [-self.f1, -self.df1],
        givens=zip(self.vs1, self.new_vs1))  # IMPORTANT: VERY VERY VERY SLOW

    def f1_and_df1(x0):
        return self.func1(*[x0[a:b] for a, b in self.sz1])
    self.f1_and_df1 = f1_and_df1

    J = jacobian(grad(f1, vs2), vs1)  # IMPORTANT: VERY VERY VERY VERY SLOW
    H = hessian(f1, vs1)  # IMPORTANT: VERY VERY VERY VERY SLOW
    g = grad(f2, vs1)  # IMPORTANT: SLOW
    self.df2 = -tt.dot(J, ts.solve(H, g)) + grad(f2, vs2)  # IMPORTANT: SLOW
    self.func2 = th.function([], [-self.f2, -self.df2])  # IMPORTANT: EXTREMELY SLOW

    def f2_and_df2(x0):
        for v, (a, b) in zip(self.vs2, self.sz2):
            v.set_value(x0[a:b])
        self.maximize1()
        return self.func2()
    self.f2_and_df2 = f2_and_df2
def grad(self, inputs, g_outputs):
    # let A = I - rho W, and dRho(A) be the derivative of A wrt rho
    # dRho(log(|AtA|)) = dRho(log(|At|) + log(|A|))
    #                  = dRho(log(|A|) + log(|A|))
    #                  = 2 dRho(log(|A|))
    #                  = 2 |A|^{-1} dRho(|A|) = 2 |A|^{-1} tr(Adj(A) dRho(A))
    #                  = 2 |A|^{-1} |A| tr(A^{-1}(-W)) = 2 * tr(A^{-1}W)
    [gz] = g_outputs
    [rho] = inputs
    A = self.I - rho * self.W
    trAiW = slinalg.solve(A, self.W).diagonal().sum()
    # trAiW = (nlinalg.matrix_inverse(A).dot(self.W)).diagonal().sum()
    return [trAiW]
def test_local_lift_solve():
    if not cusolver_available:
        raise SkipTest('No cuSolver')
    A = tensor.fmatrix()
    b = tensor.fmatrix()
    o = slinalg.solve(A, b)
    f_cpu = theano.function([A, b], o, mode_without_gpu)
    f_gpu = theano.function([A, b], o, mode=mode_with_gpu)
    assert not any(isinstance(n.op, slinalg.Solve)
                   for n in f_gpu.maker.fgraph.apply_nodes)
    assert any(isinstance(n.op, GpuCusolverSolve) and n.op.inplace
               for n in f_gpu.maker.fgraph.apply_nodes)
    A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
    b_val = np.random.uniform(-0.4, 0.4, (5, 3)).astype("float32")
    utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
def test_gpu_solve_not_inplace():
    if not cusolver_available:
        raise SkipTest('No cuSolver')
    A = tensor.fmatrix()
    b = tensor.fmatrix()
    s = slinalg.solve(A, b)
    o = tensor.dot(A, s)
    f_cpu = theano.function([A, b], o, mode_without_gpu)
    f_gpu = theano.function([A, b], o, mode=mode_with_gpu)
    count_not_inplace = len([n.op for n in f_gpu.maker.fgraph.apply_nodes
                             if isinstance(n.op, GpuCusolverSolve) and not n.op.inplace])
    assert count_not_inplace == 1, count_not_inplace
    A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
    b_val = np.random.uniform(-0.4, 0.4, (5, 3)).astype("float32")
    utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
def test_local_lift_solve():
    A = tensor.fmatrix()
    b = tensor.fmatrix()
    o = slinalg.solve(A, b)
    f_cpu = theano.function([A, b], o)
    f_gpu = theano.function([A, b], o, mode=mode_with_gpu)
    assert not any(
        isinstance(n.op, slinalg.Solve)
        for n in f_gpu.maker.fgraph.apply_nodes)
    assert any(
        isinstance(n.op, GpuCusolverSolve)
        for n in f_gpu.maker.fgraph.apply_nodes)
    A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
    b_val = numpy.random.uniform(-0.4, 0.4, (5, 3)).astype("float32")
    utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
def init_train_updates(self):
    n_parameters = count_parameters(self.connection)
    parameters = parameter_values(self.connection)
    param_vector = T.concatenate([param.flatten() for param in parameters])
    penalty_const = asfloat(self.penalty_const)

    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.error_func, parameters)

    updated_parameters = param_vector - slinalg.solve(
        hessian_matrix + penalty_const * T.eye(n_parameters),
        full_gradient)

    updates = setup_parameter_updates(parameters, updated_parameters)
    return updates
def th_cross_mean(self, prior=False, noise=False, cross_kernel=None):
    """
    Using two kernels, calculate the mean of one process given the other.

    :param prior: whether to use the prior of the process or not
    :param noise: whether the process includes noise
    :param cross_kernel: the covariance between the two processes
    :return: a tensor with the location of one process given the other.
    """
    if prior:
        return self.prior_location_space
    if cross_kernel is None:
        cross_kernel = self.f_kernel
    return self.prior_location_space + cross_kernel.cov(
        self.th_space_, self.th_inputs_).dot(
        tsl.solve(self.prior_kernel_inputs,
                  self.mapping_outputs - self.prior_location_inputs))
def init_train_updates(self):
    n_parameters = count_parameters(self.connection)
    parameters = parameter_values(self.connection)
    param_vector = parameters2vector(self)
    penalty_const = asfloat(self.penalty_const)

    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.error_func, parameters
    )
    updated_parameters = param_vector - slinalg.solve(
        hessian_matrix + penalty_const * T.eye(n_parameters),
        full_gradient
    )
    updates = setup_parameter_updates(parameters, updated_parameters)

    return updates
def logp(self, value):
    """
    The sparse cached log determinant assumes A = I - rho W and computes
    the log determinant of A wrt rho with the cached W. To get this right
    with the SMA, we need to use -rho in the logdet.
    """
    delta = value - self.mu
    ld = self.spld(-self.rho)

    out = -self.W.n / 2.0 * tt.log(np.pi * self.scale)
    out -= ld

    kern = slinalg.solve(self.AAt, delta)
    kern = tt.mul(delta, kern)
    kern = kern.sum()
    kern *= self.scale ** -2
    kern /= 2.0

    return out - kern
def test_solve_dtype(self):
    if not imported_scipy:
        raise SkipTest("Scipy needed for the Solve op.")

    dtypes = ['uint8', 'uint16', 'uint32', 'uint64',
              'int8', 'int16', 'int32', 'int64',
              'float16', 'float32', 'float64']

    A_val = numpy.eye(2)
    b_val = numpy.ones((2, 1))

    # try all dtype combinations
    for A_dtype, b_dtype in itertools.product(dtypes, dtypes):
        A = tensor.matrix(dtype=A_dtype)
        b = tensor.matrix(dtype=b_dtype)
        x = solve(A, b)
        fn = function([A, b], x)
        x_result = fn(A_val.astype(A_dtype), b_val.astype(b_dtype))

        assert x.dtype == x_result.dtype
def test_solve_dtype(self):
    if not imported_scipy:
        raise SkipTest("Scipy needed for the Solve op.")

    dtypes = [
        'uint8', 'uint16', 'uint32', 'uint64',
        'int8', 'int16', 'int32', 'int64',
        'float16', 'float32', 'float64'
    ]

    A_val = np.eye(2)
    b_val = np.ones((2, 1))

    # try all dtype combinations
    for A_dtype, b_dtype in itertools.product(dtypes, dtypes):
        A = tensor.matrix(dtype=A_dtype)
        b = tensor.matrix(dtype=b_dtype)
        x = solve(A, b)
        fn = function([A, b], x)
        x_result = fn(A_val.astype(A_dtype), b_val.astype(b_dtype))

        assert x.dtype == x_result.dtype
def predict(self, mx, Sx, *args, **kwargs):
    if self.N < self.n_inducing:
        # stick with the full GP
        return GP_UI.predict(self, mx, Sx)

    idims = self.D
    odims = self.E

    # centralize inputs
    zeta = self.X_sp - mx

    # initialize some variables
    sf2 = self.hyp[:, idims] ** 2
    eyeE = tt.tile(tt.eye(idims), (odims, 1, 1))
    lscales = self.hyp[:, :idims]
    iL = eyeE / lscales.dimshuffle(0, 1, 'x')

    # predictive mean
    inp = iL.dot(zeta.T).transpose(0, 2, 1)
    iLdotSx = iL.dot(Sx)
    B = (iLdotSx[:, :, None, :] * iL[:, None, :, :]).sum(-1) + tt.eye(idims)
    t = tt.stack([solve(B[i].T, inp[i].T).T for i in range(odims)])
    c = sf2 / tt.sqrt(tt.stack([det(B[i]) for i in range(odims)]))
    l_ = tt.exp(-0.5 * tt.sum(inp * t, 2))
    lb = l_ * self.beta_sp
    M = tt.sum(lb, 1) * c

    # input output covariance
    tiL = tt.stack([t[i].dot(iL[i]) for i in range(odims)])
    V = tt.stack([tiL[i].T.dot(lb[i]) for i in range(odims)]).T * c

    # predictive covariance
    logk = (tt.log(sf2))[:, None] - 0.5 * tt.sum(inp * inp, 2)
    logk_r = logk.dimshuffle(0, 'x', 1)
    logk_c = logk.dimshuffle(0, 1, 'x')
    Lambda = tt.square(iL)
    LL = (Lambda.dimshuffle(0, 'x', 1, 2) + Lambda).transpose(0, 1, 3, 2)
    R = tt.dot(LL, Sx.T).transpose(0, 1, 3, 2) + tt.eye(idims)
    z_ = Lambda.dot(zeta.T).transpose(0, 2, 1)

    M2 = tt.zeros((odims, odims))

    # initialize indices
    triu_indices = np.triu_indices(odims)
    indices = [tt.as_index_variable(idx) for idx in triu_indices]

    def second_moments(i, j, M2, beta, iK, sf2, R, logk_c, logk_r, z_, Sx):
        # This comes from Deisenroth's thesis (Eqs 2.51 - 2.54)
        Rij = R[i, j]
        n2 = logk_c[i] + logk_r[j]
        n2 += utils.maha(z_[i], -z_[j], 0.5 * solve(Rij, Sx))
        Q = tt.exp(n2) / tt.sqrt(det(Rij))

        # Eq 2.55
        m2 = matrix_dot(beta[i], Q, beta[j])
        m2 = theano.ifelse.ifelse(
            tt.eq(i, j), m2 - tt.sum(iK[i] * Q) + sf2[i], m2)
        M2 = tt.set_subtensor(M2[i, j], m2)
        M2 = theano.ifelse.ifelse(
            tt.eq(i, j), M2 + 1e-6, tt.set_subtensor(M2[j, i], m2))
        return M2

    nseq = [self.beta_sp, (self.iKmm - self.iBmm), sf2,
            R, logk_c, logk_r, z_, Sx]
    M2_, updts = theano.scan(
        fn=second_moments, sequences=indices, outputs_info=[M2],
        non_sequences=nseq, allow_gc=False)
    M2 = M2_[-1]

    S = M2 - tt.outer(M, M)

    return M, S, V
def grad(self, inputs, g_outputs):
    gz, = g_outputs
    x, = inputs
    return [slinalg.solve(x.T, gz)]
def predict_symbolic(self, mx, Sx, unroll_scan=False):
    idims = self.D
    odims = self.E

    # centralize inputs
    zeta = self.X - mx

    # initialize some variables
    sf2 = self.hyp[:, idims] ** 2
    eyeE = tt.tile(tt.eye(idims), (odims, 1, 1))
    lscales = self.hyp[:, :idims]
    iL = eyeE / lscales.dimshuffle(0, 1, 'x')

    # predictive mean
    inp = iL.dot(zeta.T).transpose(0, 2, 1)
    iLdotSx = iL.dot(Sx)
    # TODO vectorize this
    B = (iLdotSx[:, :, None, :] * iL[:, None, :, :]).sum(-1) + tt.eye(idims)
    t = tt.stack([solve(B[i].T, inp[i].T).T for i in range(odims)])
    c = sf2 / tt.sqrt(tt.stack([det(B[i]) for i in range(odims)]))
    l = tt.exp(-0.5 * tt.sum(inp * t, 2))
    lb = l * self.beta  # E x N dot E x N
    M = tt.sum(lb, 1) * c

    # input output covariance
    tiL = (t[:, :, None, :] * iL[:, None, :, :]).sum(-1)
    # tiL = tt.stack([t[i].dot(iL[i]) for i in range(odims)])
    V = tt.stack([tiL[i].T.dot(lb[i]) for i in range(odims)]).T * c

    # predictive covariance
    logk = (tt.log(sf2))[:, None] - 0.5 * tt.sum(inp * inp, 2)
    logk_r = logk.dimshuffle(0, 'x', 1)
    logk_c = logk.dimshuffle(0, 1, 'x')
    Lambda = tt.square(iL)
    LL = (Lambda.dimshuffle(0, 'x', 1, 2) + Lambda).transpose(0, 1, 3, 2)
    R = tt.dot(LL, Sx).transpose(0, 1, 3, 2) + tt.eye(idims)
    z_ = Lambda.dot(zeta.T).transpose(0, 2, 1)

    M2 = tt.zeros((odims, odims))

    # initialize indices
    triu_indices = np.triu_indices(odims)
    indices = [tt.as_index_variable(idx) for idx in triu_indices]

    def second_moments(i, j, M2, beta, iK, sf2, R, logk_c, logk_r,
                       z_, Sx, *args):
        # This comes from Deisenroth's thesis (Eqs 2.51 - 2.54)
        Rij = R[i, j]
        n2 = logk_c[i] + logk_r[j]
        n2 += utils.maha(z_[i], -z_[j], 0.5 * solve(Rij, Sx))
        Q = tt.exp(n2) / tt.sqrt(det(Rij))

        # Eq 2.55
        m2 = matrix_dot(beta[i], Q, beta[j])
        m2 = theano.ifelse.ifelse(
            tt.eq(i, j), m2 - tt.sum(iK[i] * Q) + sf2[i], m2)
        M2 = tt.set_subtensor(M2[i, j], m2)
        return M2

    nseq = [self.beta, self.iK, sf2, R, logk_c, logk_r, z_, Sx, self.L]

    if unroll_scan:
        from lasagne.utils import unroll_scan
        [M2_] = unroll_scan(second_moments, indices, [M2], nseq,
                            len(triu_indices[0]))
        updts = {}
    else:
        M2_, updts = theano.scan(
            fn=second_moments, sequences=indices, outputs_info=[M2],
            non_sequences=nseq, allow_gc=False, strict=True,
            name="%s>M2_scan" % (self.name))
    M2 = M2_[-1]
    M2 = M2 + tt.triu(M2, k=1).T

    S = M2 - tt.outer(M, M)

    return M, S, V
def predict_symbolic(self, mx, Sx=None, unroll_scan=False):
    idims = self.D
    odims = self.E

    # initialize some variables
    sf2 = self.hyp[:, idims] ** 2
    eyeE = tt.tile(tt.eye(idims), (odims, 1, 1))
    lscales = self.hyp[:, :idims]
    iL = eyeE / lscales.dimshuffle(0, 1, 'x')

    if Sx is None:
        # first check if we received a vector [D] or a matrix [nxD]
        if mx.ndim == 1:
            mx = mx[None, :]

        # centralize inputs
        zeta = self.X[:, None, :] - mx[None, :, :]

        # predictive mean ( we don't need to do the rest )
        inp = (iL[:, None, :, None, :] * zeta[:, None, :, :]).sum(2)
        l = tt.exp(-0.5 * tt.sum(inp ** 2, -1))
        lb = l * self.beta[:, :, None]  # E x N
        M = tt.sum(lb, 1).T * sf2

        # apply saturating function to the output if available
        if self.sat_func is not None:
            # saturate the output
            M = self.sat_func(M)

        return M

    # centralize inputs
    zeta = self.X - mx

    # predictive mean
    inp = iL.dot(zeta.T).transpose(0, 2, 1)
    iLdotSx = iL.dot(Sx)
    B = (iLdotSx[:, :, None, :] * iL[:, None, :, :]).sum(-1) + tt.eye(idims)
    t = tt.stack([solve(B[i].T, inp[i].T).T for i in range(odims)])
    c = sf2 / tt.sqrt(tt.stack([det(B[i]) for i in range(odims)]))
    l = tt.exp(-0.5 * tt.sum(inp * t, 2))
    lb = l * self.beta
    M = tt.sum(lb, 1) * c

    # input output covariance
    tiL = tt.stack([t[i].dot(iL[i]) for i in range(odims)])
    V = tt.stack([tiL[i].T.dot(lb[i]) for i in range(odims)]).T * c

    # predictive covariance
    logk = (tt.log(sf2))[:, None] - 0.5 * tt.sum(inp * inp, 2)
    logk_r = logk.dimshuffle(0, 'x', 1)
    logk_c = logk.dimshuffle(0, 1, 'x')
    Lambda = tt.square(iL)
    LL = (Lambda.dimshuffle(0, 'x', 1, 2) + Lambda).transpose(0, 1, 3, 2)
    R = tt.dot(LL, Sx).transpose(0, 1, 3, 2) + tt.eye(idims)
    z_ = Lambda.dot(zeta.T).transpose(0, 2, 1)

    M2 = tt.zeros((odims, odims))

    # initialize indices
    triu_indices = np.triu_indices(odims)
    indices = [tt.as_index_variable(idx) for idx in triu_indices]

    def second_moments(i, j, M2, beta, R, logk_c, logk_r, z_, Sx, *args):
        # This comes from Deisenroth's thesis (Eqs 2.51 - 2.54)
        Rij = R[i, j]
        n2 = logk_c[i] + logk_r[j]
        n2 += utils.maha(z_[i], -z_[j], 0.5 * solve(Rij, Sx))
        Q = tt.exp(n2) / tt.sqrt(det(Rij))

        # Eq 2.55
        m2 = matrix_dot(beta[i], Q, beta[j])
        m2 = theano.ifelse.ifelse(tt.eq(i, j), m2 + 1e-6, m2)
        M2 = tt.set_subtensor(M2[i, j], m2)
        return M2

    nseq = [self.beta, R, logk_c, logk_r, z_, Sx, self.iK, self.L]

    if unroll_scan:
        from lasagne.utils import unroll_scan
        [M2_] = unroll_scan(second_moments, indices, [M2], nseq,
                            len(triu_indices[0]))
        updts = {}
    else:
        M2_, updts = theano.scan(
            fn=second_moments, sequences=indices, outputs_info=[M2],
            non_sequences=nseq, allow_gc=False, strict=True,
            name="%s>M2_scan" % (self.name))
    M2 = M2_[-1]
    M2 = M2 + tt.triu(M2, k=1).T

    S = M2 - tt.outer(M, M)

    # apply saturating function to the output if available
    if self.sat_func is not None:
        # saturate the output
        M, S, U = self.sat_func(M, S)
        # compute the joint input output covariance
        V = V.dot(U)

    return M, S, V
def inverse_map(self, y):
    return slinalg.solve(self.weights, (y - self.biases).T).T
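# Added illustration (not from the original code): assuming the forward map is the
# affine transform y = x.dot(W.T) + b (the forward method is not shown here, so this
# is an assumption), the solve-based inverse above recovers x, as this NumPy check
# sketches.
import numpy as np

rng = np.random.default_rng(1)
W = rng.normal(size=(3, 3)) + 3 * np.eye(3)   # invertible weight matrix
b = rng.normal(size=3)
x = rng.normal(size=(5, 3))

y = x.dot(W.T) + b                            # hypothetical forward map
x_rec = np.linalg.solve(W, (y - b).T).T       # mirrors inverse_map above

assert np.allclose(x, x_rec)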
# theano.config.exception_verbosity = 'high'
from scipy import optimize

with pm.Model() as model:
    # 2 state model
    P = pm.Dirichlet('P', a=np.ones((N_states, N_states)),
                     shape=(N_states, N_states))
    A1 = pm.Normal('A1', mu=0, sd=0.3)
    A2 = pm.Normal('A2', mu=1, sd=0.3)
    S1 = pm.InverseGamma('S1', alpha=alphaS, beta=betaS)
    S2 = pm.InverseGamma('S2', alpha=alphaS, beta=betaS)

    AA = tt.dmatrix('AA')
    AA = tt.eye(N_states) - P + tt.ones(shape=(N_states, N_states))
    PA = pm.Deterministic('PA', sla.solve(AA.T, tt.ones(shape=(N_states))))

    states1 = HMMStatesN('states1', P=P, PA=PA, shape=len(dataset[4]))
    emission1 = HMMGaussianEmissions('emission1',
                                     A1=A1, A2=A1,
                                     S1=S1, S2=S2,
                                     states=states1,
                                     observed=dataset[4])

    states2 = HMMStatesN('states2', P=P, PA=PA, shape=len(dataset[205]))
    emission2 = HMMGaussianEmissions('emission2',
                                     A1=A1,