def test_sum_kernel_grad():
    npr.seed(1)

    eps = 1e-5
    N   = 10
    M   = 5
    D   = 3

    kernel1 = Matern52(D)
    kernel2 = Matern52(D)
    kernel3 = Matern52(D)
    kernel  = SumKernel(kernel1, kernel2, kernel3)

    data1 = npr.randn(N, D)
    data2 = npr.randn(M, D)

    loss  = np.sum(kernel.cross_cov(data1, data2))
    dloss = kernel.cross_cov_grad_data(data1, data2).sum(0)

    dloss_est = np.zeros(dloss.shape)
    for i in xrange(M):
        for j in xrange(D):
            data2[i, j] += eps
            loss_1 = np.sum(kernel.cross_cov(data1, data2))
            data2[i, j] -= 2 * eps
            loss_2 = np.sum(kernel.cross_cov(data1, data2))
            data2[i, j] += eps
            dloss_est[i, j] = (loss_1 - loss_2) / (2 * eps)

    assert np.linalg.norm(dloss - dloss_est) < 1e-6
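# The same central finite-difference check recurs throughout these tests. Below is a
# minimal, self-contained sketch of that pattern as a reusable helper. It is NOT part
# of this codebase: `check_grad`, `f`, and `df` are hypothetical placeholders for a
# scalar loss and its analytic gradient.
import numpy as np

def check_grad(f, df, x, eps=1e-5, tol=1e-6):
    """Compare an analytic gradient df(x) against a central finite difference.

    f  : callable returning a scalar loss for the array x
    df : callable returning an array of the same shape as x
    """
    analytic = df(x)
    estimate = np.zeros_like(x)
    for idx in np.ndindex(*x.shape):
        x[idx] += eps
        loss_1 = f(x)
        x[idx] -= 2 * eps
        loss_2 = f(x)
        x[idx] += eps  # restore the original value
        estimate[idx] = (loss_1 - loss_2) / (2 * eps)
    return np.linalg.norm(analytic - estimate) < tol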
def test_backward_pass():
    npr.seed(1)

    eps = 1e-5
    N   = 15
    D   = 10

    data = 0.5 * npr.rand(N, D)

    norm      = Normalization(3)
    norm_inds = [1, 3, 5]
    bw        = BetaWarp(2)
    bw_inds   = [0, 2]
    lin       = Linear(3)
    lin_inds  = [6, 8, 9]

    t = Transformer(D)

    # Add a layer and test the gradient
    t.add_layer((norm, norm_inds), (bw, bw_inds), (lin, lin_inds))

    new_data = t.forward_pass(data)
    loss     = np.sum(new_data**2)
    V        = 2 * new_data
    dloss    = t.backward_pass(V)

    dloss_est = np.zeros(dloss.shape)
    for i in xrange(N):
        for j in xrange(D):
            data[i, j] += eps
            loss_1 = np.sum(t.forward_pass(data)**2)
            data[i, j] -= 2 * eps
            loss_2 = np.sum(t.forward_pass(data)**2)
            data[i, j] += eps
            dloss_est[i, j] = (loss_1 - loss_2) / (2 * eps)

    assert np.linalg.norm(dloss - dloss_est) < 1e-6

    # Add a second layer and test the gradient
    t.add_layer(Linear(9))

    new_data = t.forward_pass(data)
    loss     = np.sum(new_data**2)
    V        = 2 * new_data
    dloss    = t.backward_pass(V)

    dloss_est = np.zeros(dloss.shape)
    for i in xrange(N):
        for j in xrange(D):
            data[i, j] += eps
            loss_1 = np.sum(t.forward_pass(data)**2)
            data[i, j] -= 2 * eps
            loss_2 = np.sum(t.forward_pass(data)**2)
            data[i, j] += eps
            dloss_est[i, j] = (loss_1 - loss_2) / (2 * eps)

    assert np.linalg.norm(dloss - dloss_est) < 1e-6
def kernel(self, x1, x2=None, grad=False):
    if x2 is None:
        x2 = x1

    cov = np.ones((x1.shape[0], x2.shape[0]))

    if grad:
        Ks  = list()
        dKs = list()
        # One gradient entry per (x1 row, x2 row, input dimension).
        cov_grad = np.zeros((x1.shape[0], x2.shape[0], x1.shape[1]))

        for i in xrange(len(self.kernels)):
            (K, dK) = self.kernels[i].kernel(x1[:, self.dim_indices[i]],
                                             x2[:, self.dim_indices[i]], grad)
            Ks.append(K)
            dKs.append(dK)
            cov = cov * K

        # Product rule: the gradient restricted to kernel i's dimensions is
        # dK_i times the product of all the other kernels, i.e. dK_i * (cov / K_i).
        for i in xrange(len(self.kernels)):
            cov_grad[:, :, self.dim_indices[i]] = (
                cov_grad[:, :, self.dim_indices[i]]
                + dKs[i] * (cov / Ks[i])[:, :, np.newaxis])

        return (cov, cov_grad)
    else:
        for i in xrange(len(self.kernels)):
            cov = cov * self.kernels[i].kernel(x1[:, self.dim_indices[i]],
                                               x2[:, self.dim_indices[i]], grad)
        return cov
def test_backward_pass():
    npr.seed(1)

    eps = 1e-5
    N   = 10
    D   = 5

    nl   = NormLin(D)
    data = 0.5 * npr.rand(N, D)

    new_data = nl.forward_pass(data)
    loss     = np.sum(new_data**2)
    V        = 2 * new_data
    dloss    = nl.backward_pass(V)

    dloss_est = np.zeros(dloss.shape)
    for i in xrange(N):
        for j in xrange(D):
            data[i, j] += eps
            loss_1 = np.sum(nl.forward_pass(data)**2)
            data[i, j] -= 2 * eps
            loss_2 = np.sum(nl.forward_pass(data)**2)
            data[i, j] += eps
            dloss_est[i, j] = (loss_1 - loss_2) / (2 * eps)

    assert np.linalg.norm(dloss - dloss_est) < 1e-6
def test_grad():
    npr.seed(1)

    eps  = 1e-5
    N    = 10
    M    = 5
    D    = 5
    inds = [0, 2, 4]

    kernel = Subset(D, Matern52(len(inds)), inds)

    data1 = npr.randn(N, D)
    data2 = npr.randn(M, D)

    loss  = np.sum(kernel.cross_cov(data1, data2))
    dloss = kernel.cross_cov_grad_data(data1, data2).sum(0)

    dloss_est = np.zeros(dloss.shape)
    for i in xrange(M):
        for j in xrange(D):
            data2[i, j] += eps
            loss_1 = np.sum(kernel.cross_cov(data1, data2))
            data2[i, j] -= 2 * eps
            loss_2 = np.sum(kernel.cross_cov(data1, data2))
            data2[i, j] += eps
            dloss_est[i, j] = (loss_1 - loss_2) / (2 * eps)

    print('Subset kernel grad using indices %s:' % inds)
    print(dloss)

    assert np.linalg.norm(dloss - dloss_est) < 1e-6
def test_predict():
    npr.seed(1)

    N     = 10
    Npend = 3
    Ntest = 2
    D     = 5

    gp   = GPClassifier(D, burnin=5, num_fantasies=7)
    pred = npr.rand(Ntest, D)

    # Test with 0 points
    mu, v = gp.predict(pred)
    np.testing.assert_allclose(mu, 0, rtol=1e-7, atol=0, err_msg='', verbose=True)
    np.testing.assert_allclose(v, 1 + 1e-6, rtol=1e-7, atol=0, err_msg='', verbose=True)

    # Test with 1 point
    X   = np.zeros((1, D))
    W   = npr.randn(D, 1)
    val = X.dot(W).flatten() > 0

    gp.fit(X, val, fit_hypers=False)
    mu, v = gp.predict(pred)

    # Points closer to the origin will have less variance and a larger mean
    mu, v = gp.predict(np.tile(np.linspace(0, 1, 100)[:, None], (1, D)))
    assert np.all(np.diff(mu) > 0) and np.all(np.diff(v) > 0)

    # Now let's make sure it doesn't break with more data and pending
    inputs  = 0.5 * npr.rand(N, D)
    vals    = inputs.dot(W).flatten() > 0
    pending = npr.rand(Npend, D)
    gp.fit(inputs, vals, pending)
    mu, v = gp.predict(pred)

    # Now let's check the gradients
    eps = 1e-5
    mu, v, dmu, dv = gp.predict(pred, compute_grad=True)

    # The implied loss is np.sum(mu**2) + np.sum(v**2)
    dloss = (2 * (dmu * mu[:, np.newaxis, :]).sum(2)
             + 2 * (v[:, np.newaxis, np.newaxis] * dv).sum(2))

    dloss_est = np.zeros(dloss.shape)
    for i in xrange(Ntest):
        for j in xrange(D):
            pred[i, j] += eps
            mu, v = gp.predict(pred)
            loss_1 = np.sum(mu**2) + np.sum(v**2)
            pred[i, j] -= 2 * eps
            mu, v = gp.predict(pred)
            loss_2 = np.sum(mu**2) + np.sum(v**2)
            pred[i, j] += eps
            dloss_est[i, j] = (loss_1 - loss_2) / (2 * eps)

    assert np.linalg.norm(dloss - dloss_est) < 1e-5
def grad_dist2(ls, x1, x2=None):
    if x2 is None:
        x2 = x1

    # Rescale.
    x1 = x1 / ls
    x2 = x2 / ls

    N = x1.shape[0]
    M = x2.shape[0]
    D = x1.shape[1]

    gX = np.zeros((x1.shape[0], x2.shape[0], x1.shape[1]))

    code = \
    """
    for (int i=0; i<N; i++)
      for (int j=0; j<M; j++)
        for (int d=0; d<D; d++)
          gX(i,j,d) = (2/ls(d))*(x1(i,d) - x2(j,d));
    """
    try:
        scipy.weave.inline(code, ['x1', 'x2', 'gX', 'ls', 'M', 'N', 'D'],
                           type_converters=scipy.weave.converters.blitz,
                           compiler='gcc')
    except:
        # The C code weave above is 10x faster than this:
        for i in xrange(0, x1.shape[0]):
            gX[i, :, :] = 2 * (x1[i, :] - x2[:, :]) * (1 / ls)

    return gX
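# scipy.weave only ever supported Python 2 and has been removed from modern SciPy
# releases, so on current installations the call above always falls through to the
# Python loop. Below is a fully vectorized sketch of the same computation using NumPy
# broadcasting; `grad_dist2_numpy` is a hypothetical name used here for illustration,
# not a function from this codebase.
import numpy as np

def grad_dist2_numpy(ls, x1, x2=None):
    """Same result as grad_dist2: an (N, M, D) array with
    gX[i, j, d] = (2 / ls[d]) * (x1[i, d]/ls[d] - x2[j, d]/ls[d])."""
    if x2 is None:
        x2 = x1
    # Rescale by the lengthscales, as in grad_dist2.
    x1 = x1 / ls
    x2 = x2 / ls
    # Broadcast (N, 1, D) - (1, M, D) -> (N, M, D), then divide by ls once more.
    return 2.0 * (x1[:, np.newaxis, :] - x2[np.newaxis, :, :]) / ls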
def paramify_and_print(self, data_vector, left_indent=0, indent_top_row=False):
    params = self.paramify(data_vector)
    indentation = ' ' * left_indent

    if indent_top_row:
        sys.stderr.write(indentation)
    sys.stderr.write('NAME         TYPE      VALUE\n')
    sys.stderr.write(indentation)
    sys.stderr.write('----         ----      -----\n')

    for param_name, param in items(params):
        if param['type'] == 'float':
            format_str = '%s%-12.12s %-9.9s %-12f\n'
        elif param['type'] == 'enum':
            format_str = '%s%-12.12s %-9.9s %-12s\n'
        else:
            format_str = '%s%-12.12s %-9.9s %-12d\n'

        for i in xrange(len(param['values'])):
            if i == 0:
                sys.stderr.write(format_str % (indentation, param_name,
                                               param['type'], param['values'][i]))
            else:
                # Repeated values: leave the name and type columns blank.
                sys.stderr.write(format_str % (indentation, '', '',
                                               param['values'][i]))
def variables_config_to_meta(self, variables_config):
    """
    Converts a dict of variable meta-information from a config-file format
    into a format that can be more easily used by bayesopt routines.
    """
    # Stores the metadata for the dataset that allows a conversion
    # from a config file representation into a matrix representation.
    # The main addition that this variable adds is a mapping between
    # each variable and associated column indices in the matrix
    # representation.
    variables_meta = OrderedDict()
    cardinality = 0  # The number of distinct variables
    num_dims    = 0  # The number of dimensions in the matrix representation

    for name, variable in items(variables_config):
        cardinality += variable['size']

        vdict = {'type': variable['type'].lower(),
                 'indices': []}  # indices stores a mapping from these variable(s) to their matrix column(s)

        if vdict['type'] == 'int':
            vdict['min'] = int(variable['min'])
            vdict['max'] = int(variable['max'])
        elif vdict['type'] == 'float':
            vdict['min'] = float(variable['min'])
            vdict['max'] = float(variable['max'])
        elif vdict['type'] == 'enum':
            vdict['options'] = list(variable['options'])
        else:
            raise Exception("Unknown variable type.")

        for i in xrange(variable['size']):
            if vdict['type'] == 'int':
                vdict['indices'].append(num_dims)
                num_dims += 1
            elif vdict['type'] == 'float':
                vdict['indices'].append(num_dims)
                num_dims += 1
            elif vdict['type'] == 'enum':
                vdict['indices'].append(
                    list(np.arange(len(list(variable['options']))) + num_dims))
                num_dims += len(list(variable['options']))
            else:
                raise Exception("Unknown variable type.")

        variables_meta[name] = vdict

    return variables_meta, num_dims, cardinality
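# A small trace of the index bookkeeping above, derived directly from the method.
# The variable names ('lr', 'optim') are hypothetical and chosen only for illustration:
# float/int variables get one matrix column per dimension, while each enum dimension
# expands into a one-hot block of columns.
from collections import OrderedDict

example_config = OrderedDict([
    ('lr',    {'type': 'FLOAT', 'size': 2, 'min': 0.0, 'max': 1.0}),
    ('optim', {'type': 'ENUM',  'size': 1, 'options': ['sgd', 'adam', 'rmsprop']}),
])

# variables_config_to_meta(example_config) would produce:
#   meta['lr']['indices']    == [0, 1]        # one column per float dimension
#   meta['optim']['indices'] == [[2, 3, 4]]   # one one-hot block per enum dimension
#   num_dims == 5, cardinality == 3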
def test_grad():
    npr.seed(1)

    eps = 1e-5
    N   = 10
    M   = 5
    D   = 5

    beta_warp = BetaWarp(2)
    norm      = Normalization(2)
    lin       = Linear(D)

    transformer = Transformer(D)

    # Each entry is a tuple, (transformation, indices_it_acts_on)
    transformer.add_layer((beta_warp, [0, 2]), (norm, [1, 4]))  # This is crazy. We would never do this.

    # One transformation means apply to all dimensions.
    transformer.add_layer(lin)

    kernel = TransformKernel(Matern52(lin.num_factors), transformer)

    data1 = npr.rand(N, D)
    data2 = npr.rand(M, D)

    loss  = np.sum(kernel.cross_cov(data1, data2))
    dloss = kernel.cross_cov_grad_data(data1, data2).sum(0)

    dloss_est = np.zeros(dloss.shape)
    for i in xrange(M):
        for j in xrange(D):
            data2[i, j] += eps
            loss_1 = np.sum(kernel.cross_cov(data1, data2))
            data2[i, j] -= 2 * eps
            loss_2 = np.sum(kernel.cross_cov(data1, data2))
            data2[i, j] += eps
            dloss_est[i, j] = (loss_1 - loss_2) / (2 * eps)

    assert np.linalg.norm(dloss - dloss_est) < 1e-6
def create_task():
    task_name = "mytask"
    task_type = "OBJECTIVE"

    variables_config = OrderedDict([
        ('X', {"type": "INT",   "size": 2, "min": -1,     "max": 10}),
        ('Y', {"type": "FLOAT", "size": 3, "min": -0.003, "max": 1e-1}),
        ('Z', {"type": "ENUM",  "size": 2, "options": ["one", "two", "three"]})
    ])

    variables_meta, num_dims, cardinality = Task.variables_config_to_meta(variables_config)

    # Create a set of inputs that satisfies the constraints of each variable
    X = np.zeros((10, num_dims))
    for i in xrange(10):
        for name, variable in items(variables_meta):
            indices = variable['indices']
            if variable['type'] == 'int':
                X[i, indices] = np.random.randint(variable['min'],
                                                  variable['max'] + 1,
                                                  len(indices))
            elif variable['type'] == 'float':
                X[i, indices] = (np.random.rand(len(indices))
                                 * (variable['max'] - variable['min'])
                                 + variable['min'])
            elif variable['type'] == 'enum':
                for ind in indices:
                    cat = np.random.randint(len(ind))
                    X[i, ind[cat]] = 1

    y = np.random.randn(10)

    t = Task(task_name, task_type, variables_config, data=X, values=y)

    return t
def test_fit():
    npr.seed(1)

    N             = 10
    D             = 5
    burnin        = 100
    mcmc_iters    = 100
    num_pending   = 3
    num_fantasies = 2

    gp = GPClassifier(D, burnin=burnin, mcmc_iters=mcmc_iters,
                      num_fantasies=num_fantasies)

    inputs = np.vstack((0.1 * npr.rand(N, D), npr.rand(N, D)))
    inputs[12] = np.ones(D)
    pending = npr.rand(num_pending, D)
    W = npr.randn(D, 1)
    vals = (inputs - inputs.mean(0)).dot(W).flatten() > 0

    gp.fit(inputs, vals, pending)

    probs = np.zeros(inputs.shape[0])
    for i in xrange(gp.num_states):
        gp.set_state(i)
        probs += (gp.latent_values.value > 0) / float(mcmc_iters)

    assert np.all(probs[:N] < 0.5) and np.all(probs[N:] > 0.5)

    assert gp.values.shape[0] == 2 * N + num_pending
    assert gp.values.shape[1] == 2
    assert gp.chain_length == burnin + mcmc_iters
    assert all([np.all(p.value != p.initial_value) for p in gp.params.values()])
    assert len(gp._cache_list) == mcmc_iters
    assert len(gp._hypers_list) == mcmc_iters
    assert len(gp._latent_values_list) == mcmc_iters
    assert len(gp._fantasy_values_list) == mcmc_iters
    return cur_x, cur_llh
    # return (cur_x, funEvals['funevals']) if returnFunEvals else cur_x

if __name__ == '__main__':
    npr.seed(1)

    import pylab as pl
    import pymc

    D  = 10
    fn = lambda x: -0.5 * np.sum(x**2)

    iters = 1000
    samps = np.zeros((iters, D))
    for ii in xrange(1, iters):
        # slice_sample returns (sample, log-likelihood); keep only the sample.
        samps[ii, :] = slice_sample(samps[ii - 1, :], fn, sigma=0.1,
                                    step_out=False, doubling_step=True,
                                    verbose=False)[0]

    ll = -0.5 * np.sum(samps**2, axis=1)

    scores = pymc.geweke(ll)
    pymc.Matplot.geweke_plot(scores, 'test')

    pymc.raftery_lewis(ll, q=0.025, r=0.01)

    pymc.Matplot.autocorrelation(ll, 'test')
def test_predict():
    npr.seed(1)

    N     = 10
    Npend = 3
    Ntest = 2
    D     = 5

    gp   = GP(D, burnin=5, num_fantasies=7)
    pred = npr.rand(Ntest, D)

    # Test with 0 points
    mu, v = gp.predict(pred)
    np.testing.assert_allclose(mu, 0, rtol=1e-7, atol=0, err_msg='', verbose=True)
    np.testing.assert_allclose(v, 1 + 1e-6, rtol=1e-7, atol=0, err_msg='', verbose=True)

    # Test with 1 point
    X   = np.zeros((1, D))
    W   = npr.randn(D, 1)
    val = X.dot(W).flatten() + np.sqrt(1e-3) * npr.randn()

    gp.fit(X, val, fit_hypers=False)
    mu, v = gp.predict(pred)

    # Points closer to the origin will have less variance
    if np.linalg.norm(pred[0] - X) < np.linalg.norm(pred[1] - X):
        assert v[0] < v[1]
    else:
        assert v[0] > v[1]

    # Predict at the point itself
    mu, v = gp.predict(X)
    np.testing.assert_allclose(mu, val, rtol=1e-5, atol=0, err_msg='', verbose=True)

    # Now let's make sure it doesn't break with more data and pending
    inputs  = npr.rand(N, D)
    vals    = inputs.dot(W).flatten() + np.sqrt(1e-3) * npr.randn(N)
    pending = npr.rand(Npend, D)
    gp.fit(inputs, vals, pending)
    mu, v = gp.predict(pred)

    # Now let's check the gradients
    eps = 1e-5
    mu, v, dmu, dv = gp.predict(pred, compute_grad=True)

    # The implied loss is np.sum(mu**2) + np.sum(v**2)
    dloss = (2 * (dmu * mu[:, np.newaxis, :]).sum(2)
             + 2 * (v[:, np.newaxis, np.newaxis] * dv).sum(2))

    dloss_est = np.zeros(dloss.shape)
    for i in xrange(Ntest):
        for j in xrange(D):
            pred[i, j] += eps
            mu, v = gp.predict(pred)
            loss_1 = np.sum(mu**2) + np.sum(v**2)
            pred[i, j] -= 2 * eps
            mu, v = gp.predict(pred)
            loss_2 = np.sum(mu**2) + np.sum(v**2)
            pred[i, j] += eps
            dloss_est[i, j] = (loss_1 - loss_2) / (2 * eps)

    assert np.linalg.norm(dloss - dloss_est) < 1e-6
def fast_chol_add(L, A):
    U = L.T

    # Add a row and column to U
    # Assume that you can pass in a cholesky that's the same
    # size as the kernel (then the last row/col will be clobbered)
    if U.shape[0] < A.shape[0]:
        G = np.zeros(A.shape)
        G[:U.shape[0], :U.shape[1]] = U
        U = G

    (rows, cols) = A.shape
    isPosDef = 1
    j = rows - 1

    try:
        code = \
        """
        double s = 0;
        for (int i=0; i<cols; i++)
        {
            s = A(i,j);
            for (int ind=0; ind<i; ind++)
                s -= U(ind,i) * U(ind,j);

            if (i == j)
            {
                if (s <= 0)
                {
                    isPosDef = 0;
                    U(i,i)   = 0;
                }
                else
                {
                    U(i,i) = sqrt(s);
                }
            }
            else
            {
                if (U(i,i) > 0)
                    U(i,j) = s / U(i,i);
                else
                    U(i,j) = 0;
            }
        }
        """
        scipy.weave.inline(code, ['U', 'A', 'j', 'isPosDef', 'rows', 'cols'],
                           type_converters=scipy.weave.converters.blitz,
                           compiler='gcc')
    except:
        k = np.arange(cols)
        for i in xrange(cols):
            j = rows - 1
            s = A[i, j] - np.dot(U[k[:i], i].T, U[k[:i], j])
            if i == j:
                if s <= 0:
                    isPosDef = 0
                    U[i, i] = 0
                else:
                    U[i, i] = np.sqrt(s)
            else:
                if U[i, i] > 0:
                    U[i, j] = s / U[i, i]
                else:
                    U[i, j] = 0

    L = U.T

    return L, isPosDef
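# A minimal usage sketch for fast_chol_add, assuming numpy and fast_chol_add are
# importable in this namespace. Starting from the Cholesky factor of the leading
# (n-1) x (n-1) block, the function fills in the last row/column for the full matrix,
# which can be checked against a full np.linalg.cholesky.
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(6, 10)
A = X.dot(X.T) + np.eye(6)          # a well-conditioned positive-definite matrix

L_small = np.linalg.cholesky(A[:-1, :-1])   # factor of the leading block
L_full, is_pos_def = fast_chol_add(L_small, A)

assert is_pos_def
np.testing.assert_allclose(L_full, np.linalg.cholesky(A), atol=1e-8)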
def geweke_correctness_test(self):
    print('Initiating Geweke Correctness test')
    # Note: the horseshoe prior on the noise will make the line slightly not straight
    # because we don't have the actual log pdf
    import matplotlib.pyplot as plt

    # First, check that all priors and models can be sampled from
    for param in self.hypers:
        if not hasattr(param.prior, 'sample'):
            print('Prior of param %s cannot be sampled from. '
                  'Cannot perform the Geweke correctness test.' % param.name)
            return

    n = 10000  # number of samples
    # n = self.mcmc_iters
    statistic_of_interest = np.mean

    true_data = copy.copy(self.data)  # reset this at the end

    # Case A:
    # 1) Draw new hypers from priors
    # 2) Draw new data given hypers (**NOT** given hypers and data !!!!)
    caseA = np.zeros(n)
    for i in xrange(n):
        if i % 1000 == 0:
            print('Geweke Part A Sample %d/%d' % (i, n))
        for param in self.hypers:
            param.sample_from_prior()
        latent_y = self.sample_from_prior_given_hypers(self.data)  # only inputs used
        # fants = latent_y
        fants = self.observation_model(latent_y)
        # self.noise.print_diagnostics()
        # print fants
        caseA[i] = statistic_of_interest(fants)

    # Case B:
    # 1) Resample all hypers one step given data
    # 2) Resample data given hypers
    # repeat a bunch of times
    caseB = np.zeros(n)
    for i in xrange(n):
        if i % 1000 == 0:
            print('Geweke Part B Sample %d/%d' % (i, n))
        # Take MCMC step on theta given data
        self.sampler.generate_sample()  # data['inputs'] and data['values'] used
        # Resample data
        latent_y = self.sample_from_prior_given_hypers(self.data)  # only data['inputs'] used
        # self.data['values'] = latent_y
        self.data['values'] = self.observation_model(latent_y)  # add noise
        # self.noise.print_diagnostics()
        # print self.data['values']
        caseB[i] = statistic_of_interest(self.data['values'])

    print(np.mean(caseA))
    print(np.std(caseA))
    print(np.mean(caseB))
    print(np.std(caseB))

    # Then, sort the sets A and B.
    caseA = np.sort(caseA)
    caseB = np.sort(caseB)

    # Then for each a in A, take the fraction of B smaller than it.
    yAxis = np.zeros(n)
    for i in xrange(n):
        yAxis[i] = np.sum(caseB < caseA[i]) / float(n)

    xAxis = np.arange(n) / float(n)

    # Plot fractional index of a vs this fraction.
    # Repeat for all a in A so number of points on graph is |A| ( = |B| )
    if not os.path.isdir('diagnostics'):
        os.mkdir('diagnostics')
    if not os.path.isdir('diagnostics/correctness'):
        os.mkdir('diagnostics/correctness')

    plt.figure(1)
    plt.clf()
    plt.plot(xAxis, yAxis, 'b')
    plt.plot(xAxis, xAxis, '--r')
    plt.title('Geweke test P-P plot with %d samples' % n)
    plt.savefig('diagnostics/correctness/GewekeCorrectness_%d_samples.pdf' % n)

    self.data = true_data