class ExperimentGrid:

    @staticmethod
    def job_running(expt_dir, id):
        expt_grid = ExperimentGrid(expt_dir)
        expt_grid.set_running(id)

    @staticmethod
    def job_complete(expt_dir, id, value, duration):
        log("setting job %d complete" % id)
        expt_grid = ExperimentGrid(expt_dir)
        expt_grid.set_complete(id, value, duration)
        log("set...")

    @staticmethod
    def job_broken(expt_dir, id):
        expt_grid = ExperimentGrid(expt_dir)
        expt_grid.set_broken(id)

    def __init__(self, expt_dir, variables=None, grid_size=None, grid_seed=1):
        self.expt_dir = expt_dir
        self.jobs_pkl = os.path.join(expt_dir, EXPERIMENT_GRID_FILE)
        self.locker = Locker()

        # Only one process at a time is allowed to have access to the grid.
        self.locker.lock_wait(self.jobs_pkl)

        # Set up the grid for the first time if it doesn't exist.
        if variables is not None and not os.path.exists(self.jobs_pkl):
            self.seed = grid_seed
            self.vmap = GridMap(variables, grid_size)
            self.grid = self._hypercube_grid(self.vmap.card(), grid_size)
            self.status = np.zeros(grid_size, dtype=int) + CANDIDATE_STATE
            self.values = np.zeros(grid_size) + np.nan
            self.durs = np.zeros(grid_size) + np.nan
            self.proc_ids = np.zeros(grid_size, dtype=int)
            self._save_jobs()
        # Or load in the grid from the pickled file.
        else:
            self._load_jobs()

    def __del__(self):
        self._save_jobs()
        if self.locker.unlock(self.jobs_pkl):
            pass
        else:
            raise Exception("Could not release lock on job grid.\n")

    def get_grid(self):
        return self.grid, self.values, self.durs

    def get_candidates(self):
        return np.nonzero(self.status == CANDIDATE_STATE)[0]

    def get_pending(self):
        return np.nonzero((self.status == SUBMITTED_STATE) | (self.status == RUNNING_STATE))[0]

    def get_complete(self):
        return np.nonzero(self.status == COMPLETE_STATE)[0]

    def get_broken(self):
        return np.nonzero(self.status == BROKEN_STATE)[0]

    def get_params(self, index):
        return self.vmap.get_params(self.grid[index, :])

    def get_best(self):
        finite = self.values[np.isfinite(self.values)]
        if len(finite) > 0:
            cur_min = np.min(finite)
            index = np.nonzero(self.values == cur_min)[0][0]
            return cur_min, index
        else:
            return np.nan, -1

    def get_proc_id(self, id):
        return self.proc_ids[id]

    def add_to_grid(self, candidate):
        # Checks to prevent numerical over/underflow from corrupting the grid
        candidate[candidate > 1.0] = 1.0
        candidate[candidate < 0.0] = 0.0

        # Set up the grid
        self.grid = np.vstack((self.grid, candidate))
        self.status = np.append(self.status, np.zeros(1, dtype=int) + int(CANDIDATE_STATE))
        self.values = np.append(self.values, np.zeros(1) + np.nan)
        self.durs = np.append(self.durs, np.zeros(1) + np.nan)
        self.proc_ids = np.append(self.proc_ids, np.zeros(1, dtype=int))

        # Save this out.
        self._save_jobs()
        return self.grid.shape[0] - 1

    def set_candidate(self, id):
        self.status[id] = CANDIDATE_STATE
        self._save_jobs()

    def set_submitted(self, id, proc_id):
        self.status[id] = SUBMITTED_STATE
        self.proc_ids[id] = proc_id
        self._save_jobs()

    def set_running(self, id):
        self.status[id] = RUNNING_STATE
        self._save_jobs()

    def set_complete(self, id, value, duration):
        self.status[id] = COMPLETE_STATE
        self.values[id] = value
        self.durs[id] = duration
        self._save_jobs()

    def set_broken(self, id):
        self.status[id] = BROKEN_STATE
        self._save_jobs()

    def _load_jobs(self):
        fh = open(self.jobs_pkl, 'r')
        jobs = cPickle.load(fh)
        fh.close()

        self.vmap = jobs['vmap']
        self.grid = jobs['grid']
        self.status = jobs['status']
        self.values = jobs['values']
        self.durs = jobs['durs']
        self.proc_ids = jobs['proc_ids']

    def _save_jobs(self):
        # Write everything to a temporary file first.
        fh = tempfile.NamedTemporaryFile(mode='w', delete=False)
        cPickle.dump({'vmap': self.vmap,
                      'grid': self.grid,
                      'status': self.status,
                      'values': self.values,
                      'durs': self.durs,
                      'proc_ids': self.proc_ids},
                     fh, protocol=-1)
        fh.close()

        # Use an atomic move for better NFS happiness.
        cmd = 'mv "%s" "%s"' % (fh.name, self.jobs_pkl)
        os.system(cmd)  # TODO: Should check system-dependent return status.

    def _hypercube_grid(self, dims, size):
        # Generate from a sobol sequence
        sobol_grid = np.transpose(i4_sobol_generate(dims, size, self.seed))
        return sobol_grid
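# A minimal usage sketch of ExperimentGrid (not part of the original module).
# The experiment directory and `variables` description are placeholders;
# `variables` must be in whatever format GridMap expects. The function is never
# called here and exists only to illustrate the API exposed above.
def _experiment_grid_usage_sketch(expt_dir, variables):
    grid = ExperimentGrid(expt_dir, variables=variables, grid_size=1000, grid_seed=1)
    candidates = grid.get_candidates()        # indices of jobs not yet submitted
    best_val, best_idx = grid.get_best()      # best finite objective value seen so far
    params = grid.get_params(candidates[0])   # map a unit-hypercube point back to parameters
    grid.set_submitted(candidates[0], proc_id=0)
    return params, best_val, best_idx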
class GPEIOptChooser:

    def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10,
                 pending_samples=100, noiseless=False, burnin=100,
                 grid_subset=20, use_multiprocessing=True):
        self.cov_func = getattr(gp, covar)
        self.locker = Locker()
        self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl")
        self.stats_file = os.path.join(expt_dir, self.__module__ + "_hyperparameters.txt")
        self.mcmc_iters = int(mcmc_iters)
        self.burnin = int(burnin)
        self.needs_burnin = True
        self.pending_samples = int(pending_samples)
        self.D = -1
        self.hyper_iters = 1
        # Number of points to optimize EI over
        self.grid_subset = int(grid_subset)
        self.noiseless = bool(int(noiseless))
        self.hyper_samples = []

        self.noise_scale = 0.1  # horseshoe prior
        self.amp2_scale = 1     # zero-mean log normal prior
        self.max_ls = 2         # top-hat prior on length scales

        # If multiprocessing fails or deadlocks, set this to False
        self.use_multiprocessing = bool(int(use_multiprocessing))

    def dump_hypers(self):
        self.locker.lock_wait(self.state_pkl)

        # Write the hyperparameters out to a Pickle.
        fh = tempfile.NamedTemporaryFile(mode='wb', delete=False)
        pickle.dump({'dims': self.D,
                     'ls': self.ls,
                     'amp2': self.amp2,
                     'noise': self.noise,
                     'hyper_samples': self.hyper_samples,
                     'mean': self.mean},
                    fh)
        fh.close()

        # Use an atomic move for better NFS happiness.
        cmd = 'mv "%s" "%s"' % (fh.name, self.state_pkl)
        os.system(cmd)  # TODO: Should check system-dependent return status.

        self.locker.unlock(self.state_pkl)

        # Write the hyperparameters out to a human readable file as well
        fh = open(self.stats_file, 'w')
        fh.write('Mean Noise Amplitude <length scales>\n')
        fh.write('-----------ALL SAMPLES-------------\n')
        meanhyps = 0 * np.hstack(self.hyper_samples[0])
        for i in self.hyper_samples:
            hyps = np.hstack(i)
            meanhyps += (1 / float(len(self.hyper_samples))) * hyps
            for j in hyps:
                fh.write(str(j) + ' ')
            fh.write('\n')

        fh.write('-----------MEAN OF SAMPLES-------------\n')
        for j in meanhyps:
            fh.write(str(j) + ' ')
        fh.write('\n')
        fh.close()

    # This passes out html or javascript to display interesting
    # stats - such as the length scales (sensitivity to various
    # dimensions).
    def generate_stats_html(self):
        # Need this because the model may not necessarily be
        # initialized when this code is called.
        if not self._read_only():
            return 'Chooser not yet ready to display output'

        mean_mean = np.mean(np.vstack([h[0] for h in self.hyper_samples]))
        mean_noise = np.mean(np.vstack([h[1] for h in self.hyper_samples]))
        mean_ls = np.mean(np.vstack([h[3][np.newaxis, :] for h in self.hyper_samples]), 0)

        try:
            output = (
                '<br /><span class="label label-info">Estimated mean:</span> ' + str(mean_mean) +
                '<br /><span class="label label-info">Estimated noise:</span> ' + str(mean_noise) +
                '<br /><br /><span class="label label-info">Inverse parameter sensitivity' +
                ' - Gaussian Process length scales</span><br /><br />' +
                '<div id="lschart"></div><script type="text/javascript">' +
                'var lsdata = [' + ','.join(['%.2f' % i for i in mean_ls]) + '];')
        except:
            return 'Chooser not yet ready to display output.'

        output += ('bar_chart("#lschart", lsdata, ' + str(self.max_ls) + ');</script>')
        return output

    # Read in the chooser from file. Returns True only on success.
    def _read_only(self):
        if os.path.exists(self.state_pkl):
            fh = open(self.state_pkl, 'rb')
            state = pickle.load(fh)
            fh.close()

            self.D = state['dims']
            self.ls = state['ls']
            self.amp2 = state['amp2']
            self.noise = state['noise']
            self.mean = state['mean']
            self.hyper_samples = state['hyper_samples']
            self.needs_burnin = False
            return True

        return False

    def _real_init(self, dims, values):
        self.locker.lock_wait(self.state_pkl)

        self.randomstate = npr.get_state()
        if os.path.exists(self.state_pkl):
            fh = open(self.state_pkl, 'rb')
            state = pickle.load(fh)
            fh.close()

            self.D = state['dims']
            self.ls = state['ls']
            self.amp2 = state['amp2']
            self.noise = state['noise']
            self.mean = state['mean']
            self.hyper_samples = state['hyper_samples']
            self.needs_burnin = False
        else:
            # Input dimensionality.
            self.D = dims

            # Initial length scales.
            self.ls = np.ones(self.D)

            # Initial amplitude.
            self.amp2 = np.std(values) + 1e-4

            # Initial observation noise.
            self.noise = 1e-3

            # Initial mean.
            self.mean = np.mean(values)

            # Save hyperparameter samples
            self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls))

        self.locker.unlock(self.state_pkl)

    def cov(self, x1, x2=None):
        if x2 is None:
            return self.amp2 * (self.cov_func(self.ls, x1, None) + 1e-6 * np.eye(x1.shape[0]))
        else:
            return self.amp2 * self.cov_func(self.ls, x1, x2)

    # Given a set of completed 'experiments' in the unit hypercube with
    # corresponding objective 'values', pick from the next experiment to
    # run according to the acquisition function.
    def next(self, grid, values, durations, candidates, pending, complete):
        # Don't bother using fancy GP stuff at first.
        if complete.shape[0] < 2:
            return int(candidates[0])

        # Perform the real initialization.
        if self.D == -1:
            self._real_init(grid.shape[1], values[complete])

        # Grab out the relevant sets.
        comp = grid[complete, :]
        cand = grid[candidates, :]
        pend = grid[pending, :]
        vals = values[complete]
        numcand = cand.shape[0]

        # Spray a set of candidates around the min so far
        best_comp = np.argmin(vals)
        cand2 = np.vstack((np.random.randn(10, comp.shape[1]) * 0.001 + comp[best_comp, :], cand))

        if self.mcmc_iters > 0:
            # Possibly burn in.
            if self.needs_burnin:
                for mcmc_iter in range(self.burnin):
                    self.sample_hypers(comp, vals)
                    log("BURN %d/%d] mean: %.2f amp: %.2f "
                        "noise: %.4f min_ls: %.4f max_ls: %.4f" %
                        (mcmc_iter + 1, self.burnin, self.mean, np.sqrt(self.amp2),
                         self.noise, np.min(self.ls), np.max(self.ls)))
                self.needs_burnin = False

            # Sample from hyperparameters.
            # Adjust the candidates to hit ei peaks
            self.hyper_samples = []
            for mcmc_iter in range(self.mcmc_iters):
                self.sample_hypers(comp, vals)
                log("%d/%d] mean: %.2f amp: %.2f noise: %.4f "
                    "min_ls: %.4f max_ls: %.4f" %
                    (mcmc_iter + 1, self.mcmc_iters, self.mean, np.sqrt(self.amp2),
                     self.noise, np.min(self.ls), np.max(self.ls)))
            self.dump_hypers()

            b = []  # optimization bounds
            for i in range(0, cand.shape[1]):
                b.append((0, 1))

            overall_ei = self.ei_over_hypers(comp, pend, cand2, vals)
            inds = np.argsort(np.mean(overall_ei, axis=1))[-self.grid_subset:]
            cand2 = cand2[inds, :]

            # Optimize each point in parallel
            if self.use_multiprocessing:
                pool = multiprocessing.Pool(self.grid_subset)
                results = [pool.apply_async(optimize_pt,
                                            args=(c, b, comp, pend, vals, copy.copy(self)))
                           for c in cand2]
                for res in results:
                    cand = np.vstack((cand, res.get(1e8)))
                pool.close()
            else:
                # Old fallback path: optimize each point serially without multiprocessing.
                for i in range(0, cand2.shape[0]):
                    log("Optimizing candidate %d/%d" % (i + 1, cand2.shape[0]))
                    # self.check_grad_ei(cand2[i, :].flatten(), comp, pend, vals)
                    ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei_over_hypers,
                                            cand2[i, :].flatten(),
                                            args=(comp, pend, vals),
                                            bounds=b, disp=0)
                    cand2[i, :] = ret[0]
                cand = np.vstack((cand, cand2))

            overall_ei = self.ei_over_hypers(comp, pend, cand, vals)
            best_cand = np.argmax(np.mean(overall_ei, axis=1))

            if (best_cand >= numcand):
                return (int(numcand), cand[best_cand, :])

            return int(candidates[best_cand])
        else:
            # Optimize hyperparameters
            self.optimize_hypers(comp, vals)

            log("mean: %.2f amp: %.2f noise: %.4f "
                "min_ls: %.4f max_ls: %.4f" %
                (self.mean, np.sqrt(self.amp2), self.noise, np.min(self.ls), np.max(self.ls)))

            # Optimize over EI
            b = []  # optimization bounds
            for i in range(0, cand.shape[1]):
                b.append((0, 1))

            for i in range(0, cand2.shape[0]):
                ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei,
                                        cand2[i, :].flatten(),
                                        args=(comp, pend, vals, True),
                                        bounds=b, disp=0)
                cand2[i, :] = ret[0]
            cand = np.vstack((cand, cand2))

            ei = self.compute_ei(comp, pend, cand, vals)
            best_cand = np.argmax(ei)

            if (best_cand >= numcand):
                return (int(numcand), cand[best_cand, :])

            return int(candidates[best_cand])

    # Compute EI over hyperparameter samples
    def ei_over_hypers(self, comp, pend, cand, vals):
        overall_ei = np.zeros((cand.shape[0], self.mcmc_iters))
        for mcmc_iter in range(self.mcmc_iters):
            hyper = self.hyper_samples[mcmc_iter]
            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]
            overall_ei[:, mcmc_iter] = self.compute_ei(comp, pend, cand, vals)
        return overall_ei

    def check_grad_ei(self, cand, comp, pend, vals):
        (ei, dx1) = self.grad_optimize_ei_over_hypers(cand, comp, pend, vals)
        dx2 = dx1 * 0
        idx = np.zeros(cand.shape[0])
        for i in range(0, cand.shape[0]):
            idx[i] = 1e-6
            (ei1, tmp) = self.grad_optimize_ei_over_hypers(cand + idx, comp, pend, vals)
            (ei2, tmp) = self.grad_optimize_ei_over_hypers(cand - idx, comp, pend, vals)
            dx2[i] = (ei1 - ei2) / (2 * 1e-6)
            idx[i] = 0
        print('computed grads', dx1)
        print('finite diffs', dx2)
        print((dx1 / dx2))
        print(np.sum((dx1 - dx2)**2))
        time.sleep(2)

    # Adjust points by optimizing EI over a set of hyperparameter samples
    def grad_optimize_ei_over_hypers(self, cand, comp, pend, vals, compute_grad=True):
        summed_ei = 0
        summed_grad_ei = np.zeros(cand.shape).flatten()
        ls = self.ls.copy()
        amp2 = self.amp2
        mean = self.mean
        noise = self.noise

        for hyper in self.hyper_samples:
            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]
            if compute_grad:
                (ei, g_ei) = self.grad_optimize_ei(cand, comp, pend, vals, compute_grad)
                summed_grad_ei = summed_grad_ei + g_ei
            else:
                ei = self.grad_optimize_ei(cand, comp, pend, vals, compute_grad)
            summed_ei += ei

        self.mean = mean
        self.amp2 = amp2
        self.noise = noise
        self.ls = ls.copy()

        if compute_grad:
            return (summed_ei, summed_grad_ei)
        else:
            return summed_ei

    # Adjust points based on optimizing their ei
    def grad_optimize_ei(self, cand, comp, pend, vals, compute_grad=True):
        if pend.shape[0] == 0:
            best = np.min(vals)
            cand = np.reshape(cand, (-1, comp.shape[1]))

            # The primary covariances for prediction.
            comp_cov = self.cov(comp)
            cand_cross = self.cov(comp, cand)

            # Compute the required Cholesky.
            obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0])
            obsv_chol = spla.cholesky(obsv_cov, lower=True)

            cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__)
            cand_cross_grad = cov_grad_func(self.ls, comp, cand)

            # Predictive things.
            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v)
            u = (best - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            if not compute_grad:
                return ei

            # Gradients of ei w.r.t. mean and variance
            g_ei_m = -ncdf
            g_ei_s2 = 0.5 * npdf / func_s

            # Apply covariance function
            grad_cross = np.squeeze(cand_cross_grad)

            grad_xp_m = np.dot(alpha.transpose(), grad_cross)
            grad_xp_v = np.dot(-2 * spla.cho_solve((obsv_chol, True), cand_cross).transpose(),
                               grad_cross)

            grad_xp = 0.5 * self.amp2 * (grad_xp_m * g_ei_m + grad_xp_v * g_ei_s2)
            ei = -np.sum(ei)

            return ei, grad_xp.flatten()
        else:
            # If there are pending experiments, fantasize their outcomes.
            cand = np.reshape(cand, (-1, comp.shape[1]))

            # Create a composite vector of complete and pending.
            comp_pend = np.concatenate((comp, pend))

            # Compute the covariance and Cholesky decomposition.
            comp_pend_cov = (self.cov(comp_pend) + self.noise * np.eye(comp_pend.shape[0]))
            comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True)

            # Compute submatrices.
            pend_cross = self.cov(comp, pend)
            pend_kappa = self.cov(pend)

            # Use the sub-Cholesky.
            obsv_chol = comp_pend_chol[:comp.shape[0], :comp.shape[0]]

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.cho_solve((obsv_chol, True), pend_cross)

            # Finding predictive means and variances.
            pend_m = np.dot(pend_cross.T, alpha) + self.mean
            pend_K = pend_kappa - np.dot(pend_cross.T, beta)

            # Take the Cholesky of the predictive covariance.
            pend_chol = spla.cholesky(pend_K, lower=True)

            # Make predictions.
            npr.set_state(self.randomstate)
            pend_fant = np.dot(pend_chol,
                               npr.randn(pend.shape[0], self.pending_samples)) + pend_m[:, None]

            # Include the fantasies.
            fant_vals = np.concatenate(
                (np.tile(vals[:, np.newaxis], (1, self.pending_samples)), pend_fant))

            # Compute bests over the fantasies.
            bests = np.min(fant_vals, axis=0)

            # Now generalize from these fantasies.
            cand_cross = self.cov(comp_pend, cand)
            cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__)
            cand_cross_grad = cov_grad_func(self.ls, comp_pend, cand)

            # Solve the linear systems.
            alpha = spla.cho_solve((comp_pend_chol, True), fant_vals - self.mean)
            beta = spla.solve_triangular(comp_pend_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v[:, np.newaxis])
            u = (bests[np.newaxis, :] - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            # Gradients of ei w.r.t. mean and variance
            g_ei_m = -ncdf
            g_ei_s2 = 0.5 * npdf / func_s

            # Apply covariance function
            # Squeeze can break the 1D case, be careful
            if pend.shape[1] == 1:
                grad_cross = np.squeeze(cand_cross_grad, axis=(2,))
            else:
                grad_cross = np.squeeze(cand_cross_grad)

            grad_xp_m = np.dot(alpha.transpose(), grad_cross)
            grad_xp_v = np.dot(-2 * spla.cho_solve((comp_pend_chol, True), cand_cross).transpose(),
                               grad_cross)

            grad_xp = 0.5 * self.amp2 * (grad_xp_m * np.tile(g_ei_m, (comp.shape[1], 1)).T +
                                         (grad_xp_v.T * g_ei_s2).T)
            ei = -np.mean(ei, axis=1)
            grad_xp = np.mean(grad_xp, axis=0)

            return ei, grad_xp.flatten()

    def compute_ei(self, comp, pend, cand, vals):
        if pend.shape[0] == 0:
            # If there are no pending, don't do anything fancy.

            # Current best.
            best = np.min(vals)

            # The primary covariances for prediction.
            comp_cov = self.cov(comp)
            cand_cross = self.cov(comp, cand)

            # Compute the required Cholesky.
            obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0])
            obsv_chol = spla.cholesky(obsv_cov, lower=True)

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v)
            u = (best - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            return ei
        else:
            # If there are pending experiments, fantasize their outcomes.

            # Create a composite vector of complete and pending.
            comp_pend = np.concatenate((comp, pend))

            # Compute the covariance and Cholesky decomposition.
            comp_pend_cov = (self.cov(comp_pend) + self.noise * np.eye(comp_pend.shape[0]))
            comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True)

            # Compute submatrices.
            pend_cross = self.cov(comp, pend)
            pend_kappa = self.cov(pend)

            # Use the sub-Cholesky.
            obsv_chol = comp_pend_chol[:comp.shape[0], :comp.shape[0]]

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.cho_solve((obsv_chol, True), pend_cross)

            # Finding predictive means and variances.
            pend_m = np.dot(pend_cross.T, alpha) + self.mean
            pend_K = pend_kappa - np.dot(pend_cross.T, beta)

            # Take the Cholesky of the predictive covariance.
            pend_chol = spla.cholesky(pend_K, lower=True)

            # Make predictions.
            npr.set_state(self.randomstate)
            pend_fant = np.dot(pend_chol,
                               npr.randn(pend.shape[0], self.pending_samples)) + pend_m[:, None]

            # Include the fantasies.
            fant_vals = np.concatenate(
                (np.tile(vals[:, np.newaxis], (1, self.pending_samples)), pend_fant))

            # Compute bests over the fantasies.
            bests = np.min(fant_vals, axis=0)

            # Now generalize from these fantasies.
            cand_cross = self.cov(comp_pend, cand)

            # Solve the linear systems.
            alpha = spla.cho_solve((comp_pend_chol, True), fant_vals - self.mean)
            beta = spla.solve_triangular(comp_pend_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v[:, np.newaxis])
            u = (bests[np.newaxis, :] - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            return np.mean(ei, axis=1)

    def sample_hypers(self, comp, vals):
        if self.noiseless:
            self.noise = 1e-3
            self._sample_noiseless(comp, vals)
        else:
            self._sample_noisy(comp, vals)
        self._sample_ls(comp, vals)
        self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls))

    def _sample_ls(self, comp, vals):
        def logprob(ls):
            if np.any(ls < 0) or np.any(ls > self.max_ls):
                return -np.inf

            cov = (self.amp2 * (self.cov_func(ls, comp, None) + 1e-6 * np.eye(comp.shape[0])) +
                   self.noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - self.mean)
            lp = (-np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(vals - self.mean, solve))
            return lp

        self.ls = util.slice_sample(self.ls, logprob, compwise=True)

    def _sample_noisy(self, comp, vals):
        def logprob(hypers):
            mean = hypers[0]
            amp2 = hypers[1]
            noise = hypers[2]

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf
            if amp2 < 0 or noise < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.ls, comp, None) + 1e-6 * np.eye(comp.shape[0])) +
                   noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(vals - mean, solve)

            # Roll in noise horseshoe prior.
            lp += np.log(np.log(1 + (self.noise_scale / noise)**2))

            # Roll in amplitude lognormal prior
            lp -= 0.5 * (np.log(np.sqrt(amp2)) / self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2, self.noise]),
                                   logprob, compwise=False)
        self.mean = hypers[0]
        self.amp2 = hypers[1]
        self.noise = hypers[2]

    def _sample_noiseless(self, comp, vals):
        def logprob(hypers):
            mean = hypers[0]
            amp2 = hypers[1]
            noise = 1e-3

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf
            if amp2 < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.ls, comp, None) + 1e-6 * np.eye(comp.shape[0])) +
                   noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(vals - mean, solve)

            # Roll in amplitude lognormal prior
            lp -= 0.5 * (np.log(np.sqrt(amp2)) / self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2, self.noise]),
                                   logprob, compwise=False)
        self.mean = hypers[0]
        self.amp2 = hypers[1]
        self.noise = 1e-3

    def optimize_hypers(self, comp, vals):
        mygp = gp.GP(self.cov_func.__name__)
        mygp.real_init(comp.shape[1], vals)
        mygp.optimize_hypers(comp, vals)
        self.mean = mygp.mean
        self.ls = mygp.ls
        self.amp2 = mygp.amp2
        self.noise = mygp.noise

        # Save hyperparameter samples
        self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls))
        self.dump_hypers()

        return
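# Standalone reference sketch (not part of the original chooser): the closed-form
# expected improvement that compute_ei() evaluates at each candidate, written as a
# free function over a GP predictive mean `func_m`, predictive variance `func_v`,
# and incumbent value `best`. The function name and placement are illustrative only.
def _ei_reference(func_m, func_v, best):
    import numpy as np
    import scipy.stats as sps
    func_s = np.sqrt(func_v)          # predictive standard deviation
    u = (best - func_m) / func_s      # standardized improvement over the incumbent
    return func_s * (u * sps.norm.cdf(u) + sps.norm.pdf(u))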
class GPEIperSecChooser:

    def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10,
                 pending_samples=100, noiseless=False, burnin=100,
                 grid_subset=20):
        self.cov_func = getattr(gp, covar)
        self.locker = Locker()
        self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl")
        self.stats_file = os.path.join(expt_dir, self.__module__ + "_hyperparameters.txt")
        self.mcmc_iters = int(mcmc_iters)
        self.burnin = int(burnin)
        self.needs_burnin = True
        self.pending_samples = pending_samples
        self.D = -1
        self.hyper_iters = 1
        # Number of points to optimize EI over
        self.grid_subset = int(grid_subset)
        self.noiseless = bool(int(noiseless))
        self.hyper_samples = []
        self.time_hyper_samples = []

        self.noise_scale = 0.1       # horseshoe prior
        self.amp2_scale = 1          # zero-mean log normal prior
        self.max_ls = 10             # top-hat prior on length scales

        self.time_noise_scale = 0.1  # horseshoe prior
        self.time_amp2_scale = 1     # zero-mean log normal prior
        self.time_max_ls = 10        # top-hat prior on length scales

    # A simple function to dump out hyperparameters to allow for a hot start
    # if the optimization is restarted.
    def dump_hypers(self):
        self.locker.lock_wait(self.state_pkl)

        # Write the hyperparameters out to a Pickle.
        fh = tempfile.NamedTemporaryFile(mode='w', delete=False)
        cPickle.dump({'dims': self.D,
                      'ls': self.ls,
                      'amp2': self.amp2,
                      'noise': self.noise,
                      'mean': self.mean,
                      'time_ls': self.time_ls,
                      'time_amp2': self.time_amp2,
                      'time_noise': self.time_noise,
                      'time_mean': self.time_mean},
                     fh)
        fh.close()

        # Use an atomic move for better NFS happiness.
        cmd = 'mv "%s" "%s"' % (fh.name, self.state_pkl)
        os.system(cmd)  # TODO: Should check system-dependent return status.

        self.locker.unlock(self.state_pkl)

    def _real_init(self, dims, values, durations):
        self.locker.lock_wait(self.state_pkl)

        if os.path.exists(self.state_pkl):
            fh = open(self.state_pkl, 'r')
            state = cPickle.load(fh)
            fh.close()

            self.D = state['dims']
            self.ls = state['ls']
            self.amp2 = state['amp2']
            self.noise = state['noise']
            self.mean = state['mean']
            self.time_ls = state['time_ls']
            self.time_amp2 = state['time_amp2']
            self.time_noise = state['time_noise']
            self.time_mean = state['time_mean']
        else:
            # Input dimensionality.
            self.D = dims

            # Initial length scales.
            self.ls = np.ones(self.D)
            self.time_ls = np.ones(self.D)

            # Initial amplitude.
            self.amp2 = np.std(values) + 1e-4
            self.time_amp2 = np.std(durations) + 1e-4

            # Initial observation noise.
            self.noise = 1e-3
            self.time_noise = 1e-3

            # Initial mean.
            self.mean = np.mean(values)
            self.time_mean = np.mean(np.log(durations))

        self.locker.unlock(self.state_pkl)

    def cov(self, amp2, ls, x1, x2=None):
        if x2 is None:
            return amp2 * (self.cov_func(ls, x1, None) + 1e-6 * np.eye(x1.shape[0]))
        else:
            return amp2 * self.cov_func(ls, x1, x2)

    # Given a set of completed 'experiments' in the unit hypercube with
    # corresponding objective 'values', pick from the next experiment to
    # run according to the acquisition function.
    def next(self, grid, values, durations, candidates, pending, complete):
        # Don't bother using fancy GP stuff at first.
        if complete.shape[0] < 2:
            return int(candidates[0])

        # Perform the real initialization.
        if self.D == -1:
            self._real_init(grid.shape[1], values[complete], durations[complete])

        # Grab out the relevant sets.
        comp = grid[complete, :]
        cand = grid[candidates, :]
        pend = grid[pending, :]
        vals = values[complete]
        durs = durations[complete]

        # Bring time into the log domain before we do anything
        # to maintain strict positivity
        durs = np.log(durs)

        # Spray a set of candidates around the min so far
        numcand = cand.shape[0]
        best_comp = np.argmin(vals)
        cand2 = np.vstack((np.random.randn(10, comp.shape[1]) * 0.001 + comp[best_comp, :], cand))

        if self.mcmc_iters > 0:
            # Possibly burn in.
            if self.needs_burnin:
                for mcmc_iter in range(self.burnin):
                    self.sample_hypers(comp, vals, durs)
                    log("BURN %d/%d] mean: %.2f amp: %.2f "
                        "noise: %.4f min_ls: %.4f max_ls: %.4f" %
                        (mcmc_iter + 1, self.burnin, self.mean, np.sqrt(self.amp2),
                         self.noise, np.min(self.ls), np.max(self.ls)))
                self.needs_burnin = False

            # Sample from hyperparameters.
            # Adjust the candidates to hit ei/sec peaks
            self.hyper_samples = []
            for mcmc_iter in range(self.mcmc_iters):
                self.sample_hypers(comp, vals, durs)
                log("%d/%d] mean: %.2f amp: %.2f noise: %.4f "
                    "min_ls: %.4f max_ls: %.4f" %
                    (mcmc_iter + 1, self.mcmc_iters, self.mean, np.sqrt(self.amp2),
                     self.noise, np.min(self.ls), np.max(self.ls)))

                log("%d/%d] time_mean: %.2fs time_amp: %.2f time_noise: %.4f "
                    "time_min_ls: %.4f time_max_ls: %.4f" %
                    (mcmc_iter + 1, self.mcmc_iters, np.exp(self.time_mean),
                     np.sqrt(self.time_amp2), np.exp(self.time_noise),
                     np.min(self.time_ls), np.max(self.time_ls)))
            self.dump_hypers()

            # Pick the top candidates to optimize over
            overall_ei = self.ei_over_hypers(comp, pend, cand2, vals, durs)
            inds = np.argsort(np.mean(overall_ei, axis=1))[-self.grid_subset:]
            cand2 = cand2[inds, :]

            # Adjust the candidates to hit ei peaks
            b = []  # optimization bounds
            for i in range(0, cand.shape[1]):
                b.append((0, 1))

            for i in range(0, cand2.shape[0]):
                log("Optimizing candidate %d/%d" % (i + 1, cand2.shape[0]))
                ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei_over_hypers,
                                        cand2[i, :].flatten(),
                                        args=(comp, vals, durs, True),
                                        bounds=b, disp=0)
                cand2[i, :] = ret[0]

            cand = np.vstack((cand, cand2))
            overall_ei = self.ei_over_hypers(comp, pend, cand, vals, durs)
            best_cand = np.argmax(np.mean(overall_ei, axis=1))
            self.dump_hypers()

            if (best_cand >= numcand):
                return (int(numcand), cand[best_cand, :])

            return int(candidates[best_cand])
        else:
            # Optimize hyperparameters
            self.optimize_hypers(comp, vals, durs)
            log("mean: %f amp: %f noise: %f "
                "min_ls: %f max_ls: %f" %
                (self.mean, np.sqrt(self.amp2), self.noise, np.min(self.ls), np.max(self.ls)))

            # Pick the top candidates to optimize over
            ei = self.compute_ei_per_s(comp, pend, cand2, vals, durs)
            inds = np.argsort(ei)[-self.grid_subset:]
            cand2 = cand2[inds, :]

            # Adjust the candidates to hit ei peaks
            b = []  # optimization bounds
            for i in range(0, cand.shape[1]):
                b.append((0, 1))

            for i in range(0, cand2.shape[0]):
                log("Optimizing candidate %d/%d" % (i + 1, cand2.shape[0]))
                ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei,
                                        cand2[i, :].flatten(),
                                        args=(comp, vals, durs, True),
                                        bounds=b, disp=0)
                cand2[i, :] = ret[0]

            cand = np.vstack((cand, cand2))
            ei = self.compute_ei_per_s(comp, pend, cand, vals, durs)
            best_cand = np.argmax(ei)
            self.dump_hypers()

            if (best_cand >= numcand):
                return (int(numcand), cand[best_cand, :])

            return int(candidates[best_cand])

    # Compute EI over hyperparameter samples
    def ei_over_hypers(self, comp, pend, cand, vals, durs):
        overall_ei = np.zeros((cand.shape[0], self.mcmc_iters))
        for mcmc_iter in range(self.mcmc_iters):
            hyper = self.hyper_samples[mcmc_iter]
            time_hyper = self.time_hyper_samples[mcmc_iter]

            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]

            self.time_mean = time_hyper[0]
            self.time_noise = time_hyper[1]
            self.time_amp2 = time_hyper[2]
            self.time_ls = time_hyper[3]

            overall_ei[:, mcmc_iter] = self.compute_ei_per_s(comp, pend, cand, vals,
                                                             durs.squeeze())
        return overall_ei

    def check_grad_ei_per(self, cand, comp, vals, durs):
        (ei, dx1) = self.grad_optimize_ei_over_hypers(cand, comp, vals, durs)
        dx2 = dx1 * 0
        idx = np.zeros(cand.shape[0])
        for i in range(0, cand.shape[0]):
            idx[i] = 1e-6
            (ei1, tmp) = self.grad_optimize_ei_over_hypers(cand + idx, comp, vals, durs)
            (ei2, tmp) = self.grad_optimize_ei_over_hypers(cand - idx, comp, vals, durs)
            dx2[i] = (ei1 - ei2) / (2 * 1e-6)
            idx[i] = 0
        print('computed grads', dx1)
        print('finite diffs', dx2)
        print(dx1 / dx2)
        print(np.sum((dx1 - dx2)**2))
        time.sleep(2)

    # Adjust points by optimizing EI over a set of hyperparameter samples
    def grad_optimize_ei_over_hypers(self, cand, comp, vals, durs, compute_grad=True):
        summed_ei = 0
        summed_grad_ei = np.zeros(cand.shape).flatten()

        for mcmc_iter in range(self.mcmc_iters):
            hyper = self.hyper_samples[mcmc_iter]
            time_hyper = self.time_hyper_samples[mcmc_iter]

            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]

            self.time_mean = time_hyper[0]
            self.time_noise = time_hyper[1]
            self.time_amp2 = time_hyper[2]
            self.time_ls = time_hyper[3]

            if compute_grad:
                (ei, g_ei) = self.grad_optimize_ei(cand, comp, vals, durs, compute_grad)
                summed_grad_ei = summed_grad_ei + g_ei
            else:
                ei = self.grad_optimize_ei(cand, comp, vals, durs, compute_grad)
            summed_ei += ei

        if compute_grad:
            return (summed_ei, summed_grad_ei)
        else:
            return summed_ei

    def grad_optimize_ei(self, cand, comp, vals, durs, compute_grad=True):
        # Here we have to compute the gradients for ei per second.
        # This means deriving through the two kernels, the one for predicting
        # time and the one predicting ei.
        best = np.min(vals)
        cand = np.reshape(cand, (-1, comp.shape[1]))

        # First we make predictions for the durations.
        # Compute covariances
        comp_time_cov = self.cov(self.time_amp2, self.time_ls, comp)
        cand_time_cross = self.cov(self.time_amp2, self.time_ls, comp, cand)

        # Cholesky decompositions
        obsv_time_cov = comp_time_cov + self.time_noise * np.eye(comp.shape[0])
        obsv_time_chol = spla.cholesky(obsv_time_cov, lower=True)

        # Linear systems
        t_alpha = spla.cho_solve((obsv_time_chol, True), durs - self.time_mean)

        # Predict marginal mean times and (possibly) variances
        func_time_m = np.dot(cand_time_cross.T, t_alpha) + self.time_mean
        # We don't really need the time variances now
        # func_time_v = self.time_amp2*(1+1e-6) - np.sum(t_beta**2, axis=0)

        # Bring time out of the log domain
        func_time_m = np.exp(func_time_m)

        # Compute derivative of cross-distances.
        grad_cross_r = gp.grad_dist2(self.time_ls, comp, cand)

        # Apply covariance function
        cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__)
        cand_cross_grad = cov_grad_func(self.time_ls, comp, cand)
        grad_cross_t = np.squeeze(cand_cross_grad)

        # Now compute the gradients w.r.t. ei.
        # The primary covariances for prediction.
        comp_cov = self.cov(self.amp2, self.ls, comp)
        cand_cross = self.cov(self.amp2, self.ls, comp, cand)

        # Compute the required Cholesky.
        obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0])
        obsv_chol = spla.cholesky(obsv_cov, lower=True)

        cand_cross_grad = cov_grad_func(self.ls, comp, cand)

        # Predictive things.
        # Solve the linear systems.
        alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
        beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

        # Predict the marginal means and variances at candidates.
        func_m = np.dot(cand_cross.T, alpha) + self.mean
        func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

        # Expected improvement
        func_s = np.sqrt(func_v)
        u = (best - func_m) / func_s
        ncdf = sps.norm.cdf(u)
        npdf = sps.norm.pdf(u)
        ei = func_s * (u * ncdf + npdf)

        ei_per_s = -np.sum(ei / func_time_m)
        if not compute_grad:
            return ei

        grad_time_xp_m = np.dot(t_alpha.transpose(), grad_cross_t)

        # Gradients of ei w.r.t. mean and variance
        g_ei_m = -ncdf
        g_ei_s2 = 0.5 * npdf / func_s

        # Apply covariance function
        grad_cross = np.squeeze(cand_cross_grad)

        grad_xp_m = np.dot(alpha.transpose(), grad_cross)
        grad_xp_v = np.dot(-2 * spla.cho_solve((obsv_chol, True), cand_cross).transpose(),
                           grad_cross)

        grad_xp = 0.5 * self.amp2 * (grad_xp_m * g_ei_m + grad_xp_v * g_ei_s2)
        grad_time_xp_m = 0.5 * self.time_amp2 * grad_time_xp_m * func_time_m
        grad_xp = (func_time_m * grad_xp - ei * grad_time_xp_m) / (func_time_m**2)

        return ei_per_s, grad_xp.flatten()

    def compute_ei_per_s(self, comp, pend, cand, vals, durs):
        # First we make predictions for the durations as that
        # doesn't depend on pending experiments.

        # Compute covariances
        comp_time_cov = self.cov(self.time_amp2, self.time_ls, comp)
        cand_time_cross = self.cov(self.time_amp2, self.time_ls, comp, cand)

        # Cholesky decompositions
        obsv_time_cov = comp_time_cov + self.time_noise * np.eye(comp.shape[0])
        obsv_time_chol = spla.cholesky(obsv_time_cov, lower=True)

        # Linear systems
        t_alpha = spla.cho_solve((obsv_time_chol, True), durs - self.time_mean)
        # t_beta = spla.solve_triangular(obsv_time_chol, cand_time_cross, lower=True)

        # Predict marginal mean times and (possibly) variances
        func_time_m = np.dot(cand_time_cross.T, t_alpha) + self.time_mean
        # We don't really need the time variances now
        # func_time_v = self.time_amp2*(1+1e-6) - np.sum(t_beta**2, axis=0)

        # Bring time out of the log domain
        func_time_m = np.exp(func_time_m)

        if pend.shape[0] == 0:
            # If there are no pending, don't do anything fancy.

            # Current best.
            best = np.min(vals)

            # The primary covariances for prediction.
            comp_cov = self.cov(self.amp2, self.ls, comp)
            cand_cross = self.cov(self.amp2, self.ls, comp, cand)

            # Compute the required Cholesky.
            obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0])
            obsv_chol = spla.cholesky(obsv_cov, lower=True)

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v)
            u = (best - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            ei_per_s = ei / func_time_m
            return ei_per_s
        else:
            # If there are pending experiments, fantasize their outcomes.

            # Create a composite vector of complete and pending.
            comp_pend = np.concatenate((comp, pend))

            # Compute the covariance and Cholesky decomposition.
            comp_pend_cov = (self.cov(self.amp2, self.ls, comp_pend) +
                             self.noise * np.eye(comp_pend.shape[0]))
            comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True)

            # Compute submatrices.
            pend_cross = self.cov(self.amp2, self.ls, comp, pend)
            pend_kappa = self.cov(self.amp2, self.ls, pend)

            # Use the sub-Cholesky.
            obsv_chol = comp_pend_chol[:comp.shape[0], :comp.shape[0]]

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.cho_solve((obsv_chol, True), pend_cross)

            # Finding predictive means and variances.
            pend_m = np.dot(pend_cross.T, alpha) + self.mean
            pend_K = pend_kappa - np.dot(pend_cross.T, beta)

            # Take the Cholesky of the predictive covariance.
            pend_chol = spla.cholesky(pend_K, lower=True)

            # Make predictions.
            pend_fant = np.dot(pend_chol,
                               npr.randn(pend.shape[0], self.pending_samples)) + pend_m[:, None]

            # Include the fantasies.
            fant_vals = np.concatenate(
                (np.tile(vals[:, np.newaxis], (1, self.pending_samples)), pend_fant))

            # Compute bests over the fantasies.
            bests = np.min(fant_vals, axis=0)

            # Now generalize from these fantasies.
            cand_cross = self.cov(self.amp2, self.ls, comp_pend, cand)

            # Solve the linear systems.
            alpha = spla.cho_solve((comp_pend_chol, True), fant_vals - self.mean)
            beta = spla.solve_triangular(comp_pend_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v[:, np.newaxis])
            u = (bests[np.newaxis, :] - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            return np.divide(np.mean(ei, axis=1), func_time_m)

    def sample_hypers(self, comp, vals, durs):
        if self.noiseless:
            self.noise = 1e-3
            self._sample_noiseless(comp, vals)
        else:
            self._sample_noisy(comp, vals)
        self._sample_ls(comp, vals)

        self._sample_time_noisy(comp, durs.squeeze())
        self._sample_time_ls(comp, durs.squeeze())

        self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls))
        self.time_hyper_samples.append((self.time_mean, self.time_noise,
                                        self.time_amp2, self.time_ls))

    def _sample_ls(self, comp, vals):
        def logprob(ls):
            if np.any(ls < 0) or np.any(ls > self.max_ls):
                return -np.inf

            cov = (self.amp2 * (self.cov_func(ls, comp, None) + 1e-6 * np.eye(comp.shape[0])) +
                   self.noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - self.mean)
            lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(vals - self.mean, solve)
            return lp

        self.ls = util.slice_sample(self.ls, logprob, compwise=True)

    def _sample_time_ls(self, comp, vals):
        def logprob(ls):
            if np.any(ls < 0) or np.any(ls > self.time_max_ls):
                return -np.inf

            cov = (self.time_amp2 * (self.cov_func(ls, comp, None) +
                                     1e-6 * np.eye(comp.shape[0])) +
                   self.time_noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - self.time_mean)
            lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(vals - self.time_mean, solve)
            return lp

        self.time_ls = util.slice_sample(self.time_ls, logprob, compwise=True)

    def _sample_noisy(self, comp, vals):
        def logprob(hypers):
            mean = hypers[0]
            amp2 = hypers[1]
            noise = hypers[2]

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf
            if amp2 < 0 or noise < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.ls, comp, None) + 1e-6 * np.eye(comp.shape[0])) +
                   noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(vals - mean, solve)

            # Roll in noise horseshoe prior.
            lp += np.log(np.log(1 + (self.noise_scale / noise)**2))
            # lp -= 0.5*(np.log(noise)/self.noise_scale)**2

            # Roll in amplitude lognormal prior
            lp -= 0.5 * (np.log(amp2) / self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2, self.noise]),
                                   logprob, compwise=False)
        self.mean = hypers[0]
        self.amp2 = hypers[1]
        self.noise = hypers[2]

    def _sample_time_noisy(self, comp, vals):
        def logprob(hypers):
            mean = hypers[0]
            amp2 = hypers[1]
            noise = hypers[2]

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf
            if amp2 < 0 or noise < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.time_ls, comp, None) +
                           1e-6 * np.eye(comp.shape[0])) +
                   noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(vals - mean, solve)

            # Roll in noise horseshoe prior.
            lp += np.log(np.log(1 + (self.time_noise_scale / noise)**2))
            # lp -= 0.5*(np.log(noise)/self.time_noise_scale)**2

            # Roll in amplitude lognormal prior
            lp -= 0.5 * (np.log(np.sqrt(amp2)) / self.time_amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array([self.time_mean, self.time_amp2, self.time_noise]),
                                   logprob, compwise=False)
        self.time_mean = hypers[0]
        self.time_amp2 = hypers[1]
        self.time_noise = hypers[2]

    def _sample_noiseless(self, comp, vals):
        def logprob(hypers):
            mean = hypers[0]
            amp2 = hypers[1]
            noise = 1e-3

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf
            if amp2 < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.ls, comp, None) + 1e-6 * np.eye(comp.shape[0])) +
                   noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(vals - mean, solve)

            # Roll in amplitude lognormal prior
            lp -= 0.5 * (np.log(amp2) / self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2, self.noise]),
                                   logprob, compwise=False)
        self.mean = hypers[0]
        self.amp2 = hypers[1]
        self.noise = 1e-3

    def optimize_hypers(self, comp, vals, durs):
        # First the GP to observations
        mygp = gp.GP(self.cov_func.__name__)
        mygp.real_init(comp.shape[1], vals)
        mygp.optimize_hypers(comp, vals)
        self.mean = mygp.mean
        self.ls = mygp.ls
        self.amp2 = mygp.amp2
        self.noise = mygp.noise

        # Now the GP to times
        timegp = gp.GP(self.cov_func.__name__)
        timegp.real_init(comp.shape[1], durs)
        timegp.optimize_hypers(comp, durs)
        self.time_mean = timegp.mean
        self.time_amp2 = timegp.amp2
        self.time_noise = timegp.noise
        self.time_ls = timegp.ls

        # Save hyperparameter samples
        self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls))
        self.time_hyper_samples.append((self.time_mean, self.time_noise,
                                        self.time_amp2, self.time_ls))
        self.dump_hypers()
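# Reference sketch (not part of the original chooser): GPEIperSecChooser ranks
# candidates by expected improvement per predicted second. The duration GP models
# log(duration), so its predictive mean is exponentiated before dividing. The
# function name and arguments here are illustrative only.
def _ei_per_second_reference(ei, pred_log_duration):
    import numpy as np
    return ei / np.exp(pred_log_duration)  # EI divided by the predicted runtime in seconds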
class GPEIOptChooser: def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10, pending_samples=100, noiseless=False, burnin=100, grid_subset=20, use_multiprocessing=True): self.cov_func = getattr(gp, covar) self.locker = Locker() self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl") self.stats_file = os.path.join(expt_dir, self.__module__ + "_hyperparameters.txt") self.mcmc_iters = int(mcmc_iters) self.burnin = int(burnin) self.needs_burnin = True self.pending_samples = int(pending_samples) self.D = -1 self.hyper_iters = 1 # Number of points to optimize EI over self.grid_subset = int(grid_subset) self.noiseless = bool(int(noiseless)) self.hyper_samples = [] self.noise_scale = 0.1 # horseshoe prior self.amp2_scale = 1 # zero-mean log normal prior self.max_ls = 2 # top-hat prior on length scales # If multiprocessing fails or deadlocks, set this to False self.use_multiprocessing = bool(int(use_multiprocessing)) def dump_hypers(self): self.locker.lock_wait(self.state_pkl) # Write the hyperparameters out to a Pickle. fh = tempfile.NamedTemporaryFile(mode='w', delete=False) cPickle.dump({ 'dims' : self.D, 'ls' : self.ls, 'amp2' : self.amp2, 'noise' : self.noise, 'hyper_samples' : self.hyper_samples, 'mean' : self.mean }, fh) fh.close() # Use an atomic move for better NFS happiness. cmd = 'mv "%s" "%s"' % (fh.name, self.state_pkl) os.system(cmd) # TODO: Should check system-dependent return status. self.locker.unlock(self.state_pkl) # Write the hyperparameters out to a human readable file as well fh = open(self.stats_file, 'w') fh.write('Mean Noise Amplitude <length scales>\n') fh.write('-----------ALL SAMPLES-------------\n') meanhyps = 0*np.hstack(self.hyper_samples[0]) for i in self.hyper_samples: hyps = np.hstack(i) meanhyps += (1/float(len(self.hyper_samples)))*hyps for j in hyps: fh.write(str(j) + ' ') fh.write('\n') fh.write('-----------MEAN OF SAMPLES-------------\n') for j in meanhyps: fh.write(str(j) + ' ') fh.write('\n') fh.close() # This passes out html or javascript to display interesting # stats - such as the length scales (sensitivity to various # dimensions). def generate_stats_html(self): # Need this because the model may not necessarily be # initialized when this code is called. if not self._read_only(): return 'Chooser not yet ready to display output' mean_mean = np.mean(np.vstack([h[0] for h in self.hyper_samples])) mean_noise = np.mean(np.vstack([h[1] for h in self.hyper_samples])) mean_ls = np.mean(np.vstack([h[3][np.newaxis,:] for h in self.hyper_samples]),0) try: output = ( '<br /><span class=\"label label-info\">Estimated mean:</span> ' + str(mean_mean) + '<br /><span class=\"label label-info\">Estimated noise:</span> ' + str(mean_noise) + '<br /><br /><span class=\"label label-info\">Inverse parameter sensitivity' + ' - Gaussian Process length scales</span><br /><br />' + '<div id=\"lschart\"></div><script type=\"text/javascript\">' + 'var lsdata = [' + ','.join(['%.2f' % i for i in mean_ls]) + '];') except: return 'Chooser not yet ready to display output.' output += ('bar_chart("#lschart", lsdata, ' + str(self.max_ls) + ');' + '</script>') return output # Read in the chooser from file. 
Returns True only on success def _read_only(self): if os.path.exists(self.state_pkl): fh = open(self.state_pkl, 'r') state = cPickle.load(fh) fh.close() self.D = state['dims'] self.ls = state['ls'] self.amp2 = state['amp2'] self.noise = state['noise'] self.mean = state['mean'] self.hyper_samples = state['hyper_samples'] self.needs_burnin = False return True return False def _real_init(self, dims, values): self.locker.lock_wait(self.state_pkl) self.randomstate = npr.get_state() if os.path.exists(self.state_pkl): fh = open(self.state_pkl, 'r') state = cPickle.load(fh) fh.close() self.D = state['dims'] self.ls = state['ls'] self.amp2 = state['amp2'] self.noise = state['noise'] self.mean = state['mean'] self.hyper_samples = state['hyper_samples'] self.needs_burnin = False else: # Input dimensionality. self.D = dims # Initial length scales. self.ls = np.ones(self.D) # Initial amplitude. self.amp2 = np.std(values)+1e-4 # Initial observation noise. self.noise = 1e-3 # Initial mean. self.mean = np.mean(values) # Save hyperparameter samples self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls)) self.locker.unlock(self.state_pkl) def cov(self, x1, x2=None): if x2 is None: return self.amp2 * (self.cov_func(self.ls, x1, None) + 1e-6*np.eye(x1.shape[0])) else: return self.amp2 * self.cov_func(self.ls, x1, x2) # Given a set of completed 'experiments' in the unit hypercube with # corresponding objective 'values', pick from the next experiment to # run according to the acquisition function. def next(self, grid, values, durations, candidates, pending, complete): # Don't bother using fancy GP stuff at first. if complete.shape[0] < 2: return int(candidates[0]) # Perform the real initialization. if self.D == -1: self._real_init(grid.shape[1], values[complete]) # Grab out the relevant sets. comp = grid[complete,:] cand = grid[candidates,:] pend = grid[pending,:] vals = values[complete] numcand = cand.shape[0] # Spray a set of candidates around the min so far best_comp = np.argmin(vals) cand2 = np.vstack((np.random.randn(10,comp.shape[1])*0.001 + comp[best_comp,:], cand)) if self.mcmc_iters > 0: # Possibly burn in. if self.needs_burnin: for mcmc_iter in xrange(self.burnin): self.sample_hypers(comp, vals) log("BURN %d/%d] mean: %.2f amp: %.2f " "noise: %.4f min_ls: %.4f max_ls: %.4f" % (mcmc_iter+1, self.burnin, self.mean, np.sqrt(self.amp2), self.noise, np.min(self.ls), np.max(self.ls))) self.needs_burnin = False # Sample from hyperparameters. # Adjust the candidates to hit ei peaks self.hyper_samples = [] for mcmc_iter in xrange(self.mcmc_iters): self.sample_hypers(comp, vals) log("%d/%d] mean: %.2f amp: %.2f noise: %.4f " "min_ls: %.4f max_ls: %.4f" % (mcmc_iter+1, self.mcmc_iters, self.mean, np.sqrt(self.amp2), self.noise, np.min(self.ls), np.max(self.ls))) self.dump_hypers() b = []# optimization bounds for i in xrange(0, cand.shape[1]): b.append((0, 1)) overall_ei = self.ei_over_hypers(comp,pend,cand2,vals) inds = np.argsort(np.mean(overall_ei,axis=1))[-self.grid_subset:] cand2 = cand2[inds,:] # Optimize each point in parallel if self.use_multiprocessing: pool = multiprocessing.Pool(self.grid_subset) results = [pool.apply_async(optimize_pt,args=( c,b,comp,pend,vals,copy.copy(self))) for c in cand2] for res in results: cand = np.vstack((cand, res.get(1e8))) pool.close() else: # This is old code to optimize each point in parallel. 
for i in xrange(0, cand2.shape[0]): log("Optimizing candidate %d/%d" % (i+1, cand2.shape[0])) #self.check_grad_ei(cand2[i,:].flatten(), comp, pend, vals) ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei_over_hypers, cand2[i,:].flatten(), args=(comp,pend,vals), bounds=b, disp=0) cand2[i,:] = ret[0] cand = np.vstack((cand, cand2)) overall_ei = self.ei_over_hypers(comp,pend,cand,vals) best_cand = np.argmax(np.mean(overall_ei, axis=1)) if (best_cand >= numcand): return (int(numcand), cand[best_cand,:]) return int(candidates[best_cand]) else: # Optimize hyperparameters self.optimize_hypers(comp, vals) log("mean: %.2f amp: %.2f noise: %.4f " "min_ls: %.4f max_ls: %.4f" % (self.mean, np.sqrt(self.amp2), self.noise, np.min(self.ls), np.max(self.ls))) # Optimize over EI b = []# optimization bounds for i in xrange(0, cand.shape[1]): b.append((0, 1)) for i in xrange(0, cand2.shape[0]): ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei, cand2[i,:].flatten(), args=(comp,vals,True), bounds=b, disp=0) cand2[i,:] = ret[0] cand = np.vstack((cand, cand2)) ei = self.compute_ei(comp, pend, cand, vals) best_cand = np.argmax(ei) if (best_cand >= numcand): return (int(numcand), cand[best_cand,:]) return int(candidates[best_cand]) # Compute EI over hyperparameter samples def ei_over_hypers(self,comp,pend,cand,vals): overall_ei = np.zeros((cand.shape[0], self.mcmc_iters)) for mcmc_iter in xrange(self.mcmc_iters): hyper = self.hyper_samples[mcmc_iter] self.mean = hyper[0] self.noise = hyper[1] self.amp2 = hyper[2] self.ls = hyper[3] overall_ei[:,mcmc_iter] = self.compute_ei(comp, pend, cand, vals) return overall_ei def check_grad_ei(self, cand, comp, pend, vals): (ei,dx1) = self.grad_optimize_ei_over_hypers(cand, comp, pend, vals) dx2 = dx1*0 idx = np.zeros(cand.shape[0]) for i in xrange(0, cand.shape[0]): idx[i] = 1e-6 (ei1,tmp) = self.grad_optimize_ei_over_hypers(cand + idx, comp, pend, vals) (ei2,tmp) = self.grad_optimize_ei_over_hypers(cand - idx, comp, pend, vals) dx2[i] = (ei - ei2)/(2*1e-6) idx[i] = 0 print 'computed grads', dx1 print 'finite diffs', dx2 print (dx1/dx2) print np.sum((dx1 - dx2)**2) time.sleep(2) # Adjust points by optimizing EI over a set of hyperparameter samples def grad_optimize_ei_over_hypers(self, cand, comp, pend, vals, compute_grad=True): summed_ei = 0 summed_grad_ei = np.zeros(cand.shape).flatten() ls = self.ls.copy() amp2 = self.amp2 mean = self.mean noise = self.noise for hyper in self.hyper_samples: self.mean = hyper[0] self.noise = hyper[1] self.amp2 = hyper[2] self.ls = hyper[3] if compute_grad: (ei,g_ei) = self.grad_optimize_ei(cand,comp,pend,vals,compute_grad) summed_grad_ei = summed_grad_ei + g_ei else: ei = self.grad_optimize_ei(cand,comp,pend,vals,compute_grad) summed_ei += ei self.mean = mean self.amp2 = amp2 self.noise = noise self.ls = ls.copy() if compute_grad: return (summed_ei, summed_grad_ei) else: return summed_ei # Adjust points based on optimizing their ei def grad_optimize_ei(self, cand, comp, pend, vals, compute_grad=True): if pend.shape[0] == 0: best = np.min(vals) cand = np.reshape(cand, (-1, comp.shape[1])) # The primary covariances for prediction. comp_cov = self.cov(comp) cand_cross = self.cov(comp, cand) # Compute the required Cholesky. obsv_cov = comp_cov + self.noise*np.eye(comp.shape[0]) obsv_chol = spla.cholesky(obsv_cov, lower=True) cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__) cand_cross_grad = cov_grad_func(self.ls, comp, cand) # Predictive things. # Solve the linear systems. 
alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True) # Predict the marginal means and variances at candidates. func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2*(1+1e-6) - np.sum(beta**2, axis=0) # Expected improvement func_s = np.sqrt(func_v) u = (best - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s*( u*ncdf + npdf) if not compute_grad: return ei # Gradients of ei w.r.t. mean and variance g_ei_m = -ncdf g_ei_s2 = 0.5*npdf / func_s # Apply covariance function grad_cross = np.squeeze(cand_cross_grad) grad_xp_m = np.dot(alpha.transpose(),grad_cross) grad_xp_v = np.dot(-2*spla.cho_solve( (obsv_chol, True),cand_cross).transpose(), grad_cross) grad_xp = 0.5*self.amp2*(grad_xp_m*g_ei_m + grad_xp_v*g_ei_s2) ei = -np.sum(ei) return ei, grad_xp.flatten() else: # If there are pending experiments, fantasize their outcomes. cand = np.reshape(cand, (-1, comp.shape[1])) # Create a composite vector of complete and pending. comp_pend = np.concatenate((comp, pend)) # Compute the covariance and Cholesky decomposition. comp_pend_cov = (self.cov(comp_pend) + self.noise*np.eye(comp_pend.shape[0])) comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True) # Compute submatrices. pend_cross = self.cov(comp, pend) pend_kappa = self.cov(pend) # Use the sub-Cholesky. obsv_chol = comp_pend_chol[:comp.shape[0],:comp.shape[0]] # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.cho_solve((obsv_chol, True), pend_cross) # Finding predictive means and variances. pend_m = np.dot(pend_cross.T, alpha) + self.mean pend_K = pend_kappa - np.dot(pend_cross.T, beta) # Take the Cholesky of the predictive covariance. pend_chol = spla.cholesky(pend_K, lower=True) # Make predictions. npr.set_state(self.randomstate) pend_fant = np.dot(pend_chol, npr.randn(pend.shape[0],self.pending_samples)) + pend_m[:,None] # Include the fantasies. fant_vals = np.concatenate( (np.tile(vals[:,np.newaxis], (1,self.pending_samples)), pend_fant)) # Compute bests over the fantasies. bests = np.min(fant_vals, axis=0) # Now generalize from these fantasies. cand_cross = self.cov(comp_pend, cand) cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__) cand_cross_grad = cov_grad_func(self.ls, comp_pend, cand) # Solve the linear systems. alpha = spla.cho_solve((comp_pend_chol, True), fant_vals - self.mean) beta = spla.solve_triangular(comp_pend_chol, cand_cross, lower=True) # Predict the marginal means and variances at candidates. func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2*(1+1e-6) - np.sum(beta**2, axis=0) # Expected improvement func_s = np.sqrt(func_v[:,np.newaxis]) u = (bests[np.newaxis,:] - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s*( u*ncdf + npdf) # Gradients of ei w.r.t. mean and variance g_ei_m = -ncdf g_ei_s2 = 0.5*npdf / func_s # Apply covariance function grad_cross = np.squeeze(cand_cross_grad) grad_xp_m = np.dot(alpha.transpose(),grad_cross) grad_xp_v = np.dot(-2*spla.cho_solve( (comp_pend_chol, True),cand_cross).transpose(), grad_cross) grad_xp = 0.5*self.amp2*(grad_xp_m*np.tile(g_ei_m,(comp.shape[1],1)).T + (grad_xp_v.T*g_ei_s2).T) ei = -np.mean(ei, axis=1) grad_xp = np.mean(grad_xp,axis=0) return ei, grad_xp.flatten() def compute_ei(self, comp, pend, cand, vals): if pend.shape[0] == 0: # If there are no pending, don't do anything fancy. # Current best. best = np.min(vals) # The primary covariances for prediction. 
            # The primary covariances for prediction.
            comp_cov   = self.cov(comp)
            cand_cross = self.cov(comp, cand)

            # Compute the required Cholesky.
            obsv_cov  = comp_cov + self.noise*np.eye(comp.shape[0])
            obsv_chol = spla.cholesky(obsv_cov, lower=True)

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta  = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2*(1+1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v)
            u      = (best - func_m) / func_s
            ncdf   = sps.norm.cdf(u)
            npdf   = sps.norm.pdf(u)
            ei     = func_s*(u*ncdf + npdf)

            return ei

        else:
            # If there are pending experiments, fantasize their outcomes.

            # Create a composite vector of complete and pending.
            comp_pend = np.concatenate((comp, pend))

            # Compute the covariance and Cholesky decomposition.
            comp_pend_cov  = (self.cov(comp_pend) +
                              self.noise*np.eye(comp_pend.shape[0]))
            comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True)

            # Compute submatrices.
            pend_cross = self.cov(comp, pend)
            pend_kappa = self.cov(pend)

            # Use the sub-Cholesky.
            obsv_chol = comp_pend_chol[:comp.shape[0],:comp.shape[0]]

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta  = spla.cho_solve((obsv_chol, True), pend_cross)

            # Finding predictive means and variances.
            pend_m = np.dot(pend_cross.T, alpha) + self.mean
            pend_K = pend_kappa - np.dot(pend_cross.T, beta)

            # Take the Cholesky of the predictive covariance.
            pend_chol = spla.cholesky(pend_K, lower=True)

            # Make predictions.
            npr.set_state(self.randomstate)
            pend_fant = np.dot(pend_chol,
                               npr.randn(pend.shape[0], self.pending_samples)) + pend_m[:,None]

            # Include the fantasies.
            fant_vals = np.concatenate(
                (np.tile(vals[:,np.newaxis], (1,self.pending_samples)), pend_fant))

            # Compute bests over the fantasies.
            bests = np.min(fant_vals, axis=0)

            # Now generalize from these fantasies.
            cand_cross = self.cov(comp_pend, cand)

            # Solve the linear systems.
            alpha = spla.cho_solve((comp_pend_chol, True), fant_vals - self.mean)
            beta  = spla.solve_triangular(comp_pend_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2*(1+1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v[:,np.newaxis])
            u      = (bests[np.newaxis,:] - func_m) / func_s
            ncdf   = sps.norm.cdf(u)
            npdf   = sps.norm.pdf(u)
            ei     = func_s*(u*ncdf + npdf)

            return np.mean(ei, axis=1)

    def sample_hypers(self, comp, vals):
        if self.noiseless:
            self.noise = 1e-3
            self._sample_noiseless(comp, vals)
        else:
            self._sample_noisy(comp, vals)
        self._sample_ls(comp, vals)
        self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls))

    def _sample_ls(self, comp, vals):
        def logprob(ls):
            if np.any(ls < 0) or np.any(ls > self.max_ls):
                return -np.inf

            cov = (self.amp2 * (self.cov_func(ls, comp, None) +
                                1e-6*np.eye(comp.shape[0])) +
                   self.noise*np.eye(comp.shape[0]))
            chol  = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - self.mean)
            lp = (-np.sum(np.log(np.diag(chol))) -
                  0.5*np.dot(vals-self.mean, solve))
            return lp

        self.ls = util.slice_sample(self.ls, logprob, compwise=True)

    def _sample_noisy(self, comp, vals):
        def logprob(hypers):
            mean  = hypers[0]
            amp2  = hypers[1]
            noise = hypers[2]
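            # For exposition (added sketch): up to an additive constant, the
            # log marginal likelihood computed below is
            #     lp = -sum(log(diag(chol))) - 0.5 * (y - mean)^T K^{-1} (y - mean)
            # with K = amp2 * (k(X, X) + 1e-6 I) + noise * I and chol its lower
            # Cholesky factor; the horseshoe prior on noise and the lognormal
            # prior on amp2 are then rolled into lp.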
            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf

            if amp2 < 0 or noise < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.ls, comp, None) +
                           1e-6*np.eye(comp.shape[0])) +
                   noise*np.eye(comp.shape[0]))
            chol  = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = -np.sum(np.log(np.diag(chol))) - 0.5*np.dot(vals-mean, solve)

            # Roll in noise horseshoe prior.
            lp += np.log(np.log(1 + (self.noise_scale/noise)**2))

            # Roll in amplitude lognormal prior
            lp -= 0.5*(np.log(np.sqrt(amp2))/self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array(
                [self.mean, self.amp2, self.noise]), logprob, compwise=False)
        self.mean  = hypers[0]
        self.amp2  = hypers[1]
        self.noise = hypers[2]

    def _sample_noiseless(self, comp, vals):
        def logprob(hypers):
            mean  = hypers[0]
            amp2  = hypers[1]
            noise = 1e-3

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf

            if amp2 < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.ls, comp, None) +
                           1e-6*np.eye(comp.shape[0])) +
                   noise*np.eye(comp.shape[0]))
            chol  = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = -np.sum(np.log(np.diag(chol))) - 0.5*np.dot(vals-mean, solve)

            # Roll in amplitude lognormal prior
            lp -= 0.5*(np.log(np.sqrt(amp2))/self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array(
                [self.mean, self.amp2, self.noise]), logprob, compwise=False)
        self.mean  = hypers[0]
        self.amp2  = hypers[1]
        self.noise = 1e-3

    def optimize_hypers(self, comp, vals):
        mygp = gp.GP(self.cov_func.__name__)
        mygp.real_init(comp.shape[1], vals)
        mygp.optimize_hypers(comp, vals)
        self.mean  = mygp.mean
        self.ls    = mygp.ls
        self.amp2  = mygp.amp2
        self.noise = mygp.noise

        # Save hyperparameter samples
        self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls))
        self.dump_hypers()

        return
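

# ---------------------------------------------------------------------------
# Illustrative sketch, added for exposition only (not part of either chooser
# class): the closed-form expected-improvement computation the choosers share,
# stripped of the GP bookkeeping.  The helper name `_ei_example` and the
# numbers in the usage note below are hypothetical.
def _ei_example(best, func_m, func_s):
    import numpy as np
    import scipy.stats as sps
    u = (best - func_m) / func_s                        # standardized improvement
    return func_s * (u*sps.norm.cdf(u) + sps.norm.pdf(u))
# Usage: _ei_example(0.5, np.array([0.6, 0.4]), np.array([0.2, 0.2])) returns
# a larger EI for the second point, whose predictive mean (0.4) already beats
# the current best (0.5).
# ---------------------------------------------------------------------------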


class GPEIChooser:

    def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10,
                 pending_samples=100, noiseless=False):
        self.cov_func = getattr(gp, covar)
        self.locker = Locker()
        self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl")

        self.mcmc_iters = int(mcmc_iters)
        self.pending_samples = pending_samples
        self.D = -1
        self.hyper_iters = 1
        self.noiseless = bool(int(noiseless))

        self.noise_scale = 0.1  # horseshoe prior
        self.amp2_scale = 1     # zero-mean log normal prior
        self.max_ls = 2         # top-hat prior on length scales

    def __del__(self):
        self.locker.lock_wait(self.state_pkl)

        # Write the hyperparameters out to a Pickle.
        fh = tempfile.NamedTemporaryFile(mode='wb', delete=False)
        cPickle.dump({'dims'  : self.D,
                      'ls'    : self.ls,
                      'amp2'  : self.amp2,
                      'noise' : self.noise,
                      'mean'  : self.mean},
                     fh)
        fh.close()

        # Use an atomic move for better NFS happiness.
        cmd = 'mv "%s" "%s"' % (fh.name, self.state_pkl)
        os.system(cmd)  # TODO: Should check system-dependent return status.

        self.locker.unlock(self.state_pkl)

    def _real_init(self, dims, values):
        self.locker.lock_wait(self.state_pkl)

        if os.path.exists(self.state_pkl):
            fh = open(self.state_pkl, 'rb')
            state = cPickle.load(fh)
            fh.close()

            self.D     = state['dims']
            self.ls    = state['ls']
            self.amp2  = state['amp2']
            self.noise = state['noise']
            self.mean  = state['mean']
        else:
            # Input dimensionality.
            self.D = dims

            # Initial length scales.
            self.ls = np.ones(self.D)

            # Initial amplitude.
            self.amp2 = np.std(values) + 1e-4

            # Initial observation noise.
            self.noise = 1e-3

            # Initial mean.
            self.mean = np.mean(values)

        self.locker.unlock(self.state_pkl)

    def cov(self, x1, x2=None):
        if x2 is None:
            return self.amp2 * (self.cov_func(self.ls, x1, None) +
                                1e-6*np.eye(x1.shape[0]))
        else:
            return self.amp2 * self.cov_func(self.ls, x1, x2)

    def next(self, grid, values, durations, candidates, pending, complete):
        # Don't bother using fancy GP stuff at first.
        if complete.shape[0] < 2:
            return int(candidates[0])

        # Perform the real initialization.
        if self.D == -1:
            self._real_init(grid.shape[1], values[complete])

        # Grab out the relevant sets.
        comp = grid[complete,:]
        cand = grid[candidates,:]
        pend = grid[pending,:]
        vals = values[complete]

        if self.mcmc_iters > 0:
            # Sample from hyperparameters.
            overall_ei = np.zeros((cand.shape[0], self.mcmc_iters))

            for mcmc_iter in xrange(self.mcmc_iters):
                self.sample_hypers(comp, vals)
                log("mean: %f amp: %f noise: %f min_ls: %f max_ls: %f" %
                    (self.mean, np.sqrt(self.amp2), self.noise,
                     np.min(self.ls), np.max(self.ls)))
                overall_ei[:,mcmc_iter] = self.compute_ei(comp, pend, cand, vals)

            best_cand = np.argmax(np.mean(overall_ei, axis=1))
            log("Max mean EI: %f" % np.max(np.mean(overall_ei, axis=1)))
            return int(candidates[best_cand])

        else:
            # Optimize hyperparameters
            try:
                self.optimize_hypers(comp, vals)
            except Exception:
                # Fall back to sane defaults if the optimization fails.
                # Initial length scales.
                self.ls = np.ones(self.D)
                # Initial amplitude.
                self.amp2 = np.std(vals)
                # Initial observation noise.
                self.noise = 1e-3
            log("mean: %f amp: %f noise: %f min_ls: %f max_ls: %f" %
                (self.mean, np.sqrt(self.amp2), self.noise,
                 np.min(self.ls), np.max(self.ls)))

            ei = self.compute_ei(comp, pend, cand, vals)
            log("Max EI: %f" % np.max(ei))
            best_cand = np.argmax(ei)
            return int(candidates[best_cand])

    def compute_ei(self, comp, pend, cand, vals):
        if pend.shape[0] == 0:
            # If there are no pending, don't do anything fancy.

            # Current best.
            best = np.min(vals)

            # The primary covariances for prediction.
            comp_cov   = self.cov(comp)
            cand_cross = self.cov(comp, cand)
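            # Note (added for exposition): self.cov already adds a 1e-6 jitter
            # to the diagonal when x2 is None; the observation-noise term
            # self.noise * I added below both models noisy evaluations and
            # keeps the matrix well conditioned for the Cholesky factorization.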
            # Compute the required Cholesky.
            obsv_cov  = comp_cov + self.noise*np.eye(comp.shape[0])
            obsv_chol = spla.cholesky(obsv_cov, lower=True)

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta  = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2*(1+1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v)
            u      = (best - func_m) / func_s
            ncdf   = sps.norm.cdf(u)
            npdf   = sps.norm.pdf(u)
            ei     = func_s*(u*ncdf + npdf)

            return ei

        else:
            # If there are pending experiments, fantasize their outcomes.

            # Create a composite vector of complete and pending.
            comp_pend = np.concatenate((comp, pend))

            # Compute the covariance and Cholesky decomposition.
            comp_pend_cov  = (self.cov(comp_pend) +
                              self.noise*np.eye(comp_pend.shape[0]))
            comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True)

            # Compute submatrices.
            pend_cross = self.cov(comp, pend)
            pend_kappa = self.cov(pend)

            # Use the sub-Cholesky.
            obsv_chol = comp_pend_chol[:comp.shape[0],:comp.shape[0]]

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta  = spla.cho_solve((obsv_chol, True), pend_cross)

            # Finding predictive means and variances.
            pend_m = np.dot(pend_cross.T, alpha) + self.mean
            pend_K = pend_kappa - np.dot(pend_cross.T, beta)

            # Take the Cholesky of the predictive covariance.
            pend_chol = spla.cholesky(pend_K, lower=True)

            # Make predictions.
            pend_fant = (np.dot(pend_chol,
                                npr.randn(pend.shape[0], self.pending_samples)) +
                         pend_m[:,None])

            # Include the fantasies.
            fant_vals = np.concatenate(
                (np.tile(vals[:,np.newaxis], (1,self.pending_samples)), pend_fant))

            # Compute bests over the fantasies.
            bests = np.min(fant_vals, axis=0)

            # Now generalize from these fantasies.
            cand_cross = self.cov(comp_pend, cand)

            # Solve the linear systems.
            alpha = spla.cho_solve((comp_pend_chol, True), fant_vals - self.mean)
            beta  = spla.solve_triangular(comp_pend_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2*(1+1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v[:,np.newaxis])
            u      = (bests[np.newaxis,:] - func_m) / func_s
            ncdf   = sps.norm.cdf(u)
            npdf   = sps.norm.pdf(u)
            ei     = func_s*(u*ncdf + npdf)

            return np.mean(ei, axis=1)

    def sample_hypers(self, comp, vals):
        if self.noiseless:
            self.noise = 1e-3
            self._sample_noiseless(comp, vals)
        else:
            self._sample_noisy(comp, vals)
        self._sample_ls(comp, vals)

    def _sample_ls(self, comp, vals):
        def logprob(ls):
            if np.any(ls < 0) or np.any(ls > self.max_ls):
                return -np.inf

            cov = (self.amp2 * (self.cov_func(ls, comp, None) +
                                1e-6*np.eye(comp.shape[0])) +
                   self.noise*np.eye(comp.shape[0]))
            chol  = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - self.mean)
            lp = -np.sum(np.log(np.diag(chol))) - 0.5*np.dot(vals-self.mean, solve)
            return lp

        self.ls = util.slice_sample(self.ls, logprob, compwise=True)

    def _sample_noisy(self, comp, vals):
        def logprob(hypers):
            mean  = hypers[0]
            amp2  = hypers[1]
            noise = hypers[2]

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf

            if amp2 < 0 or noise < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.ls, comp, None) +
                           1e-6*np.eye(comp.shape[0])) +
                   noise*np.eye(comp.shape[0]))
            chol  = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = -np.sum(np.log(np.diag(chol))) - 0.5*np.dot(vals-mean, solve)

            # Roll in noise horseshoe prior.
            lp += np.log(np.log(1 + (self.noise_scale/noise)**2))

            # Roll in amplitude lognormal prior
            lp -= 0.5*(np.log(amp2)/self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array(
                [self.mean, self.amp2, self.noise]), logprob, compwise=False)
        self.mean  = hypers[0]
        self.amp2  = hypers[1]
        self.noise = hypers[2]

    def _sample_noiseless(self, comp, vals):
        def logprob(hypers):
            mean  = hypers[0]
            amp2  = hypers[1]
            noise = 1e-3

            if amp2 < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.ls, comp, None) +
                           1e-6*np.eye(comp.shape[0])) +
                   noise*np.eye(comp.shape[0]))
            chol  = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = -np.sum(np.log(np.diag(chol))) - 0.5*np.dot(vals-mean, solve)

            # Roll in amplitude lognormal prior
            lp -= 0.5*(np.log(amp2)/self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array(
                [self.mean, self.amp2, self.noise]), logprob, compwise=False)
        self.mean  = hypers[0]
        self.amp2  = hypers[1]
        self.noise = 1e-3

    def optimize_hypers(self, comp, vals):
        mygp = gp.GP(self.cov_func.__name__)
        mygp.real_init(comp.shape[1], vals)
        mygp.optimize_hypers(comp, vals)
        self.mean  = mygp.mean
        self.ls    = mygp.ls
        self.amp2  = mygp.amp2
        self.noise = mygp.noise

        # Save hyperparameter samples
        #self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls))
        #self.dump_hypers()

        return
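

# ---------------------------------------------------------------------------
# Illustrative sketch, added for exposition only: the "fantasize pending
# outcomes" step that both compute_ei implementations above rely on, isolated
# from the chooser classes.  Given the GP predictive mean `pend_m` and
# covariance `pend_K` at the pending inputs, correlated draws are generated as
# m + L z, where L is the lower Cholesky factor of K and z ~ N(0, I).  The
# helper name `_fantasize_example` is hypothetical.
def _fantasize_example(pend_m, pend_K, n_samples):
    import numpy as np
    chol = np.linalg.cholesky(pend_K)                 # lower-triangular factor
    z = np.random.randn(pend_m.shape[0], n_samples)   # standard normal draws
    return np.dot(chol, z) + pend_m[:, None]          # (n_pending, n_samples)
# ---------------------------------------------------------------------------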