Example #1
    def __init__(self,
                 expt_dir,
                 covar="Matern52",
                 mcmc_iters=10,
                 pending_samples=100,
                 noiseless=False,
                 burnin=100,
                 grid_subset=20,
                 use_multiprocessing=True):
        self.cov_func = getattr(gp, covar)
        self.locker = Locker()
        self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl")
        self.stats_file = os.path.join(
            expt_dir, self.__module__ + "_hyperparameters.txt")
        self.mcmc_iters = int(mcmc_iters)
        self.burnin = int(burnin)
        self.needs_burnin = True
        self.pending_samples = int(pending_samples)
        self.D = -1
        self.hyper_iters = 1
        # Number of points to optimize EI over
        self.grid_subset = int(grid_subset)
        self.noiseless = bool(int(noiseless))
        self.hyper_samples = []

        self.noise_scale = 0.1  # horseshoe prior
        self.amp2_scale = 1  # zero-mean log normal prior
        self.max_ls = 2  # top-hat prior on length scales

        # If multiprocessing fails or deadlocks, set this to False
        self.use_multiprocessing = bool(int(use_multiprocessing))
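
A minimal instantiation sketch for the constructor above, assuming it belongs to the GPEIOptChooser class shown in Example #6 and that the Spearmint-style gp module and Locker class are importable; the path and option values are illustrative only. The bool(int(...)) coercions exist because driver scripts typically hand these options over as strings:

chooser = GPEIOptChooser(
    "/tmp/my_experiment",     # expt_dir: where the .pkl state and stats files go
    covar="Matern52",         # resolved against the gp module via getattr
    mcmc_iters=10,            # hyperparameter samples drawn per call to next()
    noiseless="0",            # string flag; bool(int("0")) -> False
    use_multiprocessing="1")  # pass "0" if the worker pool deadlocks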
Example #2
    def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10,
                 pending_samples=100, noiseless=False):
        self.cov_func        = getattr(gp, covar)
        self.locker          = Locker()
        self.state_pkl       = os.path.join(expt_dir, self.__module__ + ".pkl")

        self.mcmc_iters      = int(mcmc_iters)
        self.pending_samples = pending_samples
        self.D               = -1
        self.hyper_iters     = 1
        self.noiseless       = bool(int(noiseless))

        self.noise_scale = 0.1  # horseshoe prior
        self.amp2_scale  = 1    # zero-mean log normal prior
        self.max_ls      = 2    # top-hat prior on length scales
Example #3
    def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10,
                 pending_samples=100, noiseless=False, burnin=100,
                 grid_subset=20, use_multiprocessing=True):
        self.cov_func        = getattr(gp, covar)
        self.locker          = Locker()
        self.state_pkl       = os.path.join(expt_dir, self.__module__ + ".pkl")
        self.stats_file      = os.path.join(expt_dir,
                                   self.__module__ + "_hyperparameters.txt")
        self.mcmc_iters      = int(mcmc_iters)
        self.burnin          = int(burnin)
        self.needs_burnin    = True
        self.pending_samples = int(pending_samples)
        self.D               = -1
        self.hyper_iters     = 1
        # Number of points to optimize EI over
        self.grid_subset     = int(grid_subset)
        self.noiseless       = bool(int(noiseless))
        self.hyper_samples = []

        self.noise_scale = 0.1  # horseshoe prior
        self.amp2_scale  = 1    # zero-mean log normal prior
        self.max_ls      = 2    # top-hat prior on length scales

        # If multiprocessing fails or deadlocks, set this to False
        self.use_multiprocessing = bool(int(use_multiprocessing))
Example #4
    def __init__(self, expt_dir, variables=None, grid_size=None, grid_seed=1):
        self.expt_dir = expt_dir
        self.jobs_pkl = os.path.join(expt_dir, EXPERIMENT_GRID_FILE)
        self.locker = Locker()

        # Only one process at a time is allowed to have access to the grid.
        self.locker.lock_wait(self.jobs_pkl)

        # Set up the grid for the first time if it doesn't exist.
        if variables is not None and not os.path.exists(self.jobs_pkl):
            self.seed = grid_seed
            self.vmap = GridMap(variables, grid_size)
            self.grid = self._hypercube_grid(self.vmap.card(), grid_size)
            self.status = np.zeros(grid_size, dtype=int) + CANDIDATE_STATE
            self.values = np.zeros(grid_size) + np.nan
            self.durs = np.zeros(grid_size) + np.nan
            self.proc_ids = np.zeros(grid_size, dtype=int)
            self._save_jobs()

        # Or load in the grid from the pickled file.
        else:
            self._load_jobs()
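
A fresh grid encodes job state with sentinels: every point starts as a candidate and NaN means "no result yet". A self-contained sketch of that initialization, with an assumed value for CANDIDATE_STATE (the real constant is defined elsewhere in the project):

import numpy as np

CANDIDATE_STATE = 0  # assumption: the actual constant lives in the project

grid_size = 5
status = np.zeros(grid_size, dtype=int) + CANDIDATE_STATE
values = np.zeros(grid_size) + np.nan  # NaN until a job reports a value
durs = np.zeros(grid_size) + np.nan    # NaN until a job reports a duration

print(status)                     # [0 0 0 0 0]
print(np.isfinite(values).any())  # False: nothing has completed yet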
Example #5
class ExperimentGrid:
    @staticmethod
    def job_running(expt_dir, id):
        expt_grid = ExperimentGrid(expt_dir)
        expt_grid.set_running(id)

    @staticmethod
    def job_complete(expt_dir, id, value, duration):
        log("setting job %d complete" % id)
        expt_grid = ExperimentGrid(expt_dir)
        expt_grid.set_complete(id, value, duration)
        log("set...")

    @staticmethod
    def job_broken(expt_dir, id):
        expt_grid = ExperimentGrid(expt_dir)
        expt_grid.set_broken(id)

    def __init__(self, expt_dir, variables=None, grid_size=None, grid_seed=1):
        self.expt_dir = expt_dir
        self.jobs_pkl = os.path.join(expt_dir, EXPERIMENT_GRID_FILE)
        self.locker = Locker()

        # Only one process at a time is allowed to have access to the grid.
        self.locker.lock_wait(self.jobs_pkl)

        # Set up the grid for the first time if it doesn't exist.
        if variables is not None and not os.path.exists(self.jobs_pkl):
            self.seed = grid_seed
            self.vmap = GridMap(variables, grid_size)
            self.grid = self._hypercube_grid(self.vmap.card(), grid_size)
            self.status = np.zeros(grid_size, dtype=int) + CANDIDATE_STATE
            self.values = np.zeros(grid_size) + np.nan
            self.durs = np.zeros(grid_size) + np.nan
            self.proc_ids = np.zeros(grid_size, dtype=int)
            self._save_jobs()

        # Or load in the grid from the pickled file.
        else:
            self._load_jobs()

    def __del__(self):
        self._save_jobs()
        if self.locker.unlock(self.jobs_pkl):
            pass
        else:
            raise Exception("Could not release lock on job grid.\n")

    def get_grid(self):
        return self.grid, self.values, self.durs

    def get_candidates(self):
        return np.nonzero(self.status == CANDIDATE_STATE)[0]

    def get_pending(self):
        return np.nonzero((self.status == SUBMITTED_STATE)
                          | (self.status == RUNNING_STATE))[0]

    def get_complete(self):
        return np.nonzero(self.status == COMPLETE_STATE)[0]

    def get_broken(self):
        return np.nonzero(self.status == BROKEN_STATE)[0]

    def get_params(self, index):
        return self.vmap.get_params(self.grid[index, :])

    def get_best(self):
        finite = self.values[np.isfinite(self.values)]
        if len(finite) > 0:
            cur_min = np.min(finite)
            index = np.nonzero(self.values == cur_min)[0][0]
            return cur_min, index
        else:
            return np.nan, -1

    def get_proc_id(self, id):
        return self.proc_ids[id]

    def add_to_grid(self, candidate):
        # Clamp to [0, 1] to prevent numerical over/underflow from corrupting the grid
        candidate[candidate > 1.0] = 1.0
        candidate[candidate < 0.0] = 0.0

        # Set up the grid
        self.grid = np.vstack((self.grid, candidate))
        self.status = np.append(self.status,
                                np.zeros(1, dtype=int) + int(CANDIDATE_STATE))

        self.values = np.append(self.values, np.zeros(1) + np.nan)
        self.durs = np.append(self.durs, np.zeros(1) + np.nan)
        self.proc_ids = np.append(self.proc_ids, np.zeros(1, dtype=int))

        # Save this out.
        self._save_jobs()
        return self.grid.shape[0] - 1

    def set_candidate(self, id):
        self.status[id] = CANDIDATE_STATE
        self._save_jobs()

    def set_submitted(self, id, proc_id):
        self.status[id] = SUBMITTED_STATE
        self.proc_ids[id] = proc_id
        self._save_jobs()

    def set_running(self, id):
        self.status[id] = RUNNING_STATE
        self._save_jobs()

    def set_complete(self, id, value, duration):
        self.status[id] = COMPLETE_STATE
        self.values[id] = value
        self.durs[id] = duration
        self._save_jobs()

    def set_broken(self, id):
        self.status[id] = BROKEN_STATE
        self._save_jobs()

    def _load_jobs(self):
        fh = open(self.jobs_pkl, 'rb')
        jobs = cPickle.load(fh)
        fh.close()

        self.vmap = jobs['vmap']
        self.grid = jobs['grid']
        self.status = jobs['status']
        self.values = jobs['values']
        self.durs = jobs['durs']
        self.proc_ids = jobs['proc_ids']

    def _save_jobs(self):

        # Write everything to a temporary file first.
        fh = tempfile.NamedTemporaryFile(mode='wb', delete=False)
        cPickle.dump(
            {
                'vmap': self.vmap,
                'grid': self.grid,
                'status': self.status,
                'values': self.values,
                'durs': self.durs,
                'proc_ids': self.proc_ids
            },
            fh,
            protocol=-1)
        fh.close()

        # Use an atomic move for better NFS happiness.
        cmd = 'mv "%s" "%s"' % (fh.name, self.jobs_pkl)
        os.system(cmd)  # TODO: Should check system-dependent return status.

    def _hypercube_grid(self, dims, size):
        # Generate from a sobol sequence
        sobol_grid = np.transpose(i4_sobol_generate(dims, size, self.seed))

        return sobol_grid
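
Hypothetical driver-side usage of the static helpers and get_best above, assuming expt_dir already holds the pickled grid (EXPERIMENT_GRID_FILE); the path, job id, value, and duration are made up for illustration:

expt_dir = "/tmp/my_experiment"
ExperimentGrid.job_running(expt_dir, 3)               # job 3 starts
# ... the job runs ...
ExperimentGrid.job_complete(expt_dir, 3, 0.42, 17.0)  # objective value, duration

grid = ExperimentGrid(expt_dir)
best_val, best_idx = grid.get_best()
print(best_val, grid.get_params(best_idx))

Each static helper constructs a fresh ExperimentGrid, so the lock taken in __init__ and the save in __del__ bracket every state change. The temp-file-then-move in _save_jobs is what makes the on-disk pickle crash-safe; os.rename(fh.name, self.jobs_pkl) would perform the same move atomically without the unchecked os.system return status the TODO flags.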
Example #6
class GPEIOptChooser:
    def __init__(self,
                 expt_dir,
                 covar="Matern52",
                 mcmc_iters=10,
                 pending_samples=100,
                 noiseless=False,
                 burnin=100,
                 grid_subset=20,
                 use_multiprocessing=True):
        self.cov_func = getattr(gp, covar)
        self.locker = Locker()
        self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl")
        self.stats_file = os.path.join(
            expt_dir, self.__module__ + "_hyperparameters.txt")
        self.mcmc_iters = int(mcmc_iters)
        self.burnin = int(burnin)
        self.needs_burnin = True
        self.pending_samples = int(pending_samples)
        self.D = -1
        self.hyper_iters = 1
        # Number of points to optimize EI over
        self.grid_subset = int(grid_subset)
        self.noiseless = bool(int(noiseless))
        self.hyper_samples = []

        self.noise_scale = 0.1  # horseshoe prior
        self.amp2_scale = 1  # zero-mean log normal prior
        self.max_ls = 2  # top-hat prior on length scales

        # If multiprocessing fails or deadlocks, set this to False
        self.use_multiprocessing = bool(int(use_multiprocessing))

    def dump_hypers(self):
        self.locker.lock_wait(self.state_pkl)

        # Write the hyperparameters out to a Pickle.
        fh = tempfile.NamedTemporaryFile(mode='wb', delete=False)
        pickle.dump(
            {
                'dims': self.D,
                'ls': self.ls,
                'amp2': self.amp2,
                'noise': self.noise,
                'hyper_samples': self.hyper_samples,
                'mean': self.mean
            }, fh)
        fh.close()

        # Use an atomic move for better NFS happiness.
        cmd = 'mv "%s" "%s"' % (fh.name, self.state_pkl)
        os.system(cmd)  # TODO: Should check system-dependent return status.

        self.locker.unlock(self.state_pkl)

        # Write the hyperparameters out to a human readable file as well
        fh = open(self.stats_file, 'w')
        fh.write('Mean Noise Amplitude <length scales>\n')
        fh.write('-----------ALL SAMPLES-------------\n')
        meanhyps = 0 * np.hstack(self.hyper_samples[0])
        for i in self.hyper_samples:
            hyps = np.hstack(i)
            meanhyps += (1 / float(len(self.hyper_samples))) * hyps
            for j in hyps:
                fh.write(str(j) + ' ')
            fh.write('\n')

        fh.write('-----------MEAN OF SAMPLES-------------\n')
        for j in meanhyps:
            fh.write(str(j) + ' ')
        fh.write('\n')
        fh.close()

    # This passes out html or javascript to display interesting
    # stats - such as the length scales (sensitivity to various
    # dimensions).
    def generate_stats_html(self):
        # Need this because the model may not necessarily be
        # initialized when this code is called.
        if not self._read_only():
            return 'Chooser not yet ready to display output'

        mean_mean = np.mean(np.vstack([h[0] for h in self.hyper_samples]))
        mean_noise = np.mean(np.vstack([h[1] for h in self.hyper_samples]))
        mean_ls = np.mean(
            np.vstack([h[3][np.newaxis, :] for h in self.hyper_samples]), 0)

        try:
            output = (
                '<br /><span class=\"label label-info\">Estimated mean:</span> '
                + str(mean_mean) +
                '<br /><span class=\"label label-info\">Estimated noise:</span> '
                + str(mean_noise) +
                '<br /><br /><span class=\"label label-info\">Inverse parameter sensitivity'
                + ' - Gaussian Process length scales</span><br /><br />' +
                '<div id=\"lschart\"></div><script type=\"text/javascript\">' +
                'var lsdata = [' + ','.join(['%.2f' % i
                                             for i in mean_ls]) + '];')
        except Exception:
            return 'Chooser not yet ready to display output.'

        output += ('bar_chart("#lschart", lsdata, ' + str(self.max_ls) + ');' +
                   '</script>')
        return output

    # Read in the chooser from file. Returns True only on success
    def _read_only(self):
        if os.path.exists(self.state_pkl):
            fh = open(self.state_pkl, 'rb')
            state = pickle.load(fh)
            fh.close()

            self.D = state['dims']
            self.ls = state['ls']
            self.amp2 = state['amp2']
            self.noise = state['noise']
            self.mean = state['mean']
            self.hyper_samples = state['hyper_samples']
            self.needs_burnin = False
            return True

        return False

    def _real_init(self, dims, values):
        self.locker.lock_wait(self.state_pkl)

        self.randomstate = npr.get_state()
        if os.path.exists(self.state_pkl):
            fh = open(self.state_pkl, 'rb')
            state = pickle.load(fh)
            fh.close()

            self.D = state['dims']
            self.ls = state['ls']
            self.amp2 = state['amp2']
            self.noise = state['noise']
            self.mean = state['mean']
            self.hyper_samples = state['hyper_samples']
            self.needs_burnin = False
        else:

            # Input dimensionality.
            self.D = dims

            # Initial length scales.
            self.ls = np.ones(self.D)

            # Initial amplitude.
            self.amp2 = np.std(values) + 1e-4

            # Initial observation noise.
            self.noise = 1e-3

            # Initial mean.
            self.mean = np.mean(values)

            # Save hyperparameter samples
            self.hyper_samples.append(
                (self.mean, self.noise, self.amp2, self.ls))

        self.locker.unlock(self.state_pkl)

    def cov(self, x1, x2=None):
        if x2 is None:
            return self.amp2 * (self.cov_func(self.ls, x1, None) +
                                1e-6 * np.eye(x1.shape[0]))
        else:
            return self.amp2 * self.cov_func(self.ls, x1, x2)

    # Given a set of completed 'experiments' in the unit hypercube with
    # corresponding objective 'values', pick the next experiment to run
    # according to the acquisition function.
    def next(self, grid, values, durations, candidates, pending, complete):

        # Don't bother using fancy GP stuff at first.
        if complete.shape[0] < 2:
            return int(candidates[0])

        # Perform the real initialization.
        if self.D == -1:
            self._real_init(grid.shape[1], values[complete])

        # Grab out the relevant sets.
        comp = grid[complete, :]
        cand = grid[candidates, :]
        pend = grid[pending, :]
        vals = values[complete]
        numcand = cand.shape[0]

        # Spray a set of candidates around the min so far
        best_comp = np.argmin(vals)
        cand2 = np.vstack(
            (np.random.randn(10, comp.shape[1]) * 0.001 + comp[best_comp, :],
             cand))

        if self.mcmc_iters > 0:

            # Possibly burn in.
            if self.needs_burnin:
                for mcmc_iter in range(self.burnin):
                    self.sample_hypers(comp, vals)
                    log("BURN %d/%d] mean: %.2f  amp: %.2f "
                        "noise: %.4f  min_ls: %.4f  max_ls: %.4f" %
                        (mcmc_iter + 1, self.burnin, self.mean,
                         np.sqrt(self.amp2), self.noise, np.min(
                             self.ls), np.max(self.ls)))
                self.needs_burnin = False

            # Sample from hyperparameters.
            # Adjust the candidates to hit ei peaks
            self.hyper_samples = []
            for mcmc_iter in range(self.mcmc_iters):
                self.sample_hypers(comp, vals)
                log("%d/%d] mean: %.2f  amp: %.2f  noise: %.4f "
                    "min_ls: %.4f  max_ls: %.4f" %
                    (mcmc_iter + 1, self.mcmc_iters, self.mean,
                     np.sqrt(self.amp2), self.noise, np.min(
                         self.ls), np.max(self.ls)))
            self.dump_hypers()

            b = []  # optimization bounds
            for i in range(0, cand.shape[1]):
                b.append((0, 1))

            overall_ei = self.ei_over_hypers(comp, pend, cand2, vals)
            inds = np.argsort(np.mean(overall_ei, axis=1))[-self.grid_subset:]
            cand2 = cand2[inds, :]

            # Optimize each point in parallel
            if self.use_multiprocessing:
                pool = multiprocessing.Pool(self.grid_subset)
                results = [
                    pool.apply_async(optimize_pt,
                                     args=(c, b, comp, pend, vals,
                                           copy.copy(self))) for c in cand2
                ]
                for res in results:
                    cand = np.vstack((cand, res.get(1e8)))
                pool.close()
            else:
                # Fallback: optimize each candidate serially (no multiprocessing).
                for i in range(0, cand2.shape[0]):
                    log("Optimizing candidate %d/%d" % (i + 1, cand2.shape[0]))
                    #self.check_grad_ei(cand2[i,:].flatten(), comp, pend, vals)
                    ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei_over_hypers,
                                            cand2[i, :].flatten(),
                                            args=(comp, pend, vals),
                                            bounds=b,
                                            disp=0)
                    cand2[i, :] = ret[0]
                cand = np.vstack((cand, cand2))

            overall_ei = self.ei_over_hypers(comp, pend, cand, vals)
            best_cand = np.argmax(np.mean(overall_ei, axis=1))

            if (best_cand >= numcand):
                return (int(numcand), cand[best_cand, :])

            return int(candidates[best_cand])

        else:
            # Optimize hyperparameters
            self.optimize_hypers(comp, vals)

            log("mean: %.2f  amp: %.2f  noise: %.4f  "
                "min_ls: %.4f  max_ls: %.4f" % (self.mean, np.sqrt(
                    self.amp2), self.noise, np.min(self.ls), np.max(self.ls)))

            # Optimize over EI
            b = []  # optimization bounds
            for i in range(0, cand.shape[1]):
                b.append((0, 1))

            for i in range(0, cand2.shape[0]):
                ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei,
                                        cand2[i, :].flatten(),
                                        args=(comp, vals, True),
                                        bounds=b,
                                        disp=0)
                cand2[i, :] = ret[0]
            cand = np.vstack((cand, cand2))

            ei = self.compute_ei(comp, pend, cand, vals)
            best_cand = np.argmax(ei)

            if (best_cand >= numcand):
                return (int(numcand), cand[best_cand, :])

            return int(candidates[best_cand])

    # Compute EI over hyperparameter samples
    def ei_over_hypers(self, comp, pend, cand, vals):
        overall_ei = np.zeros((cand.shape[0], self.mcmc_iters))
        for mcmc_iter in range(self.mcmc_iters):
            hyper = self.hyper_samples[mcmc_iter]
            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]
            overall_ei[:, mcmc_iter] = self.compute_ei(comp, pend, cand, vals)
        return overall_ei

    def check_grad_ei(self, cand, comp, pend, vals):
        (ei, dx1) = self.grad_optimize_ei_over_hypers(cand, comp, pend, vals)
        dx2 = dx1 * 0
        idx = np.zeros(cand.shape[0])
        for i in range(0, cand.shape[0]):
            idx[i] = 1e-6
            (ei1,
             tmp) = self.grad_optimize_ei_over_hypers(cand + idx, comp, pend,
                                                      vals)
            (ei2,
             tmp) = self.grad_optimize_ei_over_hypers(cand - idx, comp, pend,
                                                      vals)
            dx2[i] = (ei1 - ei2) / (2 * 1e-6)  # central finite difference
            idx[i] = 0
        print('computed grads', dx1)
        print('finite diffs', dx2)
        print((dx1 / dx2))
        print(np.sum((dx1 - dx2)**2))
        time.sleep(2)

    # Adjust points by optimizing EI over a set of hyperparameter samples
    def grad_optimize_ei_over_hypers(self,
                                     cand,
                                     comp,
                                     pend,
                                     vals,
                                     compute_grad=True):
        summed_ei = 0
        summed_grad_ei = np.zeros(cand.shape).flatten()
        ls = self.ls.copy()
        amp2 = self.amp2
        mean = self.mean
        noise = self.noise

        for hyper in self.hyper_samples:
            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]
            if compute_grad:
                (ei, g_ei) = self.grad_optimize_ei(cand, comp, pend, vals,
                                                   compute_grad)
                summed_grad_ei = summed_grad_ei + g_ei
            else:
                ei = self.grad_optimize_ei(cand, comp, pend, vals,
                                           compute_grad)
            summed_ei += ei

        self.mean = mean
        self.amp2 = amp2
        self.noise = noise
        self.ls = ls.copy()

        if compute_grad:
            return (summed_ei, summed_grad_ei)
        else:
            return summed_ei

    # Adjust points based on optimizing their ei
    def grad_optimize_ei(self, cand, comp, pend, vals, compute_grad=True):
        if pend.shape[0] == 0:
            best = np.min(vals)
            cand = np.reshape(cand, (-1, comp.shape[1]))

            # The primary covariances for prediction.
            comp_cov = self.cov(comp)
            cand_cross = self.cov(comp, cand)

            # Compute the required Cholesky.
            obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0])
            obsv_chol = spla.cholesky(obsv_cov, lower=True)

            cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__)
            cand_cross_grad = cov_grad_func(self.ls, comp, cand)

            # Predictive things.
            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v)
            u = (best - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            if not compute_grad:
                return ei

            # Gradients of ei w.r.t. mean and variance
            g_ei_m = -ncdf
            g_ei_s2 = 0.5 * npdf / func_s

            # Apply covariance function
            grad_cross = np.squeeze(cand_cross_grad)

            grad_xp_m = np.dot(alpha.transpose(), grad_cross)
            grad_xp_v = np.dot(
                -2 * spla.cho_solve((obsv_chol, True), cand_cross).transpose(),
                grad_cross)

            grad_xp = 0.5 * self.amp2 * (grad_xp_m * g_ei_m +
                                         grad_xp_v * g_ei_s2)
            ei = -np.sum(ei)

            return ei, grad_xp.flatten()

        else:
            # If there are pending experiments, fantasize their outcomes.
            cand = np.reshape(cand, (-1, comp.shape[1]))

            # Create a composite vector of complete and pending.
            comp_pend = np.concatenate((comp, pend))

            # Compute the covariance and Cholesky decomposition.
            comp_pend_cov = (self.cov(comp_pend) +
                             self.noise * np.eye(comp_pend.shape[0]))
            comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True)

            # Compute submatrices.
            pend_cross = self.cov(comp, pend)
            pend_kappa = self.cov(pend)

            # Use the sub-Cholesky.
            obsv_chol = comp_pend_chol[:comp.shape[0], :comp.shape[0]]

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.cho_solve((obsv_chol, True), pend_cross)

            # Finding predictive means and variances.
            pend_m = np.dot(pend_cross.T, alpha) + self.mean
            pend_K = pend_kappa - np.dot(pend_cross.T, beta)

            # Take the Cholesky of the predictive covariance.
            pend_chol = spla.cholesky(pend_K, lower=True)

            # Make predictions.
            npr.set_state(self.randomstate)
            pend_fant = np.dot(
                pend_chol, npr.randn(pend.shape[0],
                                     self.pending_samples)) + pend_m[:, None]

            # Include the fantasies.
            fant_vals = np.concatenate(
                (np.tile(vals[:, np.newaxis],
                         (1, self.pending_samples)), pend_fant))

            # Compute bests over the fantasies.
            bests = np.min(fant_vals, axis=0)

            # Now generalize from these fantasies.
            cand_cross = self.cov(comp_pend, cand)
            cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__)
            cand_cross_grad = cov_grad_func(self.ls, comp_pend, cand)

            # Solve the linear systems.
            alpha = spla.cho_solve((comp_pend_chol, True),
                                   fant_vals - self.mean)
            beta = spla.solve_triangular(comp_pend_chol,
                                         cand_cross,
                                         lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v[:, np.newaxis])
            u = (bests[np.newaxis, :] - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            # Gradients of ei w.r.t. mean and variance
            g_ei_m = -ncdf
            g_ei_s2 = 0.5 * npdf / func_s

            # Apply covariance function
            # np.squeeze can collapse the wrong axis in the 1-D case, so squeeze explicitly
            if pend.shape[1] == 1:
                grad_cross = np.squeeze(cand_cross_grad, axis=(2, ))
            else:
                grad_cross = np.squeeze(cand_cross_grad)

            grad_xp_m = np.dot(alpha.transpose(), grad_cross)
            grad_xp_v = np.dot(
                -2 * spla.cho_solve(
                    (comp_pend_chol, True), cand_cross).transpose(),
                grad_cross)

            grad_xp = 0.5 * self.amp2 * (
                grad_xp_m * np.tile(g_ei_m, (comp.shape[1], 1)).T +
                (grad_xp_v.T * g_ei_s2).T)
            ei = -np.mean(ei, axis=1)
            grad_xp = np.mean(grad_xp, axis=0)

            return ei, grad_xp.flatten()

    def compute_ei(self, comp, pend, cand, vals):
        if pend.shape[0] == 0:
            # If there are no pending, don't do anything fancy.

            # Current best.
            best = np.min(vals)

            # The primary covariances for prediction.
            comp_cov = self.cov(comp)
            cand_cross = self.cov(comp, cand)

            # Compute the required Cholesky.
            obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0])
            obsv_chol = spla.cholesky(obsv_cov, lower=True)

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v)
            u = (best - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            return ei
        else:
            # If there are pending experiments, fantasize their outcomes.

            # Create a composite vector of complete and pending.
            comp_pend = np.concatenate((comp, pend))

            # Compute the covariance and Cholesky decomposition.
            comp_pend_cov = (self.cov(comp_pend) +
                             self.noise * np.eye(comp_pend.shape[0]))
            comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True)

            # Compute submatrices.
            pend_cross = self.cov(comp, pend)
            pend_kappa = self.cov(pend)

            # Use the sub-Cholesky.
            obsv_chol = comp_pend_chol[:comp.shape[0], :comp.shape[0]]

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.cho_solve((obsv_chol, True), pend_cross)

            # Finding predictive means and variances.
            pend_m = np.dot(pend_cross.T, alpha) + self.mean
            pend_K = pend_kappa - np.dot(pend_cross.T, beta)

            # Take the Cholesky of the predictive covariance.
            pend_chol = spla.cholesky(pend_K, lower=True)

            # Make predictions.
            npr.set_state(self.randomstate)
            pend_fant = np.dot(
                pend_chol, npr.randn(pend.shape[0],
                                     self.pending_samples)) + pend_m[:, None]

            # Include the fantasies.
            fant_vals = np.concatenate(
                (np.tile(vals[:, np.newaxis],
                         (1, self.pending_samples)), pend_fant))

            # Compute bests over the fantasies.
            bests = np.min(fant_vals, axis=0)

            # Now generalize from these fantasies.
            cand_cross = self.cov(comp_pend, cand)

            # Solve the linear systems.
            alpha = spla.cho_solve((comp_pend_chol, True),
                                   fant_vals - self.mean)
            beta = spla.solve_triangular(comp_pend_chol,
                                         cand_cross,
                                         lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v[:, np.newaxis])
            u = (bests[np.newaxis, :] - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            return np.mean(ei, axis=1)

    def sample_hypers(self, comp, vals):
        if self.noiseless:
            self.noise = 1e-3
            self._sample_noiseless(comp, vals)
        else:
            self._sample_noisy(comp, vals)
        self._sample_ls(comp, vals)
        self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls))

    def _sample_ls(self, comp, vals):
        def logprob(ls):
            if np.any(ls < 0) or np.any(ls > self.max_ls):
                return -np.inf

            cov = (
                self.amp2 *
                (self.cov_func(ls, comp, None) + 1e-6 * np.eye(comp.shape[0]))
                + self.noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - self.mean)
            lp = (-np.sum(np.log(np.diag(chol))) -
                  0.5 * np.dot(vals - self.mean, solve))
            return lp

        self.ls = util.slice_sample(self.ls, logprob, compwise=True)

    def _sample_noisy(self, comp, vals):
        def logprob(hypers):
            mean = hypers[0]
            amp2 = hypers[1]
            noise = hypers[2]

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf

            if amp2 < 0 or noise < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.ls, comp, None) +
                           1e-6 * np.eye(comp.shape[0])) +
                   noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(
                vals - mean, solve)

            # Roll in noise horseshoe prior.
            lp += np.log(np.log(1 + (self.noise_scale / noise)**2))

            # Roll in amplitude lognormal prior
            lp -= 0.5 * (np.log(np.sqrt(amp2)) / self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2,
                                             self.noise]),
                                   logprob,
                                   compwise=False)
        self.mean = hypers[0]
        self.amp2 = hypers[1]
        self.noise = hypers[2]

    def _sample_noiseless(self, comp, vals):
        def logprob(hypers):
            mean = hypers[0]
            amp2 = hypers[1]
            noise = 1e-3

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf

            if amp2 < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.ls, comp, None) +
                           1e-6 * np.eye(comp.shape[0])) +
                   noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(
                vals - mean, solve)

            # Roll in amplitude lognormal prior
            lp -= 0.5 * (np.log(np.sqrt(amp2)) / self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2,
                                             self.noise]),
                                   logprob,
                                   compwise=False)
        self.mean = hypers[0]
        self.amp2 = hypers[1]
        self.noise = 1e-3

    def optimize_hypers(self, comp, vals):
        mygp = gp.GP(self.cov_func.__name__)
        mygp.real_init(comp.shape[1], vals)
        mygp.optimize_hypers(comp, vals)
        self.mean = mygp.mean
        self.ls = mygp.ls
        self.amp2 = mygp.amp2
        self.noise = mygp.noise

        # Save hyperparameter samples
        self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls))
        self.dump_hypers()

        return
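
The acquisition function inside compute_ei is the closed-form expected improvement for minimization: with predictive mean m, predictive standard deviation s, and incumbent best, u = (best - m) / s and EI = s * (u * Phi(u) + phi(u)). A standalone numeric sketch of just that formula, independent of the chooser (the candidate means and standard deviations are invented):

import numpy as np
import scipy.stats as sps

best = 1.0                          # lowest objective value observed so far
func_m = np.array([0.8, 1.0, 1.5])  # predictive means at three candidates
func_s = np.array([0.2, 0.2, 0.2])  # predictive standard deviations

u = (best - func_m) / func_s
ei = func_s * (u * sps.norm.cdf(u) + sps.norm.pdf(u))
print(ei)  # largest for the candidate whose mean beats the incumbent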
Example #7
class GPEIperSecChooser:
    def __init__(self,
                 expt_dir,
                 covar="Matern52",
                 mcmc_iters=10,
                 pending_samples=100,
                 noiseless=False,
                 burnin=100,
                 grid_subset=20):
        self.cov_func = getattr(gp, covar)
        self.locker = Locker()
        self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl")

        self.stats_file = os.path.join(
            expt_dir, self.__module__ + "_hyperparameters.txt")
        self.mcmc_iters = int(mcmc_iters)
        self.burnin = int(burnin)
        self.needs_burnin = True
        self.pending_samples = pending_samples
        self.D = -1
        self.hyper_iters = 1
        # Number of points to optimize EI over
        self.grid_subset = int(grid_subset)
        self.noiseless = bool(int(noiseless))
        self.hyper_samples = []
        self.time_hyper_samples = []

        self.noise_scale = 0.1  # horseshoe prior
        self.amp2_scale = 1  # zero-mean log normal prior
        self.max_ls = 10  # top-hat prior on length scales

        self.time_noise_scale = 0.1  # horseshoe prior
        self.time_amp2_scale = 1  # zero-mean log normal prior
        self.time_max_ls = 10  # top-hat prior on length scales

    # A simple function to dump out hyperparameters to allow for a hot start
    # if the optimization is restarted.
    def dump_hypers(self):
        self.locker.lock_wait(self.state_pkl)

        # Write the hyperparameters out to a Pickle.
        fh = tempfile.NamedTemporaryFile(mode='wb', delete=False)
        cPickle.dump(
            {
                'dims': self.D,
                'ls': self.ls,
                'amp2': self.amp2,
                'noise': self.noise,
                'mean': self.mean,
                'time_ls': self.time_ls,
                'time_amp2': self.time_amp2,
                'time_noise': self.time_noise,
                'time_mean': self.time_mean
            }, fh)
        fh.close()

        # Use an atomic move for better NFS happiness.
        cmd = 'mv "%s" "%s"' % (fh.name, self.state_pkl)
        os.system(cmd)  # TODO: Should check system-dependent return status.

        self.locker.unlock(self.state_pkl)

    def _real_init(self, dims, values, durations):
        self.locker.lock_wait(self.state_pkl)

        if os.path.exists(self.state_pkl):
            fh = open(self.state_pkl, 'rb')
            state = cPickle.load(fh)
            fh.close()

            self.D = state['dims']
            self.ls = state['ls']
            self.amp2 = state['amp2']
            self.noise = state['noise']
            self.mean = state['mean']
            self.time_ls = state['time_ls']
            self.time_amp2 = state['time_amp2']
            self.time_noise = state['time_noise']
            self.time_mean = state['time_mean']
        else:

            # Input dimensionality.
            self.D = dims

            # Initial length scales.
            self.ls = np.ones(self.D)
            self.time_ls = np.ones(self.D)

            # Initial amplitude.
            self.amp2 = np.std(values) + 1e-4
            self.time_amp2 = np.std(durations) + 1e-4

            # Initial observation noise.
            self.noise = 1e-3
            self.time_noise = 1e-3

            # Initial mean.
            self.mean = np.mean(values)
            self.time_mean = np.mean(np.log(durations))

        self.locker.unlock(self.state_pkl)

    def cov(self, amp2, ls, x1, x2=None):
        if x2 is None:
            return amp2 * (self.cov_func(ls, x1, None) +
                           1e-6 * np.eye(x1.shape[0]))
        else:
            return amp2 * self.cov_func(ls, x1, x2)

    # Given a set of completed 'experiments' in the unit hypercube with
    # corresponding objective 'values', pick the next experiment to run
    # according to the acquisition function.
    def next(self, grid, values, durations, candidates, pending, complete):

        # Don't bother using fancy GP stuff at first.
        if complete.shape[0] < 2:
            return int(candidates[0])

        # Perform the real initialization.
        if self.D == -1:
            self._real_init(grid.shape[1], values[complete],
                            durations[complete])

        # Grab out the relevant sets.
        comp = grid[complete, :]
        cand = grid[candidates, :]
        pend = grid[pending, :]
        vals = values[complete]
        durs = durations[complete]

        # Bring time into the log domain before we do anything
        # to maintain strict positivity
        durs = np.log(durs)

        # Spray a set of candidates around the min so far
        numcand = cand.shape[0]
        best_comp = np.argmin(vals)
        cand2 = np.vstack(
            (np.random.randn(10, comp.shape[1]) * 0.001 + comp[best_comp, :],
             cand))

        if self.mcmc_iters > 0:

            # Possibly burn in.
            if self.needs_burnin:
                for mcmc_iter in range(self.burnin):
                    self.sample_hypers(comp, vals, durs)
                    log("BURN %d/%d] mean: %.2f  amp: %.2f "
                        "noise: %.4f  min_ls: %.4f  max_ls: %.4f" %
                        (mcmc_iter + 1, self.burnin, self.mean,
                         np.sqrt(self.amp2), self.noise, np.min(
                             self.ls), np.max(self.ls)))
                self.needs_burnin = False

            # Sample from hyperparameters.
            # Adjust the candidates to hit ei/sec peaks
            self.hyper_samples = []
            for mcmc_iter in range(self.mcmc_iters):
                self.sample_hypers(comp, vals, durs)
                log("%d/%d] mean: %.2f  amp: %.2f  noise: %.4f "
                    "min_ls: %.4f  max_ls: %.4f" %
                    (mcmc_iter + 1, self.mcmc_iters, self.mean,
                     np.sqrt(self.amp2), self.noise, np.min(
                         self.ls), np.max(self.ls)))

                log("%d/%d] time_mean: %.2fs time_amp: %.2f  time_noise: %.4f "
                    "time_min_ls: %.4f  time_max_ls: %.4f" %
                    (mcmc_iter + 1, self.mcmc_iters, np.exp(self.time_mean),
                     np.sqrt(self.time_amp2), np.exp(self.time_noise),
                     np.min(self.time_ls), np.max(self.time_ls)))
            self.dump_hypers()

            # Pick the top candidates to optimize over
            overall_ei = self.ei_over_hypers(comp, pend, cand2, vals, durs)
            inds = np.argsort(np.mean(overall_ei, axis=1))[-self.grid_subset:]
            cand2 = cand2[inds, :]

            # Adjust the candidates to hit ei peaks
            b = []  # optimization bounds
            for i in range(0, cand.shape[1]):
                b.append((0, 1))

            for i in range(0, cand2.shape[0]):
                log("Optimizing candidate %d/%d" % (i + 1, cand2.shape[0]))
                ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei_over_hypers,
                                        cand2[i, :].flatten(),
                                        args=(comp, vals, durs, True),
                                        bounds=b,
                                        disp=0)
                cand2[i, :] = ret[0]

            cand = np.vstack((cand, cand2))

            overall_ei = self.ei_over_hypers(comp, pend, cand, vals, durs)
            best_cand = np.argmax(np.mean(overall_ei, axis=1))
            self.dump_hypers()
            if (best_cand >= numcand):
                return (int(numcand), cand[best_cand, :])

            return int(candidates[best_cand])

        else:
            # Optimize hyperparameters
            self.optimize_hypers(comp, vals, durs)

            log("mean: %f  amp: %f  noise: %f "
                "min_ls: %f  max_ls: %f" % (self.mean, np.sqrt(
                    self.amp2), self.noise, np.min(self.ls), np.max(self.ls)))

            # Pick the top candidates to optimize over
            ei = self.compute_ei_per_s(comp, pend, cand2, vals, durs)
            inds = np.argsort(ei)[-self.grid_subset:]
            cand2 = cand2[inds, :]

            # Adjust the candidates to hit ei peaks
            b = []  # optimization bounds
            for i in range(0, cand.shape[1]):
                b.append((0, 1))

            for i in range(0, cand2.shape[0]):
                log("Optimizing candidate %d/%d" % (i + 1, cand2.shape[0]))
                ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei,
                                        cand2[i, :].flatten(),
                                        args=(comp, vals, durs, True),
                                        bounds=b,
                                        disp=0)
                cand2[i, :] = ret[0]

            cand = np.vstack((cand, cand2))
            ei = self.compute_ei_per_s(comp, pend, cand, vals, durs)

            best_cand = np.argmax(ei)
            self.dump_hypers()

            if (best_cand >= numcand):
                return (int(numcand), cand[best_cand, :])

            return int(candidates[best_cand])

    # Compute EI over hyperparameter samples
    def ei_over_hypers(self, comp, pend, cand, vals, durs):
        overall_ei = np.zeros((cand.shape[0], self.mcmc_iters))
        for mcmc_iter in range(self.mcmc_iters):
            hyper = self.hyper_samples[mcmc_iter]
            time_hyper = self.time_hyper_samples[mcmc_iter]
            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]

            self.time_mean = time_hyper[0]
            self.time_noise = time_hyper[1]
            self.time_amp2 = time_hyper[2]
            self.time_ls = time_hyper[3]

            overall_ei[:, mcmc_iter] = self.compute_ei_per_s(
                comp, pend, cand, vals, durs.squeeze())

        return overall_ei

    def check_grad_ei_per(self, cand, comp, vals, durs):
        (ei, dx1) = self.grad_optimize_ei_over_hypers(cand, comp, vals, durs)
        dx2 = dx1 * 0
        idx = np.zeros(cand.shape[0])
        for i in range(0, cand.shape[0]):
            idx[i] = 1e-6
            (ei1,
             tmp) = self.grad_optimize_ei_over_hypers(cand + idx, comp, vals,
                                                      durs)
            (ei2,
             tmp) = self.grad_optimize_ei_over_hypers(cand - idx, comp, vals,
                                                      durs)
            dx2[i] = (ei1 - ei2) / (2 * 1e-6)  # central finite difference
            idx[i] = 0
        print('computed grads', dx1)
        print('finite diffs', dx2)
        print(dx1 / dx2)
        print(np.sum((dx1 - dx2)**2))
        time.sleep(2)

    # Adjust points by optimizing EI over a set of hyperparameter samples
    def grad_optimize_ei_over_hypers(self,
                                     cand,
                                     comp,
                                     vals,
                                     durs,
                                     compute_grad=True):
        summed_ei = 0
        summed_grad_ei = np.zeros(cand.shape).flatten()

        for mcmc_iter in range(self.mcmc_iters):
            hyper = self.hyper_samples[mcmc_iter]
            time_hyper = self.time_hyper_samples[mcmc_iter]
            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]

            self.time_mean = time_hyper[0]
            self.time_noise = time_hyper[1]
            self.time_amp2 = time_hyper[2]
            self.time_ls = time_hyper[3]

            if compute_grad:
                (ei, g_ei) = self.grad_optimize_ei(cand, comp, vals, durs,
                                                   compute_grad)
                summed_grad_ei = summed_grad_ei + g_ei
            else:
                ei = self.grad_optimize_ei(cand, comp, vals, durs,
                                           compute_grad)

            summed_ei += ei

        if compute_grad:
            return (summed_ei, summed_grad_ei)
        else:
            return summed_ei

    def grad_optimize_ei(self, cand, comp, vals, durs, compute_grad=True):
        # Here we have to compute the gradients for ei per second
        # This means deriving through the two kernels, the one for predicting
        # time and the one predicting ei
        best = np.min(vals)
        cand = np.reshape(cand, (-1, comp.shape[1]))

        # First we make predictions for the durations
        # Compute covariances
        comp_time_cov = self.cov(self.time_amp2, self.time_ls, comp)
        cand_time_cross = self.cov(self.time_amp2, self.time_ls, comp, cand)

        # Cholesky decompositions
        obsv_time_cov = comp_time_cov + self.time_noise * np.eye(comp.shape[0])
        obsv_time_chol = spla.cholesky(obsv_time_cov, lower=True)

        # Linear systems
        t_alpha = spla.cho_solve((obsv_time_chol, True), durs - self.time_mean)

        # Predict marginal mean times and (possibly) variances
        func_time_m = np.dot(cand_time_cross.T, t_alpha) + self.time_mean

        # We don't really need the time variances now
        #func_time_v = self.time_amp2*(1+1e-6) - np.sum(t_beta**2, axis=0)

        # Bring time out of the log domain
        func_time_m = np.exp(func_time_m)

        # Compute derivative of cross-distances.
        grad_cross_r = gp.grad_dist2(self.time_ls, comp, cand)

        # Apply covariance function
        cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__)
        cand_cross_grad = cov_grad_func(self.time_ls, comp, cand)
        grad_cross_t = np.squeeze(cand_cross_grad)

        # Now compute the gradients w.r.t. ei
        # The primary covariances for prediction.
        comp_cov = self.cov(self.amp2, self.ls, comp)
        cand_cross = self.cov(self.amp2, self.ls, comp, cand)

        # Compute the required Cholesky.
        obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0])
        obsv_chol = spla.cholesky(obsv_cov, lower=True)

        cand_cross_grad = cov_grad_func(self.ls, comp, cand)

        # Predictive things.
        # Solve the linear systems.
        alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
        beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

        # Predict the marginal means and variances at candidates.
        func_m = np.dot(cand_cross.T, alpha) + self.mean
        func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

        # Expected improvement
        func_s = np.sqrt(func_v)
        u = (best - func_m) / func_s
        ncdf = sps.norm.cdf(u)
        npdf = sps.norm.pdf(u)
        ei = func_s * (u * ncdf + npdf)

        ei_per_s = -np.sum(ei / func_time_m)
        if not compute_grad:
            return ei_per_s

        grad_time_xp_m = np.dot(t_alpha.transpose(), grad_cross_t)

        # Gradients of ei w.r.t. mean and variance
        g_ei_m = -ncdf
        g_ei_s2 = 0.5 * npdf / func_s

        # Apply covariance function
        grad_cross = np.squeeze(cand_cross_grad)

        grad_xp_m = np.dot(alpha.transpose(), grad_cross)
        grad_xp_v = np.dot(
            -2 * spla.cho_solve((obsv_chol, True), cand_cross).transpose(),
            grad_cross)

        grad_xp = 0.5 * self.amp2 * (grad_xp_m * g_ei_m + grad_xp_v * g_ei_s2)
        grad_time_xp_m = 0.5 * self.time_amp2 * grad_time_xp_m * func_time_m
        grad_xp = (func_time_m * grad_xp - ei * grad_time_xp_m) / (func_time_m
                                                                   **2)

        return ei_per_s, grad_xp.flatten()

    def compute_ei_per_s(self, comp, pend, cand, vals, durs):
        # First we make predictions for the durations as that
        # doesn't depend on pending experiments

        # Compute covariances
        comp_time_cov = self.cov(self.time_amp2, self.time_ls, comp)
        cand_time_cross = self.cov(self.time_amp2, self.time_ls, comp, cand)

        # Cholesky decompositions
        obsv_time_cov = comp_time_cov + self.time_noise * np.eye(comp.shape[0])
        obsv_time_chol = spla.cholesky(obsv_time_cov, lower=True)

        # Linear systems
        t_alpha = spla.cho_solve((obsv_time_chol, True), durs - self.time_mean)
        #t_beta   = spla.solve_triangular(obsv_time_chol, cand_time_cross, lower=True)

        # Predict marginal mean times and (possibly) variances
        func_time_m = np.dot(cand_time_cross.T, t_alpha) + self.time_mean
        # We don't really need the time variances now
        #func_time_v = self.time_amp2*(1+1e-6) - np.sum(t_beta**2, axis=0)

        # Bring time out of the log domain
        func_time_m = np.exp(func_time_m)

        if pend.shape[0] == 0:
            # If there are no pending, don't do anything fancy.

            # Current best.
            best = np.min(vals)

            # The primary covariances for prediction.
            comp_cov = self.cov(self.amp2, self.ls, comp)
            cand_cross = self.cov(self.amp2, self.ls, comp, cand)

            # Compute the required Cholesky.
            obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0])
            obsv_chol = spla.cholesky(obsv_cov, lower=True)

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v)
            u = (best - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            ei_per_s = ei / func_time_m
            return ei_per_s
        else:
            # If there are pending experiments, fantasize their outcomes.

            # Create a composite vector of complete and pending.
            comp_pend = np.concatenate((comp, pend))

            # Compute the covariance and Cholesky decomposition.
            comp_pend_cov = self.cov(
                self.amp2, self.ls,
                comp_pend) + self.noise * np.eye(comp_pend.shape[0])
            comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True)

            # Compute submatrices.
            pend_cross = self.cov(self.amp2, self.ls, comp, pend)
            pend_kappa = self.cov(self.amp2, self.ls, pend)

            # Use the sub-Cholesky.
            obsv_chol = comp_pend_chol[:comp.shape[0], :comp.shape[0]]

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.cho_solve((obsv_chol, True), pend_cross)

            # Finding predictive means and variances.
            pend_m = np.dot(pend_cross.T, alpha) + self.mean
            pend_K = pend_kappa - np.dot(pend_cross.T, beta)

            # Take the Cholesky of the predictive covariance.
            pend_chol = spla.cholesky(pend_K, lower=True)

            # Make predictions.
            pend_fant = np.dot(
                pend_chol, npr.randn(pend.shape[0],
                                     self.pending_samples)) + pend_m[:, None]

            # Include the fantasies.
            fant_vals = np.concatenate(
                (np.tile(vals[:, np.newaxis],
                         (1, self.pending_samples)), pend_fant))

            # Compute bests over the fantasies.
            bests = np.min(fant_vals, axis=0)

            # Now generalize from these fantasies.
            cand_cross = self.cov(self.amp2, self.ls, comp_pend, cand)

            # Solve the linear systems.
            alpha = spla.cho_solve((comp_pend_chol, True),
                                   fant_vals - self.mean)
            beta = spla.solve_triangular(comp_pend_chol,
                                         cand_cross,
                                         lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v[:, np.newaxis])
            u = (bests[np.newaxis, :] - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            return np.divide(np.mean(ei, axis=1), func_time_m)

    def sample_hypers(self, comp, vals, durs):
        if self.noiseless:
            self.noise = 1e-3
            self._sample_noiseless(comp, vals)
        else:
            self._sample_noisy(comp, vals)
        self._sample_ls(comp, vals)

        self._sample_time_noisy(comp, durs.squeeze())
        self._sample_time_ls(comp, durs.squeeze())

        self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls))
        self.time_hyper_samples.append(
            (self.time_mean, self.time_noise, self.time_amp2, self.time_ls))

    def _sample_ls(self, comp, vals):
        def logprob(ls):
            if np.any(ls < 0) or np.any(ls > self.max_ls):
                return -np.inf

            cov = self.amp2 * (self.cov_func(ls, comp, None) + 1e-6 * np.eye(
                comp.shape[0])) + self.noise * np.eye(comp.shape[0])
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - self.mean)
            lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(
                vals - self.mean, solve)
            return lp

        self.ls = util.slice_sample(self.ls, logprob, compwise=True)

    def _sample_time_ls(self, comp, vals):
        def logprob(ls):
            if np.any(ls < 0) or np.any(ls > self.time_max_ls):
                return -np.inf

            cov = self.time_amp2 * (self.cov_func(ls, comp, None) +
                                    1e-6 * np.eye(comp.shape[0])
                                    ) + self.time_noise * np.eye(comp.shape[0])
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - self.time_mean)
            lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(
                vals - self.time_mean, solve)
            return lp

        self.time_ls = util.slice_sample(self.time_ls, logprob, compwise=True)

    def _sample_noisy(self, comp, vals):
        def logprob(hypers):
            mean = hypers[0]
            amp2 = hypers[1]
            noise = hypers[2]

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf

            if amp2 < 0 or noise < 0:
                return -np.inf

            cov = amp2 * (self.cov_func(self.ls, comp, None) + 1e-6 * np.eye(
                comp.shape[0])) + noise * np.eye(comp.shape[0])
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(
                vals - mean, solve)

            # Roll in noise horseshoe prior.
            lp += np.log(np.log(1 + (self.noise_scale / noise)**2))
            #lp -= 0.5*(np.log(noise)/self.noise_scale)**2
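            # The expression above is the usual analytic surrogate for the
            # horseshoe: p(noise) is roughly proportional to
            # log(1 + (noise_scale/noise)**2), taken here in log-space.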

            # Roll in amplitude lognormal prior
            lp -= 0.5 * (np.log(amp2) / self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2,
                                             self.noise]),
                                   logprob,
                                   compwise=False)
        self.mean = hypers[0]
        self.amp2 = hypers[1]
        self.noise = hypers[2]

    def _sample_time_noisy(self, comp, vals):
        def logprob(hypers):
            mean = hypers[0]
            amp2 = hypers[1]
            noise = hypers[2]

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf

            if amp2 < 0 or noise < 0:
                return -np.inf

            cov = amp2 * (self.cov_func(self.time_ls, comp, None) +
                          1e-6 * np.eye(comp.shape[0])) + noise * np.eye(
                              comp.shape[0])
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(
                vals - mean, solve)

            # Roll in noise horseshoe prior.
            lp += np.log(np.log(1 + (self.time_noise_scale / noise)**2))
            #lp -= 0.5*(np.log(noise)/self.time_noise_scale)**2

            # Roll in amplitude lognormal prior
            lp -= 0.5 * (np.log(np.sqrt(amp2)) / self.time_amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array(
            [self.time_mean, self.time_amp2, self.time_noise]),
                                   logprob,
                                   compwise=False)
        self.time_mean = hypers[0]
        self.time_amp2 = hypers[1]
        self.time_noise = hypers[2]

    def _sample_noiseless(self, comp, vals):
        def logprob(hypers):
            mean = hypers[0]
            amp2 = hypers[1]
            noise = 1e-3

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf

            if amp2 < 0:
                return -np.inf

            cov = amp2 * (self.cov_func(self.ls, comp, None) + 1e-6 * np.eye(
                comp.shape[0])) + noise * np.eye(comp.shape[0])
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(
                vals - mean, solve)

            # Roll in amplitude lognormal prior
            lp -= 0.5 * (np.log(amp2) / self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2,
                                             self.noise]),
                                   logprob,
                                   compwise=False)
        self.mean = hypers[0]
        self.amp2 = hypers[1]
        self.noise = 1e-3

    def optimize_hypers(self, comp, vals, durs):
        # First fit the GP to the objective observations.
        mygp = gp.GP(self.cov_func.__name__)
        mygp.real_init(comp.shape[1], vals)
        mygp.optimize_hypers(comp, vals)
        self.mean = mygp.mean
        self.ls = mygp.ls
        self.amp2 = mygp.amp2
        self.noise = mygp.noise

        # Now fit the GP to the durations.
        timegp = gp.GP(self.cov_func.__name__)
        timegp.real_init(comp.shape[1], durs)
        timegp.optimize_hypers(comp, durs)
        self.time_mean = timegp.mean
        self.time_amp2 = timegp.amp2
        self.time_noise = timegp.noise
        self.time_ls = timegp.ls

        # Save hyperparameter samples
        self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls))
        self.time_hyper_samples.append(
            (self.time_mean, self.time_noise, self.time_amp2, self.time_ls))
        self.dump_hypers()
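
The chooser above divides expected improvement by the predicted evaluation
time, so cheaper points win when their EI values are comparable. A minimal,
self-contained sketch of that acquisition, assuming the predictive
means/variances and predicted durations are already in hand (all names
below are illustrative, not taken from the code above):

import numpy as np
import scipy.stats as sps

def ei_per_second(best, func_m, func_v, time_m):
    # Closed-form EI for minimization, elementwise over candidates.
    func_s = np.sqrt(func_v)
    u = (best - func_m) / func_s
    ei = func_s * (u * sps.norm.cdf(u) + sps.norm.pdf(u))
    # Scale by the predicted cost; time_m must be positive.
    return ei / time_m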
Example No. 8
class GPEIOptChooser:

    def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10,
                 pending_samples=100, noiseless=False, burnin=100,
                 grid_subset=20, use_multiprocessing=True):
        self.cov_func        = getattr(gp, covar)
        self.locker          = Locker()
        self.state_pkl       = os.path.join(expt_dir, self.__module__ + ".pkl")
        self.stats_file      = os.path.join(expt_dir,
                                   self.__module__ + "_hyperparameters.txt")
        self.mcmc_iters      = int(mcmc_iters)
        self.burnin          = int(burnin)
        self.needs_burnin    = True
        self.pending_samples = int(pending_samples)
        self.D               = -1
        self.hyper_iters     = 1
        # Number of points to optimize EI over
        self.grid_subset     = int(grid_subset)
        self.noiseless       = bool(int(noiseless))
        self.hyper_samples = []

        self.noise_scale = 0.1  # horseshoe prior
        self.amp2_scale  = 1    # zero-mean log normal prior
        self.max_ls      = 2    # top-hat prior on length scales

        # If multiprocessing fails or deadlocks, set this to False
        self.use_multiprocessing = bool(int(use_multiprocessing))


    def dump_hypers(self):
        self.locker.lock_wait(self.state_pkl)

        # Write the hyperparameters out to a Pickle.
        fh = tempfile.NamedTemporaryFile(mode='w', delete=False)
        cPickle.dump({ 'dims'          : self.D,
                       'ls'            : self.ls,
                       'amp2'          : self.amp2,
                       'noise'         : self.noise,
                       'hyper_samples' : self.hyper_samples,
                       'mean'          : self.mean },
                     fh)
        fh.close()

        # Use an atomic move for better NFS happiness.
        cmd = 'mv "%s" "%s"' % (fh.name, self.state_pkl)
        os.system(cmd) # TODO: Should check system-dependent return status.
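        # (os.rename(fh.name, self.state_pkl) would perform the same move with
        #  a checked error, provided both paths are on the same filesystem.)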

        self.locker.unlock(self.state_pkl)

        # Write the hyperparameters out to a human readable file as well
        fh    = open(self.stats_file, 'w')
        fh.write('Mean Noise Amplitude <length scales>\n')
        fh.write('-----------ALL SAMPLES-------------\n')
        meanhyps = 0*np.hstack(self.hyper_samples[0])
        for i in self.hyper_samples:
            hyps = np.hstack(i)
            meanhyps += (1/float(len(self.hyper_samples)))*hyps
            for j in hyps:
                fh.write(str(j) + ' ')
            fh.write('\n')

        fh.write('-----------MEAN OF SAMPLES-------------\n')
        for j in meanhyps:
            fh.write(str(j) + ' ')
        fh.write('\n')
        fh.close()

    # This emits HTML/JavaScript to display interesting stats, such as
    # the length scales (sensitivity to the various dimensions).
    def generate_stats_html(self):
        # Need this because the model may not necessarily be
        # initialized when this code is called.
        if not self._read_only():
            return 'Chooser not yet ready to display output'

        mean_mean  = np.mean(np.vstack([h[0] for h in self.hyper_samples]))
        mean_noise = np.mean(np.vstack([h[1] for h in self.hyper_samples]))
        mean_ls    = np.mean(np.vstack([h[3][np.newaxis,:] for h in self.hyper_samples]),0)

        try:
            output = (
                '<br /><span class=\"label label-info\">Estimated mean:</span> ' + str(mean_mean) +
                '<br /><span class=\"label label-info\">Estimated noise:</span> ' + str(mean_noise) +
                '<br /><br /><span class=\"label label-info\">Inverse parameter sensitivity' +
                ' - Gaussian Process length scales</span><br /><br />' +
                '<div id=\"lschart\"></div><script type=\"text/javascript\">' +
                'var lsdata = [' + ','.join(['%.2f' % i for i in mean_ls]) + '];')
        except Exception:
            return 'Chooser not yet ready to display output.'

        output += ('bar_chart("#lschart", lsdata, ' + str(self.max_ls) + ');' +
                   '</script>')
        return output

    # Read in the chooser from file. Returns True only on success
    def _read_only(self):
        if os.path.exists(self.state_pkl):
            fh    = open(self.state_pkl, 'r')
            state = cPickle.load(fh)
            fh.close()

            self.D             = state['dims']
            self.ls            = state['ls']
            self.amp2          = state['amp2']
            self.noise         = state['noise']
            self.mean          = state['mean']
            self.hyper_samples = state['hyper_samples']
            self.needs_burnin  = False
            return True

        return False

    def _real_init(self, dims, values):
        self.locker.lock_wait(self.state_pkl)

        self.randomstate = npr.get_state()
        if os.path.exists(self.state_pkl):
            fh    = open(self.state_pkl, 'r')
            state = cPickle.load(fh)
            fh.close()

            self.D             = state['dims']
            self.ls            = state['ls']
            self.amp2          = state['amp2']
            self.noise         = state['noise']
            self.mean          = state['mean']
            self.hyper_samples = state['hyper_samples']
            self.needs_burnin  = False
        else:

            # Input dimensionality.
            self.D = dims

            # Initial length scales.
            self.ls = np.ones(self.D)

            # Initial amplitude.
            self.amp2 = np.std(values)+1e-4

            # Initial observation noise.
            self.noise = 1e-3

            # Initial mean.
            self.mean = np.mean(values)

            # Save hyperparameter samples
            self.hyper_samples.append((self.mean, self.noise, self.amp2,
                                       self.ls))

        self.locker.unlock(self.state_pkl)

    def cov(self, x1, x2=None):
        if x2 is None:
            return self.amp2 * (self.cov_func(self.ls, x1, None)
                               + 1e-6*np.eye(x1.shape[0]))
        else:
            return self.amp2 * self.cov_func(self.ls, x1, x2)
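    # The 1e-6 jitter added to the diagonal above keeps the kernel matrix
    # numerically positive definite when inputs (nearly) coincide.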

    # Given a set of completed 'experiments' in the unit hypercube with
    # corresponding objective 'values', pick the next experiment to run
    # according to the acquisition function.
    def next(self, grid, values, durations,
             candidates, pending, complete):

        # Don't bother using fancy GP stuff at first.
        if complete.shape[0] < 2:
            return int(candidates[0])

        # Perform the real initialization.
        if self.D == -1:
            self._real_init(grid.shape[1], values[complete])

        # Grab out the relevant sets.
        comp = grid[complete,:]
        cand = grid[candidates,:]
        pend = grid[pending,:]
        vals = values[complete]
        numcand = cand.shape[0]

        # Spray a set of candidates around the min so far
        best_comp = np.argmin(vals)
        cand2 = np.vstack((np.random.randn(10,comp.shape[1])*0.001 +
                           comp[best_comp,:], cand))

        if self.mcmc_iters > 0:

            # Possibly burn in.
            if self.needs_burnin:
                for mcmc_iter in xrange(self.burnin):
                    self.sample_hypers(comp, vals)
                    log("BURN %d/%d] mean: %.2f  amp: %.2f "
                                     "noise: %.4f  min_ls: %.4f  max_ls: %.4f"
                                     % (mcmc_iter+1, self.burnin, self.mean,
                                        np.sqrt(self.amp2), self.noise,
                                        np.min(self.ls), np.max(self.ls)))
                self.needs_burnin = False

            # Sample from hyperparameters.
            # Adjust the candidates to hit ei peaks
            self.hyper_samples = []
            for mcmc_iter in xrange(self.mcmc_iters):
                self.sample_hypers(comp, vals)
                log("%d/%d] mean: %.2f  amp: %.2f  noise: %.4f "
                                 "min_ls: %.4f  max_ls: %.4f"
                                 % (mcmc_iter+1, self.mcmc_iters, self.mean,
                                    np.sqrt(self.amp2), self.noise,
                                    np.min(self.ls), np.max(self.ls)))
            self.dump_hypers()

            b = []  # optimization bounds
            for i in xrange(0, cand.shape[1]):
                b.append((0, 1))

            overall_ei = self.ei_over_hypers(comp,pend,cand2,vals)
            inds = np.argsort(np.mean(overall_ei,axis=1))[-self.grid_subset:]
            cand2 = cand2[inds,:]

            # Optimize each point in parallel
            if self.use_multiprocessing:
                pool = multiprocessing.Pool(self.grid_subset)
                results = [pool.apply_async(optimize_pt,args=(
                            c,b,comp,pend,vals,copy.copy(self))) for c in cand2]
                for res in results:
                    cand = np.vstack((cand, res.get(1e8)))
                pool.close()
            else:
                # Fall back to optimizing each candidate serially.
                for i in xrange(0, cand2.shape[0]):
                    log("Optimizing candidate %d/%d" %
                        (i+1, cand2.shape[0]))
                    #self.check_grad_ei(cand2[i,:].flatten(), comp, pend, vals)
                    ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei_over_hypers,
                                            cand2[i,:].flatten(), args=(comp,pend,vals),
                                            bounds=b, disp=0)
                    cand2[i,:] = ret[0]
                cand = np.vstack((cand, cand2))

            overall_ei = self.ei_over_hypers(comp,pend,cand,vals)
            best_cand = np.argmax(np.mean(overall_ei, axis=1))

            if (best_cand >= numcand):
                return (int(numcand), cand[best_cand,:])

            return int(candidates[best_cand])

        else:
            # Optimize hyperparameters
            self.optimize_hypers(comp, vals)

            log("mean: %.2f  amp: %.2f  noise: %.4f  "
                             "min_ls: %.4f  max_ls: %.4f"
                             % (self.mean, np.sqrt(self.amp2), self.noise,
                                np.min(self.ls), np.max(self.ls)))

            # Optimize over EI
            b = []  # optimization bounds
            for i in xrange(0, cand.shape[1]):
                b.append((0, 1))

            for i in xrange(0, cand2.shape[0]):
                ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei,
                                        cand2[i,:].flatten(), args=(comp,vals,True),
                                        bounds=b, disp=0)
                cand2[i,:] = ret[0]
            cand = np.vstack((cand, cand2))

            ei = self.compute_ei(comp, pend, cand, vals)
            best_cand = np.argmax(ei)

            if (best_cand >= numcand):
                return (int(numcand), cand[best_cand,:])

            return int(candidates[best_cand])

    # Compute EI over hyperparameter samples
    def ei_over_hypers(self,comp,pend,cand,vals):
        overall_ei = np.zeros((cand.shape[0], self.mcmc_iters))
        for mcmc_iter in xrange(self.mcmc_iters):
            hyper = self.hyper_samples[mcmc_iter]
            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]
            overall_ei[:,mcmc_iter] = self.compute_ei(comp, pend, cand,
                                                      vals)
        return overall_ei

    def check_grad_ei(self, cand, comp, pend, vals):
        (ei,dx1) = self.grad_optimize_ei_over_hypers(cand, comp, pend, vals)
        dx2 = dx1*0
        idx = np.zeros(cand.shape[0])
        for i in xrange(0, cand.shape[0]):
            idx[i] = 1e-6
            (ei1,tmp) = self.grad_optimize_ei_over_hypers(cand + idx, comp, pend, vals)
            (ei2,tmp) = self.grad_optimize_ei_over_hypers(cand - idx, comp, pend, vals)
            dx2[i] = (ei1 - ei2)/(2*1e-6)
            idx[i] = 0
        print 'computed grads', dx1
        print 'finite diffs', dx2
        print (dx1/dx2)
        print np.sum((dx1 - dx2)**2)
        time.sleep(2)

    # Adjust points by optimizing EI over a set of hyperparameter samples
    def grad_optimize_ei_over_hypers(self, cand, comp, pend, vals, compute_grad=True):
        summed_ei = 0
        summed_grad_ei = np.zeros(cand.shape).flatten()
        ls = self.ls.copy()
        amp2 = self.amp2
        mean = self.mean
        noise = self.noise

        for hyper in self.hyper_samples:
            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]
            if compute_grad:
                (ei,g_ei) = self.grad_optimize_ei(cand,comp,pend,vals,compute_grad)
                summed_grad_ei = summed_grad_ei + g_ei
            else:
                ei = self.grad_optimize_ei(cand,comp,pend,vals,compute_grad)
            summed_ei += ei

        self.mean = mean
        self.amp2 = amp2
        self.noise = noise
        self.ls = ls.copy()

        if compute_grad:
            return (summed_ei, summed_grad_ei)
        else:
            return summed_ei

    # Adjust points based on optimizing their ei
    def grad_optimize_ei(self, cand, comp, pend, vals, compute_grad=True):
        if pend.shape[0] == 0:
            best = np.min(vals)
            cand = np.reshape(cand, (-1, comp.shape[1]))

            # The primary covariances for prediction.
            comp_cov   = self.cov(comp)
            cand_cross = self.cov(comp, cand)

            # Compute the required Cholesky.
            obsv_cov  = comp_cov + self.noise*np.eye(comp.shape[0])
            obsv_chol = spla.cholesky(obsv_cov, lower=True)

            cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__)
            cand_cross_grad = cov_grad_func(self.ls, comp, cand)

            # Predictive things.
            # Solve the linear systems.
            alpha  = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta   = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2*(1+1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v)
            u      = (best - func_m) / func_s
            ncdf   = sps.norm.cdf(u)
            npdf   = sps.norm.pdf(u)
            ei     = func_s*( u*ncdf + npdf)

            if not compute_grad:
                return ei

            # Gradients of ei w.r.t. mean and variance
            g_ei_m = -ncdf
            g_ei_s2 = 0.5*npdf / func_s
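            # These follow from EI = s*(u*Phi(u) + phi(u)) with u = (best-m)/s:
            # dEI/dm = -Phi(u) and dEI/d(s^2) = phi(u) / (2*s).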

            # Apply covariance function
            grad_cross = np.squeeze(cand_cross_grad)

            grad_xp_m = np.dot(alpha.transpose(),grad_cross)
            grad_xp_v = np.dot(-2*spla.cho_solve(
                    (obsv_chol, True),cand_cross).transpose(), grad_cross)

            grad_xp = 0.5*self.amp2*(grad_xp_m*g_ei_m + grad_xp_v*g_ei_s2)
            ei = -np.sum(ei)

            return ei, grad_xp.flatten()

        else:
            # If there are pending experiments, fantasize their outcomes.
            cand = np.reshape(cand, (-1, comp.shape[1]))

            # Create a composite vector of complete and pending.
            comp_pend = np.concatenate((comp, pend))

            # Compute the covariance and Cholesky decomposition.
            comp_pend_cov  = (self.cov(comp_pend) +
                              self.noise*np.eye(comp_pend.shape[0]))
            comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True)

            # Compute submatrices.
            pend_cross = self.cov(comp, pend)
            pend_kappa = self.cov(pend)

            # Use the sub-Cholesky.
            obsv_chol = comp_pend_chol[:comp.shape[0],:comp.shape[0]]

            # Solve the linear systems.
            alpha  = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta   = spla.cho_solve((obsv_chol, True), pend_cross)

            # Finding predictive means and variances.
            pend_m = np.dot(pend_cross.T, alpha) + self.mean
            pend_K = pend_kappa - np.dot(pend_cross.T, beta)

            # Take the Cholesky of the predictive covariance.
            pend_chol = spla.cholesky(pend_K, lower=True)

            # Make predictions.
            npr.set_state(self.randomstate)
            pend_fant = np.dot(pend_chol, npr.randn(pend.shape[0],self.pending_samples)) + pend_m[:,None]

            # Include the fantasies.
            fant_vals = np.concatenate(
                (np.tile(vals[:,np.newaxis],
                         (1,self.pending_samples)), pend_fant))

            # Compute bests over the fantasies.
            bests = np.min(fant_vals, axis=0)

            # Now generalize from these fantasies.
            cand_cross = self.cov(comp_pend, cand)
            cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__)
            cand_cross_grad = cov_grad_func(self.ls, comp_pend, cand)

            # Solve the linear systems.
            alpha  = spla.cho_solve((comp_pend_chol, True),
                                    fant_vals - self.mean)
            beta   = spla.solve_triangular(comp_pend_chol, cand_cross,
                                           lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2*(1+1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v[:,np.newaxis])
            u      = (bests[np.newaxis,:] - func_m) / func_s
            ncdf   = sps.norm.cdf(u)
            npdf   = sps.norm.pdf(u)
            ei     = func_s*( u*ncdf + npdf)

            # Gradients of ei w.r.t. mean and variance
            g_ei_m = -ncdf
            g_ei_s2 = 0.5*npdf / func_s

            # Apply covariance function
            grad_cross = np.squeeze(cand_cross_grad)

            grad_xp_m = np.dot(alpha.transpose(),grad_cross)
            grad_xp_v = np.dot(-2*spla.cho_solve(
                    (comp_pend_chol, True),cand_cross).transpose(), grad_cross)

            grad_xp = 0.5*self.amp2*(grad_xp_m*np.tile(g_ei_m,(comp.shape[1],1)).T + (grad_xp_v.T*g_ei_s2).T)
            ei = -np.mean(ei, axis=1)
            grad_xp = np.mean(grad_xp,axis=0)

            return ei, grad_xp.flatten()

    def compute_ei(self, comp, pend, cand, vals):
        if pend.shape[0] == 0:
            # If there are no pending, don't do anything fancy.

            # Current best.
            best = np.min(vals)

            # The primary covariances for prediction.
            comp_cov   = self.cov(comp)
            cand_cross = self.cov(comp, cand)

            # Compute the required Cholesky.
            obsv_cov  = comp_cov + self.noise*np.eye(comp.shape[0])
            obsv_chol = spla.cholesky( obsv_cov, lower=True )

            # Solve the linear systems.
            alpha  = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta   = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2*(1+1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v)
            u      = (best - func_m) / func_s
            ncdf   = sps.norm.cdf(u)
            npdf   = sps.norm.pdf(u)
            ei     = func_s*( u*ncdf + npdf)

            return ei
        else:
            # If there are pending experiments, fantasize their outcomes.

            # Create a composite vector of complete and pending.
            comp_pend = np.concatenate((comp, pend))

            # Compute the covariance and Cholesky decomposition.
            comp_pend_cov  = (self.cov(comp_pend) +
                              self.noise*np.eye(comp_pend.shape[0]))
            comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True)

            # Compute submatrices.
            pend_cross = self.cov(comp, pend)
            pend_kappa = self.cov(pend)

            # Use the sub-Cholesky.
            obsv_chol = comp_pend_chol[:comp.shape[0],:comp.shape[0]]

            # Solve the linear systems.
            alpha  = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta   = spla.cho_solve((obsv_chol, True), pend_cross)

            # Finding predictive means and variances.
            pend_m = np.dot(pend_cross.T, alpha) + self.mean
            pend_K = pend_kappa - np.dot(pend_cross.T, beta)

            # Take the Cholesky of the predictive covariance.
            pend_chol = spla.cholesky(pend_K, lower=True)

            # Make predictions.
            npr.set_state(self.randomstate)
            pend_fant = np.dot(pend_chol, npr.randn(pend.shape[0],self.pending_samples)) + pend_m[:,None]

            # Include the fantasies.
            fant_vals = np.concatenate(
                (np.tile(vals[:,np.newaxis],
                         (1,self.pending_samples)), pend_fant))

            # Compute bests over the fantasies.
            bests = np.min(fant_vals, axis=0)

            # Now generalize from these fantasies.
            cand_cross = self.cov(comp_pend, cand)

            # Solve the linear systems.
            alpha  = spla.cho_solve((comp_pend_chol, True),
                                    fant_vals - self.mean)
            beta   = spla.solve_triangular(comp_pend_chol, cand_cross,
                                           lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2*(1+1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v[:,np.newaxis])
            u      = (bests[np.newaxis,:] - func_m) / func_s
            ncdf   = sps.norm.cdf(u)
            npdf   = sps.norm.pdf(u)
            ei     = func_s*( u*ncdf + npdf)

            return np.mean(ei, axis=1)

    def sample_hypers(self, comp, vals):
        if self.noiseless:
            self.noise = 1e-3
            self._sample_noiseless(comp, vals)
        else:
            self._sample_noisy(comp, vals)
        self._sample_ls(comp, vals)
        self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls))

    def _sample_ls(self, comp, vals):
        def logprob(ls):
            if np.any(ls < 0) or np.any(ls > self.max_ls):
                return -np.inf

            cov   = (self.amp2 * (self.cov_func(ls, comp, None) +
                1e-6*np.eye(comp.shape[0])) + self.noise*np.eye(comp.shape[0]))
            chol  = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - self.mean)
            lp    = (-np.sum(np.log(np.diag(chol))) -
                      0.5*np.dot(vals-self.mean, solve))
            return lp

        self.ls = util.slice_sample(self.ls, logprob, compwise=True)

    def _sample_noisy(self, comp, vals):
        def logprob(hypers):
            mean  = hypers[0]
            amp2  = hypers[1]
            noise = hypers[2]

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf

            if amp2 < 0 or noise < 0:
                return -np.inf

            cov   = (amp2 * (self.cov_func(self.ls, comp, None) +
                1e-6*np.eye(comp.shape[0])) + noise*np.eye(comp.shape[0]))
            chol  = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp    = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(vals-mean, solve)

            # Roll in noise horseshoe prior.
            lp += np.log(np.log(1 + (self.noise_scale/noise)**2))

            # Roll in amplitude lognormal prior
            lp -= 0.5*(np.log(np.sqrt(amp2))/self.amp2_scale)**2
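            # (A zero-mean Gaussian prior on log(sqrt(amp2)), i.e. on the
            #  log-amplitude; other samplers in this file use log(amp2).)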

            return lp

        hypers = util.slice_sample(np.array(
                [self.mean, self.amp2, self.noise]), logprob, compwise=False)
        self.mean  = hypers[0]
        self.amp2  = hypers[1]
        self.noise = hypers[2]

    def _sample_noiseless(self, comp, vals):
        def logprob(hypers):
            mean  = hypers[0]
            amp2  = hypers[1]
            noise = 1e-3

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf

            if amp2 < 0:
                return -np.inf

            cov   = (amp2 * (self.cov_func(self.ls, comp, None) +
                1e-6*np.eye(comp.shape[0])) + noise*np.eye(comp.shape[0]))
            chol  = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp    = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(vals-mean, solve)

            # Roll in amplitude lognormal prior
            lp -= 0.5*(np.log(np.sqrt(amp2))/self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array(
                [self.mean, self.amp2, self.noise]), logprob, compwise=False)
        self.mean  = hypers[0]
        self.amp2  = hypers[1]
        self.noise = 1e-3

    def optimize_hypers(self, comp, vals):
        mygp = gp.GP(self.cov_func.__name__)
        mygp.real_init(comp.shape[1], vals)
        mygp.optimize_hypers(comp,vals)
        self.mean = mygp.mean
        self.ls = mygp.ls
        self.amp2 = mygp.amp2
        self.noise = mygp.noise

        # Save hyperparameter samples
        self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls))
        self.dump_hypers()

        return
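
Example No. 8 integrates out the GP hyperparameters by slice sampling and
then ranks candidates by EI averaged over the samples. A compact sketch of
that marginalization step, where ei_fn is a hypothetical stand-in for
compute_ei with a single hyperparameter setting applied:

import numpy as np

def marginal_ei(hyper_samples, ei_fn, cand):
    # hyper_samples: list of (mean, noise, amp2, ls) tuples from MCMC.
    total = np.zeros(cand.shape[0])
    for mean, noise, amp2, ls in hyper_samples:
        total += ei_fn(cand, mean, noise, amp2, ls)
    return total / len(hyper_samples)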
Example No. 9
class GPEIChooser:

    def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10,
                 pending_samples=100, noiseless=False):
        self.cov_func        = getattr(gp, covar)
        self.locker          = Locker()
        self.state_pkl       = os.path.join(expt_dir, self.__module__ + ".pkl")

        self.mcmc_iters      = int(mcmc_iters)
        self.pending_samples = pending_samples
        self.D               = -1
        self.hyper_iters     = 1
        self.noiseless       = bool(int(noiseless))

        self.noise_scale = 0.1  # horseshoe prior
        self.amp2_scale  = 1    # zero-mean log normal prior
        self.max_ls      = 2    # top-hat prior on length scales

    def __del__(self):
        self.locker.lock_wait(self.state_pkl)

        # Write the hyperparameters out to a Pickle.
        fh = tempfile.NamedTemporaryFile(mode='wb', delete=False)
        cPickle.dump({ 'dims'   : self.D,
                       'ls'     : self.ls,
                       'amp2'   : self.amp2,
                       'noise'  : self.noise,
                       'mean'   : self.mean },
                     fh)
        fh.close()

        # Use an atomic move for better NFS happiness.
        cmd = 'mv "%s" "%s"' % (fh.name, self.state_pkl)
        os.system(cmd) # TODO: Should check system-dependent return status.

        self.locker.unlock(self.state_pkl)

    def _real_init(self, dims, values):
        self.locker.lock_wait(self.state_pkl)

        if os.path.exists(self.state_pkl):
            fh    = open(self.state_pkl, 'rb')
            state = cPickle.load(fh)
            fh.close()

            self.D     = state['dims']
            self.ls    = state['ls']
            self.amp2  = state['amp2']
            self.noise = state['noise']
            self.mean  = state['mean']
        else:

            # Input dimensionality.
            self.D = dims

            # Initial length scales.
            self.ls = np.ones(self.D)

            # Initial amplitude.
            self.amp2 = np.std(values)+1e-4

            # Initial observation noise.
            self.noise = 1e-3

            # Initial mean.
            self.mean = np.mean(values)

        self.locker.unlock(self.state_pkl)

    def cov(self, x1, x2=None):
        if x2 is None:
            return self.amp2 * (self.cov_func(self.ls, x1, None)
                               + 1e-6*np.eye(x1.shape[0]))
        else:
            return self.amp2 * self.cov_func(self.ls, x1, x2)

    def next(self, grid, values, durations, candidates, pending, complete):

        # Don't bother using fancy GP stuff at first.
        if complete.shape[0] < 2:
            return int(candidates[0])

        # Perform the real initialization.
        if self.D == -1:
            self._real_init(grid.shape[1], values[complete])

        # Grab out the relevant sets.
        comp = grid[complete,:]
        cand = grid[candidates,:]
        pend = grid[pending,:]
        vals = values[complete]

        if self.mcmc_iters > 0:
            # Sample from hyperparameters.

            overall_ei = np.zeros((cand.shape[0], self.mcmc_iters))

            for mcmc_iter in range(self.mcmc_iters):

                self.sample_hypers(comp, vals)
                log("mean: %f  amp: %f  noise: %f  min_ls: %f  max_ls: %f"
                                 % (self.mean, np.sqrt(self.amp2), self.noise, np.min(self.ls), np.max(self.ls)))

                overall_ei[:,mcmc_iter] = self.compute_ei(comp, pend, cand, vals)

            best_cand = np.argmax(np.mean(overall_ei, axis=1))
            log(f"Max mean EI: {max(np.mean(overall_ei, axis=1))}")

            return int(candidates[best_cand])

        else:
            # Optimize hyperparameters
            try:
                self.optimize_hypers(comp, vals)
            except Exception:
                # Initial length scales.
                self.ls = np.ones(self.D)
                # Initial amplitude.
                self.amp2 = np.std(vals)
                # Initial observation noise.
                self.noise = 1e-3
            log("mean: %f  amp: %f  noise: %f  min_ls: %f  max_ls: %f"
                             % (self.mean, np.sqrt(self.amp2), self.noise, np.min(self.ls),
                                np.max(self.ls)))

            ei = self.compute_ei(comp, pend, cand, vals)

            log(f"Max EI: {max(ei)}")

            best_cand = np.argmax(ei)

            return int(candidates[best_cand])

    def compute_ei(self, comp, pend, cand, vals):
        if pend.shape[0] == 0:
            # If there are no pending, don't do anything fancy.

            # Current best.
            best = np.min(vals)

            # The primary covariances for prediction.
            comp_cov   = self.cov(comp)
            cand_cross = self.cov(comp, cand)

            # Compute the required Cholesky.
            obsv_cov  = comp_cov + self.noise*np.eye(comp.shape[0])
            obsv_chol = spla.cholesky( obsv_cov, lower=True )

            # Solve the linear systems.
            alpha  = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta   = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2*(1+1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v)
            u      = (best - func_m) / func_s
            ncdf   = sps.norm.cdf(u)
            npdf   = sps.norm.pdf(u)
            ei     = func_s*( u*ncdf + npdf)

            return ei
        else:
            # If there are pending experiments, fantasize their outcomes.

            # Create a composite vector of complete and pending.
            comp_pend = np.concatenate((comp, pend))

            # Compute the covariance and Cholesky decomposition.
            comp_pend_cov  = self.cov(comp_pend) + self.noise*np.eye(comp_pend.shape[0])
            comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True)

            # Compute submatrices.
            pend_cross = self.cov(comp, pend)
            pend_kappa = self.cov(pend)

            # Use the sub-Cholesky.
            obsv_chol = comp_pend_chol[:comp.shape[0],:comp.shape[0]]

            # Solve the linear systems.
            alpha  = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta   = spla.cho_solve((obsv_chol, True), pend_cross)

            # Finding predictive means and variances.
            pend_m = np.dot(pend_cross.T, alpha) + self.mean
            pend_K = pend_kappa - np.dot(pend_cross.T, beta)

            # Take the Cholesky of the predictive covariance.
            pend_chol = spla.cholesky(pend_K, lower=True)

            # Make predictions.
            pend_fant = (np.dot(pend_chol, npr.randn(pend.shape[0],self.pending_samples))
                         + pend_m[:,None])

            # Include the fantasies.
            fant_vals = np.concatenate((np.tile(vals[:,np.newaxis],
                                                (1,self.pending_samples)), pend_fant))

            # Compute bests over the fantasies.
            bests = np.min(fant_vals, axis=0)

            # Now generalize from these fantasies.
            cand_cross = self.cov(comp_pend, cand)

            # Solve the linear systems.
            alpha  = spla.cho_solve((comp_pend_chol, True), fant_vals - self.mean)
            beta   = spla.solve_triangular(comp_pend_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2*(1+1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v[:,np.newaxis])
            u      = (bests[np.newaxis,:] - func_m) / func_s
            ncdf   = sps.norm.cdf(u)
            npdf   = sps.norm.pdf(u)
            ei     = func_s*( u*ncdf + npdf)

            return np.mean(ei, axis=1)

    def sample_hypers(self, comp, vals):
        if self.noiseless:
            self.noise = 1e-3
            self._sample_noiseless(comp, vals)
        else:
            self._sample_noisy(comp, vals)
        self._sample_ls(comp, vals)

    def _sample_ls(self, comp, vals):
        def logprob(ls):
            if np.any(ls < 0) or np.any(ls > self.max_ls):
                return -np.inf

            cov   = self.amp2 * (self.cov_func(ls, comp, None) + 1e-6*np.eye(comp.shape[0])) + self.noise*np.eye(comp.shape[0])
            chol  = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - self.mean)
            lp    = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(vals-self.mean, solve)
            return lp

        self.ls = util.slice_sample(self.ls, logprob, compwise=True)

    def _sample_noisy(self, comp, vals):
        def logprob(hypers):
            mean  = hypers[0]
            amp2  = hypers[1]
            noise = hypers[2]

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf

            if amp2 < 0 or noise < 0:
                return -np.inf

            cov   = amp2 * (self.cov_func(self.ls, comp, None) +
                            1e-6*np.eye(comp.shape[0])) + noise*np.eye(comp.shape[0])
            chol  = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp    = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(vals-mean, solve)

            # Roll in noise horseshoe prior.
            lp += np.log(np.log(1 + (self.noise_scale/noise)**2))

            # Roll in amplitude lognormal prior
            lp -= 0.5*(np.log(amp2)/self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2, self.noise]),
                                   logprob, compwise=False)
        self.mean  = hypers[0]
        self.amp2  = hypers[1]
        self.noise = hypers[2]

    def _sample_noiseless(self, comp, vals):
        def logprob(hypers):
            mean  = hypers[0]
            amp2  = hypers[1]
            noise = 1e-3

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf

            if amp2 < 0:
                return -np.inf

            cov   = amp2 * (self.cov_func(self.ls, comp, None) +
                            1e-6*np.eye(comp.shape[0])) + noise*np.eye(comp.shape[0])
            chol  = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp    = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(vals-mean, solve)

            # Roll in amplitude lognormal prior
            lp -= 0.5*(np.log(amp2)/self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2, self.noise]), logprob,
                                   compwise=False)
        self.mean  = hypers[0]
        self.amp2  = hypers[1]
        self.noise = 1e-3

    def optimize_hypers(self, comp, vals):
        mygp = gp.GP(self.cov_func.__name__)
        mygp.real_init(comp.shape[1], vals)
        mygp.optimize_hypers(comp,vals)
        self.mean = mygp.mean
        self.ls = mygp.ls
        self.amp2 = mygp.amp2
        self.noise = mygp.noise

        # Save hyperparameter samples
        #self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls))
        #self.dump_hypers()

        return
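
Both choosers handle pending jobs by drawing "fantasy" outcomes from the GP
posterior at the pending points and averaging EI over those draws. A
self-contained sketch of the draw itself, assuming the predictive mean
vector pend_m and covariance matrix pend_K have been computed (the jitter
argument is an illustrative safeguard, not part of the code above):

import numpy as np
import numpy.random as npr
import scipy.linalg as spla

def fantasize(pend_m, pend_K, n_fant, jitter=1e-8):
    # One joint posterior draw per column, via the Cholesky factor of pend_K.
    chol = spla.cholesky(pend_K + jitter * np.eye(pend_K.shape[0]), lower=True)
    return np.dot(chol, npr.randn(pend_K.shape[0], n_fant)) + pend_m[:, None]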