Example No. 1
def select_next_points_botorch(observed_X: List[List[float]],
                               observed_y: List[float]) -> np.ndarray:
    """Generate the next sample to evaluate with XTB

    Uses BoTorch to pick the next sample using an upper confidence bound acquisition

    Args:
        observed_X: Observed coordinates
        observed_y: Observed energies
    Returns:
        Next coordinates to try
    """

    # Clip the energies if needed
    observed_y = np.clip(observed_y, -np.inf,
                         2 + np.log10(np.clip(observed_y, 1, np.inf)))

    # Convert inputs to torch arrays
    train_X = torch.tensor(observed_X, dtype=torch.float)
    train_y = torch.tensor(observed_y, dtype=torch.float)
    train_y = train_y[:, None]
    train_y = standardize(-1 * train_y)

    # Make the GP
    gp = SingleTaskGP(train_X,
                      train_y,
                      covar_module=gpykernels.ScaleKernel(
                          gpykernels.ProductStructureKernel(
                              num_dims=train_X.shape[1],
                              base_kernel=gpykernels.PeriodicKernel(
                                  period_length_prior=NormalPrior(360, 0.1)))))
    mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
    fit_gpytorch_model(mll)

    # Solve the optimization problem
    #  Following BOSS, we use Eq. 5 of https://arxiv.org/pdf/1012.2599.pdf with delta=0.1
    n_sampled, n_dim = train_X.shape
    kappa = np.sqrt(
        2 *
        np.log10(np.power(n_sampled, n_dim / 2 + 2) * np.pi**2 /
                 (3.0 * 0.1)))  # Results in more exploration over time
    ucb = UpperConfidenceBound(gp, kappa)
    bounds = torch.zeros(2, train_X.shape[1])
    bounds[1, :] = 360
    candidate, acq_value = optimize_acqf(ucb,
                                         bounds=bounds,
                                         q=1,
                                         num_restarts=64,
                                         raw_samples=64)
    return candidate.detach().numpy()[0, :]
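A minimal usage sketch, assuming the function above and its imports are in scope; the dihedral angles and energies below are made-up placeholders, not XTB output:

import numpy as np

# Three previously evaluated points in a 2-D dihedral-angle space (degrees)
observed_X = [[10.0, 120.0], [200.0, 45.0], [330.0, 270.0]]
observed_y = [0.0, 1.5, 0.7]  # placeholder relative energies

next_point = select_next_points_botorch(observed_X, observed_y)
print(next_point)  # one candidate with a coordinate in [0, 360] for each dimension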
Example No. 2
    def initialize_model(self, train_X, train_Y, state_dict=None):
        """Initialise model for BO."""
        # From: https://github.com/pytorch/botorch/issues/179
        noise_prior = GammaPrior(1.1, 0.05)
        noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate
        MIN_INFERRED_NOISE_LEVEL = 1e-3
        likelihood = GaussianLikelihood(
            noise_prior=noise_prior,
            noise_constraint=GreaterThan(
                MIN_INFERRED_NOISE_LEVEL,
                transform=None,
                initial_value=noise_prior_mode,
            ),
        )

        # train_x = self.scale_to_0_1_bounds(train_X)
        train_Y = standardize(train_Y)
        gp = SingleTaskGP(train_X, train_Y, likelihood=likelihood)
        mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
        # load state dict if it is passed
        if state_dict is not None:
            gp.load_state_dict(state_dict)
        return mll, gp
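A minimal usage sketch (not part of the original class): fit the returned GP, then build and optimize an Expected Improvement acquisition over the unit cube. Here `bo` stands for an instance of the surrounding class, and the toy objective is purely illustrative:

import torch
from botorch.fit import fit_gpytorch_model
from botorch.utils import standardize
from botorch.acquisition import ExpectedImprovement
from botorch.optim import optimize_acqf

train_X = torch.rand(20, 3, dtype=torch.double)
train_Y = -(train_X - 0.5).pow(2).sum(dim=-1, keepdim=True)  # toy objective

mll, gp = bo.initialize_model(train_X, train_Y)
fit_gpytorch_model(mll)

# best_f is taken on the standardized scale, matching the targets used inside the model
EI = ExpectedImprovement(gp, best_f=standardize(train_Y).max())
bounds = torch.stack([torch.zeros(3), torch.ones(3)]).to(train_X)
candidate, _ = optimize_acqf(EI, bounds=bounds, q=1, num_restarts=10, raw_samples=256)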
Example No. 3
def main(
        benchmark_name,
        dataset_name,
        dimensions,
        method_name,
        num_runs,
        run_start,
        num_iterations,
        acquisition_name,
        # acquisition_optimizer_name,
        gamma,
        num_random_init,
        mc_samples,
        batch_size,
        num_fantasies,
        num_restarts,
        raw_samples,
        noise_variance_init,
        # use_ard,
        # use_input_warping,
        standardize_targets,
        input_dir,
        output_dir):

    # TODO(LT): Turn into options
    # device = "cpu"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.double

    benchmark = make_benchmark(benchmark_name,
                               dimensions=dimensions,
                               dataset_name=dataset_name,
                               input_dir=input_dir)
    name = make_name(benchmark_name,
                     dimensions=dimensions,
                     dataset_name=dataset_name)

    output_path = Path(output_dir).joinpath(name, method_name)
    output_path.mkdir(parents=True, exist_ok=True)

    options = dict(gamma=gamma,
                   num_random_init=num_random_init,
                   acquisition_name=acquisition_name,
                   mc_samples=mc_samples,
                   batch_size=batch_size,
                   num_restarts=num_restarts,
                   raw_samples=raw_samples,
                   num_fantasies=num_fantasies,
                   noise_variance_init=noise_variance_init,
                   standardize_targets=standardize_targets)
    with output_path.joinpath("options.yaml").open('w') as f:
        yaml.dump(options, f)

    config_space = DenseConfigurationSpace(benchmark.get_config_space())
    bounds = create_bounds(config_space.get_bounds(),
                           device=device,
                           dtype=dtype)
    input_dim = config_space.get_dimensions()

    def func(tensor, *args, **kwargs):
        """
        Wrapper that receives and returns torch.Tensor
        """
        config = dict_from_tensor(tensor, cs=config_space)
        # turn into maximization problem
        res = -benchmark.evaluate(config).value
        return torch.tensor(res, device=device, dtype=dtype)

    for run_id in trange(run_start, num_runs, unit="run"):

        run_begin_t = batch_end_t_adj = batch_end_t = datetime.now()

        frames = []

        features = []
        targets = []

        noise_variance = torch.tensor(noise_variance_init,
                                      device=device,
                                      dtype=dtype)
        state_dict = None

        with trange(num_iterations) as iterations:

            for batch in iterations:

                if len(targets) < num_random_init:
                    # click.echo(f"Completed {i}/{num_random_init} initial runs. "
                    #            "Suggesting random candidate...")
                    # TODO(LT): support random seed
                    X_batch = torch.rand(size=(batch_size, input_dim),
                                         device=device,
                                         dtype=dtype)
                else:

                    # construct dataset
                    X = torch.vstack(features)
                    y = torch.hstack(targets).unsqueeze(axis=-1)
                    y = standardize(y) if standardize_targets else y

                    # construct model
                    # model = FixedNoiseGP(X, standardize(y), noise_variance.expand_as(y),
                    model = FixedNoiseGP(X,
                                         y,
                                         noise_variance.expand_as(y),
                                         input_transform=None).to(X)
                    mll = ExactMarginalLogLikelihood(model.likelihood, model)

                    if state_dict is not None:
                        model.load_state_dict(state_dict)

                    # update model
                    fit_gpytorch_model(mll)

                    # construct acquisition function
                    tau = torch.quantile(y, q=1 - gamma)
                    iterations.set_postfix(tau=tau.item())

                    if acquisition_name == "q-KG":
                        assert num_fantasies is not None and num_fantasies > 0
                        acq = qKnowledgeGradient(model,
                                                 num_fantasies=num_fantasies)
                    elif acquisition_name == "q-EI":
                        assert mc_samples is not None and mc_samples > 0
                        qmc_sampler = SobolQMCNormalSampler(
                            num_samples=mc_samples)
                        acq = qExpectedImprovement(model=model,
                                                   best_f=tau,
                                                   sampler=qmc_sampler)

                    # optimize acquisition function
                    X_batch, b = optimize_acqf(acq_function=acq,
                                               bounds=bounds,
                                               q=batch_size,
                                               num_restarts=num_restarts,
                                               raw_samples=raw_samples,
                                               options=dict(batch_limit=5,
                                                            maxiter=200))

                    state_dict = model.state_dict()

                # begin batch evaluation
                batch_begin_t = datetime.now()
                decision_duration = batch_begin_t - batch_end_t
                batch_begin_t_adj = batch_end_t_adj + decision_duration

                eval_end_times = []

                # TODO(LT): Deliberately not doing broadcasting for now since
                # batch sizes are so small anyway. Can revisit later if there
                # is a compelling reason to do it.
                rows = []
                for j, x_next in enumerate(X_batch):

                    # eval begin time
                    eval_begin_t = datetime.now()

                    # evaluate blackbox objective
                    y_next = func(x_next)

                    # eval end time
                    eval_end_t = datetime.now()

                    # eval duration
                    eval_duration = eval_end_t - eval_begin_t

                    # adjusted eval end time is the duration added to the
                    # time at which batch eval was started
                    eval_end_t_adj = batch_begin_t_adj + eval_duration

                    eval_end_times.append(eval_end_t_adj)
                    elapsed = eval_end_t_adj - run_begin_t

                    # update dataset
                    features.append(x_next)
                    targets.append(y_next)

                    row = dict_from_tensor(x_next, cs=config_space)
                    row["loss"] = -y_next.item()
                    row["cost_eval"] = eval_duration.total_seconds()
                    row["finished"] = elapsed.total_seconds()
                    rows.append(row)

                batch_end_t = datetime.now()
                batch_end_t_adj = max(eval_end_times)

                frame = pd.DataFrame(data=rows) \
                          .assign(batch=batch,
                                  cost_decision=decision_duration.total_seconds())
                frames.append(frame)

        data = pd.concat(frames, axis="index", ignore_index=True)
        data.to_csv(output_path.joinpath(f"{run_id:03d}.csv"))

    return 0
Example No. 4
 def generate_outer_restart_points(self,
                                   acqf: ApxCVaRKG,
                                   w_samples: Tensor = None) -> Tensor:
     """
     Generates the restart points for acqf optimization.
     :param acqf: The acquisition function being optimized
     :param w_samples: the list of w samples to use
     :return: restart points
     """
     X = draw_constrained_sobol(
         bounds=self.outer_bounds,
         n=self.raw_samples,
         q=self.q,
         inequality_constraints=self.inequality_constraints,
     ).to(dtype=self.dtype, device=self.device)
     # get the optimizers of the inner problem
     if w_samples is None:
         w_samples = (acqf.fixed_samples if acqf.fixed_samples is not None
                      else torch.rand(acqf.num_samples,
                                      acqf.dim_w,
                                      dtype=self.dtype,
                                      device=self.device))
     inner_rho = InnerRho(
         model=acqf.model,
         w_samples=w_samples,
         alpha=acqf.alpha,
         dim_x=acqf.dim_x,
         num_repetitions=acqf.num_repetitions,
         inner_seed=acqf.inner_seed,
         CVaR=acqf.CVaR,
         expectation=acqf.expectation,
         weights=getattr(acqf, "weights", None),
     )
     inner_solutions, inner_values = super().optimize_inner(
         inner_rho, False)
     # sample from the optimizers
     n_value = int((1 - self.random_frac) * self.num_fantasies)
     weights = torch.exp(self.eta * standardize(inner_values))
     idx = torch.multinomial(weights,
                             self.raw_samples * n_value,
                             replacement=True)
     # set the respective raw samples to the sampled optimizers
     # we first get the corresponding beta values and merge them with sampled
     # optimizers. this avoids the need for complicated indexing
     betas = X[...,
               self.beta_idcs][...,
                               -n_value:].reshape(self.raw_samples, -1, 1)
     X[..., -n_value * (self.dim_x + 1):] = torch.cat(
         [
             inner_solutions[idx, 0].reshape(self.raw_samples, n_value,
                                             self.dim_x),
             betas,
         ],
         dim=-1,
     ).view(self.raw_samples, 1, -1)
     if w_samples is not None:
         w_ind = torch.randint(w_samples.shape[0],
                               (self.raw_samples, self.q))
         if self.q > 1:
             raise NotImplementedError("This does not support q>1!")
         X[..., self.dim_x:self.dim] = w_samples[w_ind, :]
     return self.generate_restart_points_from_samples(X, acqf)
Example No. 5
    def sample_arch(self, START_BO, g, steps, hyperparams, og_flops, full_val_loss, target_flops=0):
        if args.slim:
            if target_flops == 0:
                parameterization = hyperparams.random_sample()
                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
            else:
                parameterization = np.ones(hyperparams.get_dim()) * args.lower_channel
                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        else:
            # Random sample to warm up the history for MOBO
            if g < START_BO:
                if target_flops == 0:
                    f = np.random.rand(1) * (args.upper_channel-args.lower_channel) + args.lower_channel
                else:
                    f = args.lower_channel
                parameterization = np.ones(hyperparams.get_dim()) * f
                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
            # put the largest model into the history
            elif g == START_BO:
                if target_flops == 0:
                    parameterization = np.ones(hyperparams.get_dim())
                else:
                    f = args.lower_channel
                    parameterization = np.ones(hyperparams.get_dim()) * f
                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
            # MOBO
            else:
                # this is the scalarization (lambda_{FLOPs})
                rand = torch.rand(1).cuda()

                # standardize data for building Gaussian Processes
                train_X = torch.FloatTensor(self.X).cuda()
                train_Y_loss = torch.FloatTensor(np.array(self.Y)[:, 0].reshape(-1, 1)).cuda()
                train_Y_loss = standardize(train_Y_loss)

                train_Y_cost = torch.FloatTensor(np.array(self.Y)[:, 1].reshape(-1, 1)).cuda()
                train_Y_cost = standardize(train_Y_cost)

                new_train_X = train_X
                # GP for the cross entropy loss
                gp_loss = SingleTaskGP(new_train_X, train_Y_loss)
                mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss)
                mll = mll.to('cuda')
                fit_gpytorch_model(mll)


                # GP for FLOPs
                # We use an additive GP since FLOPs have (approximately) additive structure;
                # the ScaleKernel and MaternKernel parameters simply follow the defaults.
                covar_module = AdditiveStructureKernel(
                    ScaleKernel(
                        MaternKernel(
                            nu=2.5,
                            lengthscale_prior=GammaPrior(3.0, 6.0),
                            num_dims=1
                        ),
                        outputscale_prior=GammaPrior(2.0, 0.15),
                    ),
                    num_dims=train_X.shape[1]
                )
                gp_cost = SingleTaskGP(new_train_X, train_Y_cost, covar_module=covar_module)
                mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost)
                mll = mll.to('cuda')
                fit_gpytorch_model(mll)

                # Build acquisition functions
                UCB_loss = UpperConfidenceBound(gp_loss, beta=0.1).cuda()
                UCB_cost = UpperConfidenceBound(gp_cost, beta=0.1).cuda()

                # Combine them via augmented Tchebyshev scalarization
                self.mobo_obj = RandAcquisition(UCB_loss).cuda()
                self.mobo_obj.setup(UCB_loss, UCB_cost, rand)

                # Bounds for the optimization variable (alpha)
                lower = torch.ones(new_train_X.shape[1])*args.lower_channel
                upper = torch.ones(new_train_X.shape[1])*args.upper_channel
                self.mobo_bounds = torch.stack([lower, upper]).cuda()

                # Pareto-aware sampling
                if args.pas:
                    # Generate approximate Pareto front first
                    costs = []
                    for i in range(len(self.population_data)):
                        costs.append([self.population_data[i]['loss'], self.population_data[i]['ratio']])
                    costs = np.array(costs)
                    efficient_mask = is_pareto_efficient(costs)
                    costs = costs[efficient_mask]
                    loss = costs[:, 0]
                    flops = costs[:, 1]
                    sorted_idx = np.argsort(flops)
                    loss = loss[sorted_idx]
                    flops = flops[sorted_idx]
                    if flops[0] > args.lower_flops:
                        flops = np.concatenate([[args.lower_flops], flops.reshape(-1)])
                        loss = np.concatenate([[8], loss.reshape(-1)])
                    else:
                        flops = flops.reshape(-1)
                        loss = loss.reshape(-1)

                    if flops[-1] < args.upper_flops and (loss[-1] > full_val_loss):
                        flops = np.concatenate([flops.reshape(-1), [args.upper_flops]])
                        loss = np.concatenate([loss.reshape(-1), [full_val_loss]])
                    else:
                        flops = flops.reshape(-1)
                        loss = loss.reshape(-1)

                    # Equation (4) in paper
                    areas = (flops[1:]-flops[:-1])*(loss[:-1]-loss[1:])

                    # Quantize into 50 bins to sample from multinomial
                    self.sampling_weights = np.zeros(50)
                    k = 0
                    while k < len(flops) and flops[k] < args.lower_flops:
                        k+=1
                    for i in range(50):
                        lower = i/50.
                        upper = (i+1)/50.
                        if upper < args.lower_flops or lower > args.upper_flops or lower < args.lower_flops:
                            continue
                        cnt = 1
                        while ((k+1) < len(flops)) and upper > flops[k+1]:
                            self.sampling_weights[i] += areas[k]
                            cnt += 1
                            k += 1
                        if k < len(areas):
                            self.sampling_weights[i] += areas[k]
                        self.sampling_weights[i] /= cnt
                    if np.sum(self.sampling_weights) == 0:
                        self.sampling_weights = np.ones(50)
                        
                    if target_flops == 0:
                        val = np.arange(0.01, 1, 0.02)
                        chosen_target_flops = np.random.choice(val, p=(self.sampling_weights/np.sum(self.sampling_weights)))
                    else:
                        chosen_target_flops = target_flops
                    
                    # Binary search over the scalarization weight to hit the target FLOPs ratio
                    lower_bnd, upper_bnd = 0, 1
                    lmda = 0.5
                    for i in range(10):
                        self.mobo_obj.rand = lmda

                        parameterization, acq_value = optimize_acqf(
                            self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                        )

                        parameterization = parameterization[0].cpu().numpy()
                        layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
                        sim_flops = self.mask_pruner.simulate_and_count_flops(layer_budget)
                        ratio = sim_flops/og_flops

                        if np.abs(ratio - chosen_target_flops) <= 0.02:
                            break
                        if args.baseline > 0:
                            if ratio < chosen_target_flops:
                                lower_bnd = lmda
                                lmda = (lmda + upper_bnd) / 2
                            elif ratio > chosen_target_flops:
                                upper_bnd = lmda
                                lmda = (lmda + lower_bnd) / 2
                        else:
                            if ratio < chosen_target_flops:
                                upper_bnd = lmda
                                lmda = (lmda + lower_bnd) / 2
                            elif ratio > chosen_target_flops:
                                lower_bnd = lmda
                                lmda = (lmda + upper_bnd) / 2
                    rand[0] = lmda
                    writer.add_scalar('Binary search trials', i, steps)

                else:
                    parameterization, acq_value = optimize_acqf(
                        self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                    )
                    parameterization = parameterization[0].cpu().numpy()

                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        return layer_budget, parameterization, self.sampling_weights/np.sum(self.sampling_weights)
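RandAcquisition is defined elsewhere and not shown in this example. As a rough illustration only (the actual implementation may differ), an augmented Tchebyshev scalarization of two maximization objectives with weight lmda typically takes the form min(lmda*f1, (1-lmda)*f2) + rho*(lmda*f1 + (1-lmda)*f2); a minimal sketch:

import torch

def augmented_tchebyshev(f_loss, f_cost, lmda, rho=0.05):
    """Illustrative augmented Tchebyshev scalarization of two maximization
    objectives; RandAcquisition's actual formula is assumed, not reproduced."""
    weighted = torch.stack([lmda * f_loss, (1.0 - lmda) * f_cost])
    return weighted.min(dim=0).values + rho * weighted.sum(dim=0)

# e.g. combining two acquisition values evaluated on the same candidates:
# combined = augmented_tchebyshev(UCB_loss(X), UCB_cost(X), lmda=0.5)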
Example No. 6
import torch
from botorch.models import SingleTaskGP
from botorch.fit import fit_gpytorch_model
from botorch.utils import standardize
from gpytorch.mlls import ExactMarginalLogLikelihood
from botorch.acquisition import UpperConfidenceBound
from botorch.optim import optimize_acqf

# Training data:
train_X = torch.rand(10, 2)
Y = 1 - torch.norm(train_X - 0.5, dim=-1, keepdim=True)
Y = Y + 0.1 * torch.randn_like(Y)  # add some noise
train_Y = standardize(Y)

# Fit the model:
gp = SingleTaskGP(train_X, train_Y)
mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
fit_gpytorch_model(mll)

print(mll)

# Construct acquisition function:
UCB = UpperConfidenceBound(gp, beta=0.1)

print(UCB)

bounds = torch.stack([torch.zeros(2), torch.ones(2)])
candidate, acq_value = optimize_acqf(
    UCB, bounds=bounds, q=1, num_restarts=5, raw_samples=20,
)

print(candidate, acq_value)
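A short continuation sketch (not in the original snippet): evaluate the suggested candidate on the same toy objective and extend the training data, so the fit/optimize steps above could be repeated.

# Evaluate the toy objective at the suggested candidate and append it to the data
new_Y = 1 - torch.norm(candidate - 0.5, dim=-1, keepdim=True)
Y = torch.cat([Y, new_Y], dim=0)
train_X = torch.cat([train_X, candidate], dim=0)
train_Y = standardize(Y)  # re-standardize before refitting the GP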
Example No. 7
    def sample_arch(self, START_BO, g, hyperparams, og_flops, empty_val_loss, full_val_loss, target_flops=0):
        if g < START_BO:
            if target_flops == 0:
                f = np.random.rand(1) * (args.upper_channel-args.lower_channel) + args.lower_channel
            else:
                f = args.lower_channel
            parameterization = np.ones(hyperparams.get_dim()) * f
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        elif g == START_BO:
            if target_flops == 0:
                parameterization = np.ones(hyperparams.get_dim())
            else:
                f = args.lower_channel
                parameterization = np.ones(hyperparams.get_dim()) * f
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        else:
            rand = torch.rand(1).cuda()

            train_X = torch.FloatTensor(self.X).cuda()
            train_Y_loss = torch.FloatTensor(np.array(self.Y)[:, 0].reshape(-1, 1)).cuda()
            train_Y_loss = standardize(train_Y_loss)

            train_Y_cost = torch.FloatTensor(np.array(self.Y)[:, 1].reshape(-1, 1)).cuda()
            train_Y_cost = standardize(train_Y_cost)

            covar_module = None
            if args.ski and g > 128:
                if args.additive:
                    covar_module = AdditiveStructureKernel(
                        ScaleKernel(
                            GridInterpolationKernel(
                                MaternKernel(
                                    nu=2.5,
                                    lengthscale_prior=GammaPrior(3.0, 6.0),
                                ),
                                grid_size=128, num_dims=1, grid_bounds=[(0, 1)]
                            ),
                            outputscale_prior=GammaPrior(2.0, 0.15),
                        ), 
                        num_dims=train_X.shape[1]
                    )
                else:
                    covar_module = ScaleKernel(
                        GridInterpolationKernel(
                            MaternKernel(
                                nu=2.5,
                                lengthscale_prior=GammaPrior(3.0, 6.0),
                            ),
                            grid_size=128, num_dims=train_X.shape[1], grid_bounds=[(0, 1) for _ in range(train_X.shape[1])]
                        ),
                        outputscale_prior=GammaPrior(2.0, 0.15),
                    )
            else:
                if args.additive:
                    covar_module = AdditiveStructureKernel(
                        ScaleKernel(
                            MaternKernel(
                                nu=2.5,
                                lengthscale_prior=GammaPrior(3.0, 6.0),
                                num_dims=1
                            ),
                            outputscale_prior=GammaPrior(2.0, 0.15),
                        ),
                        num_dims=train_X.shape[1]
                    )
                else:
                    covar_module = ScaleKernel(
                        MaternKernel(
                            nu=2.5,
                            lengthscale_prior=GammaPrior(3.0, 6.0),
                            num_dims=train_X.shape[1]
                        ),
                        outputscale_prior=GammaPrior(2.0, 0.15),
                    )

            new_train_X = train_X
            gp_loss = SingleTaskGP(new_train_X, train_Y_loss, covar_module=covar_module)
            mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss)
            mll = mll.to('cuda')
            fit_gpytorch_model(mll)


            # Use add-gp for cost
            covar_module = AdditiveStructureKernel(
                ScaleKernel(
                    MaternKernel(
                        nu=2.5,
                        lengthscale_prior=GammaPrior(3.0, 6.0),
                        num_dims=1
                    ),
                    outputscale_prior=GammaPrior(2.0, 0.15),
                ),
                num_dims=train_X.shape[1]
            )
            gp_cost = SingleTaskGP(new_train_X, train_Y_cost, covar_module=covar_module)
            mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost)
            mll = mll.to('cuda')
            fit_gpytorch_model(mll)

            UCB_loss = UpperConfidenceBound(gp_loss, beta=args.beta).cuda()
            UCB_cost = UpperConfidenceBound(gp_cost, beta=args.beta).cuda()
            self.mobo_obj = RandAcquisition(UCB_loss).cuda()
            self.mobo_obj.setup(UCB_loss, UCB_cost, rand)

            lower = torch.ones(new_train_X.shape[1])*args.lower_channel
            upper = torch.ones(new_train_X.shape[1])*args.upper_channel
            self.mobo_bounds = torch.stack([lower, upper]).cuda()

            if args.pas:
                val = np.linspace(args.lower_flops, 1, 50)
                chosen_target_flops = np.random.choice(val, p=(self.sampling_weights/np.sum(self.sampling_weights)))
                
                lower_bnd, upper_bnd = 0, 1
                lmda = 0.5
                for i in range(10):
                    self.mobo_obj.rand = lmda

                    parameterization, acq_value = optimize_acqf(
                        self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                    )

                    parameterization = parameterization[0].cpu().numpy()
                    layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
                    sim_flops = self.mask_pruner.simulate_and_count_flops(layer_budget, self.use_mem)
                    ratio = sim_flops/og_flops

                    if np.abs(ratio - chosen_target_flops) <= 0.02:
                        break
                    if args.baseline > 0:
                        if ratio < chosen_target_flops:
                            lower_bnd = lmda
                            lmda = (lmda + upper_bnd) / 2
                        elif ratio > chosen_target_flops:
                            upper_bnd = lmda
                            lmda = (lmda + lower_bnd) / 2
                    else:
                        if ratio < chosen_target_flops:
                            upper_bnd = lmda
                            lmda = (lmda + lower_bnd) / 2
                        elif ratio > chosen_target_flops:
                            lower_bnd = lmda
                            lmda = (lmda + upper_bnd) / 2
                rand[0] = lmda
                writer.add_scalar('Binary search trials', i, g)

            else:
                parameterization, acq_value = optimize_acqf(
                    self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                )
                parameterization = parameterization[0].cpu().numpy()

            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        return layer_budget, parameterization, self.sampling_weights/np.sum(self.sampling_weights)
Example No. 8
def main(benchmark_name, dataset_name, dimensions, method_name, num_runs,
         run_start, num_iterations,
         # acquisition_name,
         # acquisition_optimizer_name,
         gamma, num_random_init,
         num_restarts, raw_samples, noise_variance_init,
         # use_ard,
         # use_input_warping,
         standardize_targets,
         input_dir, output_dir):

    # TODO(LT): Turn into options
    # device = "cpu"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.double

    benchmark = make_benchmark(benchmark_name,
                               dimensions=dimensions,
                               dataset_name=dataset_name,
                               input_dir=input_dir)
    name = make_name(benchmark_name,
                     dimensions=dimensions,
                     dataset_name=dataset_name)

    output_path = Path(output_dir).joinpath(name, method_name)
    output_path.mkdir(parents=True, exist_ok=True)

    options = dict(gamma=gamma, num_random_init=num_random_init,
                   num_restarts=num_restarts, raw_samples=raw_samples,
                   noise_variance_init=noise_variance_init,
                   standardize_targets=standardize_targets)
    with output_path.joinpath("options.yaml").open('w') as f:
        yaml.dump(options, f)

    config_space = DenseConfigurationSpace(benchmark.get_config_space())
    bounds = create_bounds(config_space.get_bounds(), device=device, dtype=dtype)
    input_dim = config_space.get_dimensions()

    def func(tensor, *args, **kwargs):
        """
        Wrapper that receives and returns torch.Tensor
        """
        config = dict_from_tensor(tensor, cs=config_space)
        # turn into maximization problem
        res = - benchmark.evaluate(config).value
        return torch.tensor(res, device=device, dtype=dtype)

    for run_id in trange(run_start, num_runs, unit="run"):

        t_start = datetime.now()

        rows = []

        features = []
        targets = []

        noise_variance = torch.tensor(noise_variance_init, device=device, dtype=dtype)
        state_dict = None

        with trange(num_iterations) as iterations:

            for i in iterations:

                if len(targets) < num_random_init:
                    # click.echo(f"Completed {i}/{num_random_init} initial runs. "
                    #            "Suggesting random candidate...")
                    # TODO(LT): support random seed
                    x_new = torch.rand(size=(input_dim,), device=device, dtype=dtype)
                else:

                    # construct dataset
                    X = torch.vstack(features)
                    y = torch.hstack(targets).unsqueeze(axis=-1)
                    y = standardize(y) if standardize_targets else y

                    # construct model
                    # model = FixedNoiseGP(X, standardize(y), noise_variance.expand_as(y),
                    model = FixedNoiseGP(X, y, noise_variance.expand_as(y),
                                         input_transform=None).to(X)
                    mll = ExactMarginalLogLikelihood(model.likelihood, model)

                    if state_dict is not None:
                        model.load_state_dict(state_dict)

                    # update model
                    fit_gpytorch_model(mll)

                    # construct acquisition function
                    tau = torch.quantile(y, q=1-gamma)
                    iterations.set_postfix(tau=tau.item())

                    ei = ExpectedImprovement(model=model, best_f=tau)

                    # optimize acquisition function
                    X_batch, b = optimize_acqf(acq_function=ei, bounds=bounds,
                                               q=1,
                                               num_restarts=num_restarts,
                                               raw_samples=raw_samples,
                                               options=dict(batch_limit=5,
                                                            maxiter=200))
                    x_new = X_batch.squeeze(axis=0)

                    state_dict = model.state_dict()

                # evaluate blackbox objective
                # t0 = datetime.now()
                y_new = func(x_new)
                t1 = datetime.now()

                delta = t1 - t_start

                # update dataset
                features.append(x_new)
                targets.append(y_new)

                row = dict_from_tensor(x_new, cs=config_space)
                row["loss"] = - y_new.item()
                row["finished"] = delta.total_seconds()
                rows.append(row)

        data = pd.DataFrame(data=rows)
        data.to_csv(output_path.joinpath(f"{run_id:03d}.csv"))

    return 0
Example No. 9
    def observe(self, X, y):
        """Send an observation of a suggestion back to the optimizer.
        Parameters
        ----------
        X : list of dict-like
            Places where the objective function has already been evaluated.
            Each suggestion is a dictionary where each key corresponds to a
            parameter being optimized.
        y : array-like, shape (n,)
            Corresponding values where objective has been evaluated
        """
        try:
            assert len(X) == len(y)
            c = 0

            for x_, y_ in zip(X, y):
                # Archive stores all the solutions
                self.archive.append(x_)
                self.arc_fitness.append(
                    -y_)  # As BoTorch solves a maximization problem

                if self.iter == 1:
                    self.population.append(x_)
                    self.fitness.append(y_)
                else:
                    if y_ <= self.fitness[c]:
                        self.population[c] = x_
                        self.fitness[c] = y_

                    c += 1

                # Ignore any non-finite observations; unclear if this is the right thing to do
                if np.isfinite(y_):
                    self._observe(x_, y_)

            # Transform the data (seen till now) into tensors and train the model
            train_x = normalize(torch.from_numpy(
                self.search_space.warp(self.archive)),
                                bounds=self.torch_bounds)
            train_y = standardize(
                torch.from_numpy(
                    np.array(self.arc_fitness).reshape(len(self.arc_fitness),
                                                       1)))
            # Fit the GP based on the actual observed values
            if self.iter == 1:
                self.model, mll = self.make_model(train_x, train_y)
            else:
                self.model, mll = self.make_model(train_x, train_y,
                                                  self.model.state_dict())

            # mll.train()
            fit_gpytorch_model(mll)

            # define the sampler
            sampler = SobolQMCNormalSampler(num_samples=512)

            # define the acquisition function
            self.acquisition = qExpectedImprovement(model=self.model,
                                                    best_f=train_y.max(),
                                                    sampler=sampler)

        except Exception as e:
            print('Error: {} in observe()'.format(e))
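A hypothetical continuation (not part of the original class) showing how the qEI acquisition built in observe() might later be optimized to propose a new point in the normalized, warped space; `opt` stands for an instance of the optimizer class above, and the bounds shape is an assumption:

# Optimize the fitted qEI acquisition over the normalized [0, 1]^d box
dim = opt.torch_bounds.shape[-1]  # assumes bounds are shaped (2, d), the BoTorch convention
unit_bounds = torch.stack([torch.zeros(dim, dtype=torch.double),
                           torch.ones(dim, dtype=torch.double)])
candidates, _ = optimize_acqf(
    acq_function=opt.acquisition,
    bounds=unit_bounds,
    q=1,
    num_restarts=10,
    raw_samples=256,
)
# `candidates` lives in the normalized, warped space; it still needs to be
# un-normalized and un-warped with the search-space utilities before evaluation.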
Example No. 10
    def sample_arch(self,
                    START_BO,
                    g,
                    hyperparams,
                    og_flops,
                    empty_val_loss,
                    full_val_loss,
                    target_flops=0):
        # Warming up the history with a single width-multiplier
        if g < START_BO:
            if target_flops == 0:
                f = np.random.rand(1) * (args.upper_channel - args.
                                         lower_channel) + args.lower_channel
            else:
                f = args.lower_channel
            parameterization = np.ones(hyperparams.get_dim()) * f
            layer_budget = hyperparams.get_layer_budget_from_parameterization(
                parameterization, self.mask_pruner)
        # Put largest model into the history
        elif g == START_BO:
            if target_flops == 0:
                parameterization = np.ones(hyperparams.get_dim())
            else:
                f = args.lower_channel
                parameterization = np.ones(hyperparams.get_dim()) * f
            layer_budget = hyperparams.get_layer_budget_from_parameterization(
                parameterization, self.mask_pruner)
        # MOBO-RS
        else:
            rand = torch.rand(1).cuda()

            train_X = torch.FloatTensor(self.X).cuda()
            train_Y_loss = torch.FloatTensor(
                np.array(self.Y)[:, 0].reshape(-1, 1)).cuda()
            train_Y_loss = standardize(train_Y_loss)

            train_Y_cost = torch.FloatTensor(
                np.array(self.Y)[:, 1].reshape(-1, 1)).cuda()
            train_Y_cost = standardize(train_Y_cost)

            new_train_X = train_X
            gp_loss = SingleTaskGP(new_train_X, train_Y_loss)
            mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss)
            mll = mll.to('cuda')
            fit_gpytorch_model(mll)

            # Use add-gp for cost
            covar_module = AdditiveStructureKernel(ScaleKernel(
                MaternKernel(nu=2.5,
                             lengthscale_prior=GammaPrior(3.0, 6.0),
                             num_dims=1),
                outputscale_prior=GammaPrior(2.0, 0.15),
            ),
                                                   num_dims=train_X.shape[1])
            gp_cost = SingleTaskGP(new_train_X,
                                   train_Y_cost,
                                   covar_module=covar_module)
            mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost)
            mll = mll.to('cuda')
            fit_gpytorch_model(mll)

            UCB_loss = UpperConfidenceBound(gp_loss).cuda()
            UCB_cost = UpperConfidenceBound(gp_cost).cuda()
            self.mobo_obj = RandAcquisition(UCB_loss).cuda()
            self.mobo_obj.setup(UCB_loss, UCB_cost, rand)

            lower = torch.ones(new_train_X.shape[1]) * args.lower_channel
            upper = torch.ones(new_train_X.shape[1]) * args.upper_channel
            self.mobo_bounds = torch.stack([lower, upper]).cuda()

            if args.pas:
                costs = []
                for i in range(len(self.population_data)):
                    costs.append([
                        self.population_data[i]['loss'],
                        self.population_data[i]['ratio']
                    ])
                costs = np.array(costs)
                efficient_mask = is_pareto_efficient(costs)
                costs = costs[efficient_mask]
                loss = costs[:, 0]
                flops = costs[:, 1]
                sorted_idx = np.argsort(flops)
                loss = loss[sorted_idx]
                flops = flops[sorted_idx]
                if flops[0] > args.lower_flops:
                    flops = np.concatenate([[args.lower_flops],
                                            flops.reshape(-1)])
                    loss = np.concatenate([[empty_val_loss], loss.reshape(-1)])
                else:
                    flops = flops.reshape(-1)
                    loss = loss.reshape(-1)

                if flops[-1] < args.upper_flops and (loss[-1] > full_val_loss):
                    flops = np.concatenate(
                        [flops.reshape(-1), [args.upper_flops]])
                    loss = np.concatenate([loss.reshape(-1), [full_val_loss]])
                else:
                    flops = flops.reshape(-1)
                    loss = loss.reshape(-1)

                areas = (flops[1:] - flops[:-1]) * (loss[:-1] - loss[1:])

                self.sampling_weights = np.zeros(50)
                k = 0
                while k < len(flops) and flops[k] < args.lower_flops:
                    k += 1
                for i in range(50):
                    lower = i / 50.
                    upper = (i + 1) / 50.
                    if upper < args.lower_flops or lower > args.upper_flops or lower < args.lower_flops:
                        continue
                    cnt = 1
                    while ((k + 1) < len(flops)) and upper > flops[k + 1]:
                        self.sampling_weights[i] += areas[k]
                        cnt += 1
                        k += 1
                    if k < len(areas):
                        self.sampling_weights[i] += areas[k]
                    self.sampling_weights[i] /= cnt
                if np.sum(self.sampling_weights) == 0:
                    self.sampling_weights = np.ones(50)

                if target_flops == 0:
                    val = np.arange(0.01, 1, 0.02)
                    chosen_target_flops = np.random.choice(
                        val,
                        p=(self.sampling_weights /
                           np.sum(self.sampling_weights)))
                else:
                    chosen_target_flops = target_flops

                lower_bnd, upper_bnd = 0, 1
                lmda = 0.5
                for i in range(10):
                    self.mobo_obj.rand = lmda

                    parameterization, acq_value = optimize_acqf(
                        self.mobo_obj,
                        bounds=self.mobo_bounds,
                        q=1,
                        num_restarts=5,
                        raw_samples=1000,
                    )

                    parameterization = parameterization[0].cpu().numpy()
                    layer_budget = hyperparams.get_layer_budget_from_parameterization(
                        parameterization, self.mask_pruner)
                    sim_flops = self.mask_pruner.simulate_and_count_flops(
                        layer_budget)
                    ratio = sim_flops / og_flops

                    if np.abs(ratio - chosen_target_flops) <= 0.02:
                        break
                    if args.baseline > 0:
                        if ratio < chosen_target_flops:
                            lower_bnd = lmda
                            lmda = (lmda + upper_bnd) / 2
                        elif ratio > chosen_target_flops:
                            upper_bnd = lmda
                            lmda = (lmda + lower_bnd) / 2
                    else:
                        if ratio < chosen_target_flops:
                            upper_bnd = lmda
                            lmda = (lmda + lower_bnd) / 2
                        elif ratio > chosen_target_flops:
                            lower_bnd = lmda
                            lmda = (lmda + upper_bnd) / 2
                rand[0] = lmda
                writer.add_scalar('Binary search trials', i, g)

            else:
                parameterization, acq_value = optimize_acqf(
                    self.mobo_obj,
                    bounds=self.mobo_bounds,
                    q=1,
                    num_restarts=5,
                    raw_samples=1000,
                )
                parameterization = parameterization[0].cpu().numpy()

            layer_budget = hyperparams.get_layer_budget_from_parameterization(
                parameterization, self.mask_pruner)
        return layer_budget, parameterization, self.sampling_weights / np.sum(
            self.sampling_weights)