Example #1
    def test_inst_no_feat(self):
        ''' test if scenarios are treated correctly if no features are
        specified.'''
        scen = Scenario(self.scen_fn,
                        cmd_options={
                            'run_obj': 'quality',
                            'train_insts': self.train_insts,
                            'test_insts': self.test_insts
                        })
        self.assertTrue(scen.feature_array is None)
        self.assertEqual(len(scen.feature_dict), 0)

        scen.instance_specific = self.inst_specs
        validator = Validator(scen, self.trajectory, self.rng)
        # Add a few runs and check that they are correctly processed
        old_configs = [entry["incumbent"] for entry in self.trajectory]
        old_rh = RunHistory()
        for config in old_configs[:int(len(old_configs) / 2)]:
            old_rh.add(config,
                       1,
                       1,
                       StatusType.SUCCESS,
                       instance_id='0',
                       seed=127)
        rh = validator.validate_epm('all', 'train+test', 1, old_rh)
        self.assertEqual(len(old_rh.get_all_configs()), 4)
        self.assertEqual(len(rh.get_all_configs()), 10)
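All of these examples revolve around the same two calls: RunHistory.add() to record a run and RunHistory.get_all_configs() to retrieve the distinct configurations seen so far. A minimal, self-contained sketch of just that pattern; the import paths and the toy hyperparameter "x" are assumptions and differ between SMAC/ConfigSpace versions:

from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter
# Import paths below are assumptions; they vary across SMAC versions.
from smac.runhistory.runhistory import RunHistory
from smac.tae import StatusType

cs = ConfigurationSpace()
cs.add_hyperparameter(UniformFloatHyperparameter("x", 0.0, 1.0))

rh = RunHistory()
for seed, config in enumerate(cs.sample_configuration(3)):
    rh.add(config=config,
           cost=1.0 + seed,          # toy cost value
           time=0.1,
           status=StatusType.SUCCESS,
           instance_id='0',
           seed=seed)

# get_all_configs() returns every distinct Configuration added so far.
assert len(rh.get_all_configs()) == 3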
Example #2
    def _get_initial_points(
        self,
        num_points: int,
        runhistory: RunHistory,
        additional_start_points: Optional[List[Tuple[float, Configuration]]],
    ) -> List[Configuration]:

        if runhistory.empty():
            init_points = self.config_space.sample_configuration(
                size=num_points)
        else:
            # initiate local search
            configs_previous_runs = runhistory.get_all_configs()

            # configurations with the highest previous EI
            configs_previous_runs_sorted = self._sort_configs_by_acq_value(
                configs_previous_runs)
            configs_previous_runs_sorted = [
                conf[1] for conf in configs_previous_runs_sorted[:num_points]
            ]

            # configurations with the lowest predictive cost, check for None to make unit tests work
            if self.acquisition_function.model is not None:
                conf_array = convert_configurations_to_array(
                    configs_previous_runs)
                costs = self.acquisition_function.model.predict_marginalized_over_instances(
                    conf_array)[0]
                # From here
                # http://stackoverflow.com/questions/20197990/how-to-make-argsort-result-to-be-random-between-equal-values
                random = self.rng.rand(len(costs))
                # Last column is primary sort key!
                indices = np.lexsort((random.flatten(), costs.flatten()))

                # Cannot use zip here because the indices array cannot index the
                # rand_configs list, because the second is a pure python list
                configs_previous_runs_sorted_by_cost = [
                    configs_previous_runs[ind] for ind in indices
                ][:num_points]
            else:
                configs_previous_runs_sorted_by_cost = []

            if additional_start_points is not None:
                additional_start_points = [
                    asp[1] for asp in additional_start_points[:num_points]
                ]
            else:
                additional_start_points = []

            init_points = []
            init_points_as_set = set()  # type: Set[Configuration]
            for cand in itertools.chain(
                    configs_previous_runs_sorted,
                    configs_previous_runs_sorted_by_cost,
                    additional_start_points,
            ):
                if cand not in init_points_as_set:
                    init_points.append(cand)
                    init_points_as_set.add(cand)

        return init_points
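The np.lexsort call above implements random tie-breaking for argsort: the predicted costs are the primary sort key and a random vector only decides the order among equal costs. The same trick in isolation, with plain NumPy and no SMAC dependency:

import numpy as np

rng = np.random.RandomState(1)
costs = np.array([0.3, 0.1, 0.3, 0.2])  # indices 0 and 2 tie
tie_breaker = rng.rand(len(costs))

# np.lexsort sorts by the *last* key first, so costs is the primary key
# and tie_breaker only orders configurations with equal cost.
indices = np.lexsort((tie_breaker, costs))
# indices starts with 1 (cost 0.1), then 3 (0.2), then 0 and 2 in random order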
Example #3
    def _next_challenger(
            self,
            challengers: typing.Optional[typing.List[Configuration]],
            chooser: typing.Optional[EPMChooser],
            run_history: RunHistory,
            repeat_configs: bool = True) -> typing.Optional[Configuration]:
        """ Retuns the next challenger to use in intensification
        If challenger is None, then optimizer will be used to generate the next challenger

        Parameters
        ----------
        challengers : typing.List[Configuration]
            promising configurations to evaluate next
        chooser : smac.optimizer.epm_configuration_chooser.EPMChooser
            a sampler that generates next configurations to use for racing
        run_history : smac.runhistory.runhistory.RunHistory
            stores all runs we ran so far
        repeat_configs : bool
            if False, an evaluated configuration will not be generated again

        Returns
        -------
        Configuration
            next challenger to use
        """
        start_time = time.time()

        used_configs = set(run_history.get_all_configs())

        if challengers:
            # iterate over challengers provided
            self.logger.debug("Using challengers provided")
            chall_gen = (c for c in challengers)  # type: _config_to_run_type
        elif chooser:
            # generating challengers on-the-fly if optimizer is given
            self.logger.debug("Generating new challenger from optimizer")
            chall_gen = chooser.choose_next()
        else:
            raise ValueError(
                'No configurations/chooser provided. Cannot generate challenger!'
            )

        self.logger.debug('Time to select next challenger: %.4f' %
                          (time.time() - start_time))

        # select challenger from the generators
        assert chall_gen is not None
        for challenger in chall_gen:
            # repetitions allowed
            if repeat_configs:
                return challenger

            # otherwise, select only a unique challenger
            if challenger not in used_configs:
                return challenger

        self.logger.debug("No valid challenger was generated!")
        return None
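The uniqueness check above works because Configuration objects are hashable: the set built from run_history.get_all_configs() filters out already-evaluated candidates. A stripped-down sketch of the same pattern; the helper name and signature are illustrative, not SMAC API:

def first_unseen_challenger(challengers, run_history):
    """Return the first challenger that has not been evaluated yet, else None."""
    evaluated = set(run_history.get_all_configs())
    for challenger in challengers:
        if challenger not in evaluated:
            return challenger
    return None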
Example #4
    def reduce_runhistory(self, rh: RunHistory, max_configs: int, keep=None):
        """
        Reduce configs to desired number, by default just drop the configs with the fewest runs.

        Parameters
        ----------
        rh: RunHistory
            runhistory that is to be reduced
        max_configs: int
            if > 0, reduce the runhistory to at most max_configs configurations
        keep: List[Configuration]
            list of configs that should be kept for sure (e.g. default, incumbents)

        Returns
        -------
        rh: RunHistory
            reduced runhistory
        """
        configs = rh.get_all_configs()
        if max_configs <= 0 or max_configs > len(configs):  # keep all
            return rh

        runs = [(c,
                 len(rh.get_runs_for_config(c,
                                            only_max_observed_budget=False)))
                for c in configs]
        if not keep:
            keep = []
        runs = sorted(runs, key=lambda x: x[1])[-self.max_plot:]
        keep = [r[0] for r in runs] + keep
        self.logger.info(
            "Reducing number of configs from %d to %d, dropping the configs with the fewest evaluations",
            len(configs), len(keep))

        new_rh = RunHistory()
        for k, v in list(rh.data.items()):
            c = rh.ids_config[k.config_id]
            if c in keep:
                new_rh.add(config=rh.ids_config[k.config_id],
                           cost=v.cost,
                           time=v.time,
                           status=v.status,
                           instance_id=k.instance_id,
                           seed=k.seed)
        return new_rh
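The copy loop above is the general recipe for filtering a RunHistory: iterate rh.data (RunKey -> RunValue), resolve each Configuration through rh.ids_config, and re-add only the runs to keep. As a standalone helper, assuming the same RunHistory layout as in the example (the helper name is my own):

def filter_runhistory(rh, keep_configs):
    """Return a new RunHistory containing only the runs of keep_configs."""
    keep = set(keep_configs)
    new_rh = RunHistory()
    for key, value in rh.data.items():
        config = rh.ids_config[key.config_id]
        if config in keep:
            new_rh.add(config=config,
                       cost=value.cost,
                       time=value.time,
                       status=value.status,
                       instance_id=key.instance_id,
                       seed=key.seed)
    return new_rh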
Example #5
    def test_json_origin(self):

        for origin in ['test_origin', None]:
            rh = RunHistory()
            cs = get_config_space()
            config1 = Configuration(cs,
                                    values={'a': 1, 'b': 2},
                                    origin=origin)

            rh.add(config=config1, cost=10, time=20,
                   status=StatusType.SUCCESS, instance_id=1,
                   seed=1)

            path = 'test/test_files/test_json_origin.json'
            rh.save_json(path)
            _ = rh.load_json(path, cs)

            self.assertEqual(rh.get_all_configs()[0].origin, origin)

            os.remove(path)
Example #6
    def __init__(
        self,
        original_rh: RunHistory,
        validated_rh: RunHistory,
        validator: Validator,
        scenario: Scenario,
        default: Configuration,
        incumbent: Configuration,
        param_imp: Union[None, Dict[str, float]],
        params: Union[int, List[str]],
        n_configs: int,
        pc_sort_by: str,
        output_dir: str,
        cs: ConfigurationSpace,
        runtime: bool = False,
        max_runs_epm: int = 3000000,
    ):
        """This function prepares the data from a SMAC-related
        format (using runhistories and parameters) to a more general format
        (using a dataframe). The resulting dataframe is passed to the
        parallel_coordinates-routine

        Parameters
        ----------
        original_rh: RunHistory
            runhistory that should contain only runs that were executed during search
        validated_rh: RunHistory
            runhistory that may contain as many runs as possible, including external runs;
            this runhistory will be used to build the EPM
        validator: Validator
            validator to be used to estimate costs for configurations
        scenario: Scenario
            scenario object to take instances from
        default, incumbent: Configuration
            default and incumbent configuration; both are always displayed
        param_imp: Union[None, Dict[str, float]]
            if given, maps parameter names to their importance
        params: Union[int, List[str]]
            either the parameters to be displayed directly, or the number of parameters to
            display (in which case the most important ones are chosen)
        n_configs: int
            number of configs to be plotted
        pc_sort_by: str
            defines the pimp-method by which to choose the plotted parameters
        max_runs_epm: int
            maximum number of runs used to train the EPM; this should prevent MemoryErrors
        output_dir: str
            output directory for plots
        cs: ConfigurationSpace
            parameter configuration space to be visualized
        runtime: boolean
            if True, runtime is plotted on a log scale
        """

        self.logger = logging.getLogger(self.__module__ + "." +
                                        self.__class__.__name__)
        self.error = None

        self.default = default
        self.param_imp = param_imp
        self.cs = cs

        # Sort by importance, if possible (fall back to the first evaluated parameter-importance method)
        self.method, self.importance = "", {}
        if pc_sort_by == 'all':
            self.logger.debug("Sorting by average importance")
            self.method = 'average'
            for m, i in self.param_imp.items():
                if i:
                    for p, imp in i.items():
                        if p in self.importance:
                            self.importance[p].append(imp)
                        else:
                            self.importance[p] = [imp]
            self.importance = {
                k: sum(v) / len(v)
                for k, v in self.importance.items()
            }
        elif pc_sort_by in self.param_imp:
            self.method, self.importance = pc_sort_by, self.param_imp[
                pc_sort_by]
        else:
            self.logger.debug("%s not evaluated.. choosing at random from: %s",
                              pc_sort_by, str(list(self.param_imp.keys())))
            for m, i in self.param_imp.items():
                if i:
                    self.method, self.importance = m, i
                    break

        self.hp_names = sorted(
            [hp for hp in self.cs.get_hyperparameter_names()],
            key=lambda x: self.importance.get(x, 0),
            reverse=True)
        self.logger.debug("Sorted hp's by method \'%s\': %s", self.method,
                          str(self.hp_names))

        # To be set
        self.plots = []

        # Define set of configurations (limiting to max and choosing most interesting ones)
        all_configs = original_rh.get_all_configs()
        max_runs_epm = 300000  # maximum total number of runs considered for the EPM; limits the maximum possible number of configs
        max_configs = int(
            max_runs_epm /
            (len(scenario.train_insts) + len(scenario.test_insts)))
        if len(all_configs) > max_configs:
            self.logger.debug(
                "Limiting number of configs to train epm from %d to %d (based on max runs %d) and choosing "
                "the ones with the most runs (for parallel coordinates)",
                len(all_configs), max_configs, max_runs_epm)
            all_configs = sorted(
                all_configs,
                key=lambda c: len(original_rh.get_runs_for_config(c)
                                  ))[:max_configs]
            if default not in all_configs:
                all_configs = [default] + all_configs
            if incumbent not in all_configs:
                all_configs.append(incumbent)

        # Get costs for those configurations
        epm_rh = RunHistory(average_cost)
        epm_rh.update(validated_rh)
        if scenario.feature_dict:  # if instances are available
            epm_rh.update(
                timing(validator.validate_epm)(all_configs,
                                               'train+test',
                                               1,
                                               runhistory=validated_rh))
        self.config_to_cost = {c: epm_rh.get_cost(c) for c in all_configs}

        self.params = self.get_params(params)
        self.n_configs = n_configs

        self.pcp = ParallelCoordinatesPlotter(self.config_to_cost, output_dir,
                                              cs, runtime)
Example #7
class SMAC4EPMOpimizer(AbstractOptimizer):
    def __init__(self, api_config, config_space, parallel_setting="LS"):
        super(SMAC4EPMOpimizer, self).__init__(api_config)
        self.cs = config_space
        self.num_hps = len(self.cs.get_hyperparameters())

        if parallel_setting not in ["CL_min", "CL_max", "CL_mean", "KB", "LS"]:
            raise ValueError(
                "parallel_setting can only be one of the following: "
                "CL_min, CL_max, CL_mean, KB, LS")
        self.parallel_setting = parallel_setting

        rng = np.random.RandomState(seed=0)
        scenario = Scenario({
            "run_obj": "quality",  # we optimize quality (alt. to runtime)
            "runcount-limit": 128,
            "cs": self.cs,  # configuration space
            "deterministic": True,
            "limit_resources": False,
        })

        self.stats = Stats(scenario)
        # traj = TrajLogger(output_dir=None, stats=self.stats)

        self.runhistory = RunHistory()

        r2e_def_kwargs = {
            "scenario": scenario,
            "num_params": self.num_hps,
            "success_states": [
                StatusType.SUCCESS,
            ],
            "impute_censored_data": False,
            "scale_perc": 5,
        }

        self.random_chooser = ChooserProb(rng=rng, prob=0.0)

        types, bounds = get_types(self.cs, instance_features=None)
        model_kwargs = {
            "configspace": self.cs,
            "types": types,
            "bounds": bounds,
            "seed": rng.randint(MAXINT),
        }

        models = []

        cov_amp = ConstantKernel(
            2.0,
            constant_value_bounds=(np.exp(-10), np.exp(2)),
            prior=LognormalPrior(mean=0.0, sigma=1.0, rng=rng),
        )

        cont_dims = np.array(np.where(np.array(types) == 0)[0], dtype=int)
        cat_dims = np.where(np.array(types) != 0)[0]

        if len(cont_dims) > 0:
            exp_kernel = Matern(
                np.ones([len(cont_dims)]),
                [(np.exp(-6.754111155189306), np.exp(0.0858637988771976))
                 for _ in range(len(cont_dims))],
                nu=2.5,
                operate_on=cont_dims,
            )

        if len(cat_dims) > 0:
            ham_kernel = HammingKernel(
                np.ones([len(cat_dims)]),
                [(np.exp(-6.754111155189306), np.exp(0.0858637988771976))
                 for _ in range(len(cat_dims))],
                operate_on=cat_dims,
            )
        assert len(cont_dims) + len(cat_dims) == len(
            scenario.cs.get_hyperparameters())

        noise_kernel = WhiteKernel(
            noise_level=1e-8,
            noise_level_bounds=(np.exp(-25), np.exp(2)),
            prior=HorseshoePrior(scale=0.1, rng=rng),
        )

        if len(cont_dims) > 0 and len(cat_dims) > 0:
            # both
            kernel = cov_amp * (exp_kernel * ham_kernel) + noise_kernel
        elif len(cont_dims) > 0 and len(cat_dims) == 0:
            # only cont
            kernel = cov_amp * exp_kernel + noise_kernel
        elif len(cont_dims) == 0 and len(cat_dims) > 0:
            # only cat
            kernel = cov_amp * ham_kernel + noise_kernel
        else:
            raise ValueError()
        gp_kwargs = {"kernel": kernel}

        rf_kwargs = {}
        rf_kwargs["num_trees"] = model_kwargs.get("num_trees", 10)
        rf_kwargs["do_bootstrapping"] = model_kwargs.get(
            "do_bootstrapping", True)
        rf_kwargs["ratio_features"] = model_kwargs.get("ratio_features", 1.0)
        rf_kwargs["min_samples_split"] = model_kwargs.get(
            "min_samples_split", 2)
        rf_kwargs["min_samples_leaf"] = model_kwargs.get("min_samples_leaf", 1)
        rf_kwargs["log_y"] = model_kwargs.get("log_y", True)

        rf_log = RandomForestWithInstances(**model_kwargs, **rf_kwargs)

        rf_kwargs = copy.deepcopy(rf_kwargs)
        rf_kwargs["log_y"] = False
        rf_no_log = RandomForestWithInstances(**model_kwargs, **rf_kwargs)

        rh2epm_cost = RunHistory2EPM4Cost(**r2e_def_kwargs)
        rh2epm_log_cost = RunHistory2EPM4LogScaledCost(**r2e_def_kwargs)
        rh2epm_copula = RunHistory2EPM4GaussianCopulaCorrect(**r2e_def_kwargs)

        self.combinations = []

        # 2 models * 4 acquisition functions
        acq_funcs = [EI, PI, LogEI, LCB]
        acq_func_instances = []
        # acq_func_maximizer_instances = []

        n_sls_iterations = {
            1: 10,
            2: 10,
            3: 10,
            4: 10,
            5: 10,
            6: 10,
            7: 8,
            8: 6,
        }.get(len(self.cs.get_hyperparameters()), 5)

        acq_func_maximizer_kwargs = {
            "config_space": self.cs,
            "rng": rng,
            "max_steps": 5,
            "n_steps_plateau_walk": 5,
            "n_sls_iterations": n_sls_iterations,
        }
        self.idx_ei = 0

        self.num_models = len(models)
        self.num_acq_funcs = len(acq_funcs)

        no_transform_gp = GaussianProcess(**copy.deepcopy(model_kwargs),
                                          **copy.deepcopy(gp_kwargs))
        ei = EI(model=no_transform_gp)
        acq_func_maximizer_kwargs["acquisition_function"] = ei
        ei_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs)
        self.combinations.append((no_transform_gp, ei, ei_opt, rh2epm_cost))

        pi = PI(model=no_transform_gp)
        acq_func_maximizer_kwargs["acquisition_function"] = pi
        pi_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs)
        self.combinations.append((no_transform_gp, pi, pi_opt, rh2epm_cost))

        lcb = LCB(model=no_transform_gp)
        acq_func_maximizer_kwargs["acquisition_function"] = lcb
        lcb_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs)
        self.combinations.append((no_transform_gp, lcb, lcb_opt, rh2epm_cost))

        gp = GaussianProcess(**copy.deepcopy(model_kwargs),
                             **copy.deepcopy(gp_kwargs))
        ei = EI(model=gp)
        acq_func_maximizer_kwargs["acquisition_function"] = ei
        ei_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs)
        self.combinations.append((gp, ei, ei_opt, rh2epm_copula))

        gp = GaussianProcess(**copy.deepcopy(model_kwargs),
                             **copy.deepcopy(gp_kwargs))
        ei = LogEI(model=gp)
        acq_func_maximizer_kwargs["acquisition_function"] = ei
        ei_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs)
        self.combinations.append((gp, ei, ei_opt, rh2epm_log_cost))

        ei = EI(model=rf_no_log)
        acq_func_maximizer_kwargs["acquisition_function"] = ei
        ei_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs)
        self.combinations.append((rf_no_log, ei, ei_opt, rh2epm_cost))

        ei = LogEI(model=rf_log)
        acq_func_maximizer_kwargs["acquisition_function"] = ei
        ei_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs)
        self.combinations.append((rf_log, ei, ei_opt, rh2epm_log_cost))

        ei = EI(model=rf_no_log)
        acq_func_maximizer_kwargs["acquisition_function"] = ei
        ei_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs)
        self.combinations.append((rf_no_log, ei, ei_opt, rh2epm_copula))

        self.num_acq_instances = len(acq_func_instances)
        self.best_observation = np.inf

        self.next_evaluations = []

    def suggest(self, n_suggestions: int = 1) -> typing.List[typing.Dict]:
        """Get a suggestion from the optimizer.
        Parameters
        ----------
        n_suggestions : int
            Desired number of parallel suggestions in the output
        Returns
        -------
        next_guess : list of dict
            List of `n_suggestions` suggestions to evaluate the objective
            function. Each suggestion is a dictionary where each key
            corresponds to a parameter being optimized.
            CHANGED: each suggestion is a tuple of suggestion and string info!
        """
        all_previous_configs = self.runhistory.get_all_configs()
        num_points = len(all_previous_configs)

        # we will save our info
        info_list = []
        if len(self.next_evaluations) < n_suggestions:

            n_new = n_suggestions - len(self.next_evaluations)

            # import time

            order = np.random.permutation(list(range(len(self.combinations))))
            optimized_this_iter = set()
            while len(self.next_evaluations) < n_new:
                model, acq, acq_opt, rh2epm = self.combinations[order[len(
                    self.next_evaluations)]]
                # start_time = time.time()

                info = ""
                if model.__class__ == RandomForestWithInstances:
                    info += "RF"
                elif model.__class__ == GaussianProcess:
                    info += "GP"
                else:
                    raise ValueError(model.__class__.__name__)
                info += f" {acq.__class__.__name__}"
                if rh2epm.__class__ == RunHistory2EPM4Cost:
                    info += " cost"
                elif rh2epm.__class__ == RunHistory2EPM4LogScaledCost:
                    info += " log_cost"
                elif rh2epm.__class__ == RunHistory2EPM4GaussianCopulaCorrect:
                    info += " copula"
                else:
                    raise ValueError(rh2epm.__class__.__name__)

                # print(model.__class__.__name__,
                #       acq.__class__.__name__,
                #       rh2epm.__class__.__name__)

                X, y = rh2epm.transform(self.runhistory)

                # If all are not finite then we return nothing
                if np.all(~np.isfinite(y)):
                    self.next_evaluations = []
                    return []

                # Safeguard, just in case...
                if np.any(~np.isfinite(y)):
                    y[~np.isfinite(y)] = np.max(y[np.isfinite(y)])

                if (self.parallel_setting != "LS"
                        and len(self.next_evaluations) != 0):
                    x_inc = np.array([
                        next_config.get_array()
                        for next_config in self.next_evaluations
                    ])
                    if self.parallel_setting == "CL_min":
                        y_inc = np.min(y)
                    elif self.parallel_setting == "CL_max":
                        y_inc = np.max(y)
                    elif self.parallel_setting == "CL_mean":
                        y_inc = np.mean(y)
                    elif self.parallel_setting == "KB":
                        if model in optimized_this_iter and isinstance(
                                model, GaussianProcess):
                            # Save some time by re-using the optimized
                            # hyperparameters from before
                            model._train(X, y, do_optimize=False)
                        else:
                            model.train(X, y)
                            optimized_this_iter.add(model)
                        y_inc, var = model.predict_marginalized_over_instances(
                            x_inc)
                        y_inc = y_inc.flatten()
                    else:
                        raise ValueError(
                            "parallel_setting can only be one of the "
                            "following: CL_min, CL_max, CL_mean, KB, LS")
                    if self.parallel_setting in ("CL_min", "CL_max",
                                                 "CL_mean"):  # NOQA
                        y_inc = np.repeat(y_inc,
                                          len(self.next_evaluations)).reshape(
                                              (-1, 1))
                    else:
                        y_inc = y_inc.reshape((-1, 1))
                    X = np.concatenate((X, x_inc))
                    y = np.concatenate((y, y_inc))
                    if (isinstance(model, GaussianProcess)
                            and self.parallel_setting == "KB"):
                        # Save some time by re-using the optimized
                        # hyperparameters from above
                        model._train(X, y, do_optimize=False)
                    else:
                        model.train(X, y)
                        # As the training data for each subsequent model
                        # changes quite drastically (taking the max of all
                        # observations can create really disconnected error
                        # landscapes in the region of the optimum) we have
                        # to re-optimize the hyperparameters here and cannot
                        # add the model to the set of previously
                        # optimized models.
                        # optimized_this_iter.add(model)
                else:
                    model.train(X, y)
                    optimized_this_iter.add(model)

                predictions = model.predict_marginalized_over_instances(X)[0]
                best_index = np.argmin(predictions)
                best_observation = predictions[best_index]
                x_best_array = X[best_index]

                acq.update(
                    model=model,
                    eta=best_observation,
                    incumbent_array=x_best_array,
                    num_data=num_points,
                    X=X,
                )

                new_config_iterator = acq_opt.maximize(
                    runhistory=self.runhistory,
                    stats=self.stats,
                    num_points=10000,
                    random_configuration_chooser=self.random_chooser,
                )

                accept = False
                for next_config in new_config_iterator:
                    if (next_config in self.next_evaluations
                            or next_config in all_previous_configs):
                        continue
                    else:
                        accept = True
                        break
                if not accept:
                    # If we don't find anything within 100 random
                    # configurations, we re-run a configuration
                    for next_config in self.cs.sample_configuration(100):
                        if (next_config not in self.next_evaluations
                                or next_config in all_previous_configs):
                            break
                self.next_evaluations.append(next_config)
                info_list.append(info)
                # print(time.time() - start_time)

        next_guess = [{} for _ in range(n_suggestions)]
        while len(self.next_evaluations) < n_suggestions:
            self.next_evaluations.append(self.cs.sample_configuration())
            info_list.append("Random")
        for i in range(n_suggestions):
            eval_next = self.next_evaluations.pop(0)
            next_guess[i] = (eval_next.get_dictionary(), info_list[i])
        return next_guess

    def init_with_rh(self, rh, iteration):
        # Start from a fresh runhistory; RunHistory.empty() only checks for
        # emptiness, it does not clear existing data.
        self.runhistory = RunHistory()
        for rh_value in rh:
            configuration = Configuration(configuration_space=self.cs,
                                          values=rh_value[0])
            self.runhistory.add(
                config=configuration,
                cost=rh_value[1],
                time=0,
                status=StatusType.SUCCESS,
            )

    def observe(self, X, y):
        """Feed an observation back.
        Parameters
        ----------
        X : list of dict-like
            Places where the objective function has already been evaluated.
            Each suggestion is a dictionary where each key corresponds to a
            parameter being optimized.
        y : array-like, shape (n,)
            Corresponding values where objective has been evaluated
        """
        for xx, yy in zip(X, y):
            configuration = Configuration(configuration_space=self.cs,
                                          values=xx)
            self.runhistory.add(config=configuration,
                                cost=yy,
                                time=0,
                                status=StatusType.SUCCESS)
Example #8
args, unkown = parser.parse_known_args()

logging.basicConfig(level=logging.INFO)
if unkown:
    logging.warning('Could not parse the following arguments: ')
    logging.warning(str(unkown))

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Create Runhistory object as well as scenario object
runhist = RunHistory(average_cost)
scenario = Scenario(args.scenario, cmd_args={'output_dir': ""})
cs = scenario.cs

runhist.load_json(args.data,
                  cs)  # populate the runhistory with the validation data
configs = runhist.get_all_configs()
def_ = cs.get_default_configuration()
def_dict = def_.get_dictionary()

# Switch it around such that statistics about the default are gathered first
if configs[0] != def_:
    tmp = configs[0]
    configs[0] = configs[1]
    configs[1] = tmp
    del tmp
logging.info('Found %d configs' % len(configs))
logging.info('Cost per config:')

# For each config
for config in configs:
    # gather statistics such as
Example #9
    def _plot_parallel_coordinates(
        self,
        original_rh: RunHistory,
        validated_rh: RunHistory,
        validator: Validator,
        scenario: Scenario,
        default: Configuration,
        incumbent: Configuration,
        param_imp: Union[None, Dict[str, float]],
        output_dir: str,
        cs: ConfigurationSpace,
        runtime: bool = False,
    ):
        """
        Parameters:
        -----------
        original_rh: RunHistory
            runhistory that should contain only runs that were executed during search
        validated_rh: RunHistory
            runhistory that may contain as many runs as possible, including external runs;
            this runhistory will be used to build the EPM
        validator: Validator
            validator to be used to estimate costs for configurations
        scenario: Scenario
            scenario object to take instances from
        default, incumbent: Configuration
            default and incumbent configuration; both are always displayed
        param_imp: Union[None, Dict[str, float]]
            if given, maps parameter names to their importance
        output_dir: str
            output directory for plots
        cs: ConfigurationSpace
            parameter configuration space to be visualized
        runtime: boolean
            if True, runtime is plotted on a log scale
        """
        # Sort parameters by importance, if possible (fall back to the first evaluated parameter-importance method)
        method, importance = "", {}
        if self.pc_sort_by == 'all':
            self.logger.debug("Sorting by average importance")
            method = 'average'
            for m, i in param_imp.items():
                if i:
                    for p, imp in i.items():
                        if p in importance:
                            importance[p].append(imp)
                        else:
                            importance[p] = [imp]
            importance = {k: sum(v) / len(v) for k, v in importance.items()}
        elif self.pc_sort_by in param_imp:
            method, importance = self.pc_sort_by, param_imp[self.pc_sort_by]
        else:
            self.logger.debug("%s not evaluated.. choosing at random from: %s",
                              self.pc_sort_by, str(list(param_imp.keys())))
            for m, i in param_imp.items():
                if i:
                    method, importance = m, i
                    self.logger.debug("Chose %s", method)
                    break

        hp_names = sorted([hp for hp in cs.get_hyperparameter_names()],
                          key=lambda x: importance.get(x, 0),
                          reverse=True)
        self.logger.debug("Sorted hp's by method \'%s\': %s", method,
                          str(hp_names))

        # To be set
        self.plots = []

        # Define set of configurations (limiting to max and choosing most interesting ones)
        all_configs = original_rh.get_all_configs()
        max_runs_epm = self.max_runs_epm  # maximum total number of runs considered for the EPM; limits the maximum possible number of configs
        max_configs = int(
            max_runs_epm /
            (len(scenario.train_insts) + len(scenario.test_insts)))
        if len(all_configs) > max_configs:
            self.logger.debug(
                "Limiting number of configs to train epm from %d to %d (based on max runs %d) and choosing "
                "the ones with the most runs (for parallel coordinates)",
                len(all_configs), max_configs, max_runs_epm)
            all_configs = sorted(
                all_configs,
                key=lambda c: len(original_rh.get_runs_for_config(c)
                                  ))[:max_configs]
            if default not in all_configs:
                all_configs = [default] + all_configs
            if incumbent not in all_configs:
                all_configs.append(incumbent)

        # Get costs for those configurations
        epm_rh = RunHistory(average_cost)
        epm_rh.update(validated_rh)
        if scenario.feature_dict:  # if instances are available
            epm_rh.update(
                timing(validator.validate_epm)(all_configs,
                                               'train+test',
                                               1,
                                               runhistory=validated_rh))
        config_to_cost = {c: epm_rh.get_cost(c) for c in all_configs}

        pcp = ParallelCoordinatesPlotter(config_to_cost, output_dir, cs,
                                         runtime)

        try:
            plots = [
                pcp.plot_n_configs(
                    self.n_configs,
                    self.get_params(self.params, importance, hp_names))
            ]
            self.logger.debug("Paths to plot(s): %s", str(plots))
            return {'figure': plots}
        except ValueError as err:
            self.logger.debug("Error: %s", str(err))
            return {'else': str(err)}
Example #10
    def _optimize(self, f, variables, X_init, Y_init, maxiter, maxeval,
                  iter_callback):
        maxeval = get_maxeval_for_bo(maxeval, maxiter)

        iter_callback(X_init[0], Y_init[0], X_init, Y_init)

        # Get config space
        config_space = self.get_config_space(variables)
        # get scenario, runhistory and stats
        scenario = self.get_scenario(maxeval, config_space)
        runhistory = RunHistory()
        stats = Stats(scenario)
        # for acq function optimizer
        rnd_chooser = ChooserProb(rng=self._get_random_state(), prob=0.0)
        # get class to get valid train data from run history
        rh2epm = self.get_runhistory2epm(scenario)

        # we will add configs to run history by using the following function
        def add_to_runhistory(config, cost):
            runhistory.add(
                config=config,
                cost=cost,
                time=0,
                status=StatusType.SUCCESS
            )

        # create gp and other stuff
        model = self.get_model(config_space)
        acq_fun = self.get_acquisition_function(model)
        acq_fun_opt = self.get_acquisition_function_optimizer(
            config_space,
            acq_fun
        )

        # transform our X_init for valid configurations
        # we create random valid configs and then fill them with our values
        X_init_configs = config_space.sample_configuration(len(X_init))
        for i, x in enumerate(X_init):
            for ind, (var, par) in enumerate(zip(variables, x)):
                if isinstance(variables[ind], ContinuousVariable):
                    par = float(par)
                X_init_configs[i][var.name] = par

        # add our initial design to run history
        for x, y in zip(X_init_configs, Y_init):
            add_to_runhistory(x, y)

        # begin Bayesian optimization
        while self.run_info.result.n_eval < maxeval or \
                (maxiter is not None and
                 self.run_info.result.n_iter < maxiter):
            total_t_start = time.time()

            X, y = rh2epm.transform(runhistory)

            # If all are not finite then we return nothing
            if np.all(~np.isfinite(y)):
                return self.run_info.result

            # Safeguard, just in case...
            if np.any(~np.isfinite(y)):
                y[~np.isfinite(y)] = np.max(y[np.isfinite(y)])

            t_start = time.time()
            model.train(X, y)
            gp_train_time = time.time() - t_start

            t_start = time.time()
            predictions = model.predict_marginalized_over_instances(X)[0]
            best_index = np.argmin(predictions)
            best_observation = y[best_index]
            x_best_array = X[best_index]
            gp_predict_time = time.time() - t_start

            t_start = time.time()
            acq_fun.update(
                model=model,
                eta=best_observation,
                incumbent_array=x_best_array,
                num_data=len(X),
                X=X,
            )
            new_config_iterator = acq_fun_opt.maximize(
                runhistory=runhistory,
                stats=stats,
                num_points=10000,
                random_configuration_chooser=rnd_chooser,
            )
            accept = False
            for next_config in new_config_iterator:
                if next_config in runhistory.get_all_configs():
                    continue
                else:
                    accept = True
                    break
            assert accept
            acq_opt_time = time.time() - t_start

            t_start = time.time()
            x = [next_config[var.name] for var in variables]
            cost = f(x)
            eval_time = time.time() - t_start
            add_to_runhistory(next_config, cost)

            total_iter_time = time.time() - total_t_start
            update_kwargs = {"gp_train_time": gp_train_time,
                             "gp_predict_time": gp_predict_time,
                             "acq_opt_time": acq_opt_time,
                             "eval_time": eval_time,
                             "iter_time": total_iter_time}
            iter_callback(x, cost, [x], [cost], **update_kwargs)

        return self.run_info.result
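Stripped of the timing bookkeeping, each iteration of the loop above follows a fixed protocol against the SMAC components. The sketch below restates just that protocol, assuming the model, acq_fun, acq_fun_opt, rh2epm, runhistory, stats and rnd_chooser objects constructed earlier in the example:

import numpy as np

X, y = rh2epm.transform(runhistory)       # runhistory -> training arrays
model.train(X, y)                         # refit the surrogate

best_index = int(np.argmin(model.predict_marginalized_over_instances(X)[0]))
acq_fun.update(model=model,
               eta=y[best_index],         # incumbent cost
               incumbent_array=X[best_index],
               num_data=len(X),
               X=X)

challengers = acq_fun_opt.maximize(runhistory=runhistory,
                                   stats=stats,
                                   num_points=10000,
                                   random_configuration_chooser=rnd_chooser)
# Take the first candidate not yet in the runhistory (the original guards
# this with an explicit accept flag and an assert).
next_config = next(c for c in challengers
                   if c not in runhistory.get_all_configs())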
Example #11
    def _get_initial_points(
        self,
        num_points: int,
        runhistory: RunHistory,
        additional_start_points: Optional[List[Tuple[float, Configuration]]],
    ) -> List[Configuration]:

        if runhistory.empty():
            init_points = self.config_space.sample_configuration(
                size=num_points)
        else:
            # initiate local search
            configs_previous_runs = runhistory.get_all_configs()

            # configurations with the highest previous EI
            configs_previous_runs_sorted = self._sort_configs_by_acq_value(
                configs_previous_runs)
            configs_previous_runs_sorted = [
                conf[1] for conf in configs_previous_runs_sorted[:num_points]
            ]

            # configurations with the lowest predictive cost, check for None to make unit tests work
            if self.acquisition_function.model is not None:
                conf_array = convert_configurations_to_array(
                    configs_previous_runs)
                costs = self.acquisition_function.model.predict_marginalized_over_instances(
                    conf_array)[0]
                assert len(conf_array) == len(costs), (conf_array.shape,
                                                       costs.shape)

                # In case of the predictive model returning the prediction for more than one objective per configuration
                # (for example multi-objective or EIPS) it is not immediately clear how to sort according to the cost
                # of a configuration. Therefore, we simply follow the ParEGO approach and use a random scalarization.
                if len(costs.shape) == 2 and costs.shape[1] > 1:
                    weights = np.array(
                        [self.rng.rand() for _ in range(costs.shape[1])])
                    weights = weights / np.sum(weights)
                    costs = costs @ weights

                # From here
                # http://stackoverflow.com/questions/20197990/how-to-make-argsort-result-to-be-random-between-equal-values
                random = self.rng.rand(len(costs))
                # Last column is primary sort key!
                indices = np.lexsort((random.flatten(), costs.flatten()))

                # Cannot use zip here because the indices array cannot index the
                # rand_configs list, because the second is a pure python list
                configs_previous_runs_sorted_by_cost = [
                    configs_previous_runs[ind] for ind in indices
                ][:num_points]
            else:
                configs_previous_runs_sorted_by_cost = []

            if additional_start_points is not None:
                additional_start_points = [
                    asp[1] for asp in additional_start_points[:num_points]
                ]
            else:
                additional_start_points = []

            init_points = []
            init_points_as_set = set()  # type: Set[Configuration]
            for cand in itertools.chain(
                    configs_previous_runs_sorted,
                    configs_previous_runs_sorted_by_cost,
                    additional_start_points,
            ):
                if cand not in init_points_as_set:
                    init_points.append(cand)
                    init_points_as_set.add(cand)

        return init_points
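The random scalarization above is the ParEGO trick: when the model predicts several objectives per configuration, a random convex combination collapses the (n_configs, n_objectives) cost matrix into a single column that can then be sorted. In isolation, with plain NumPy:

import numpy as np

rng = np.random.RandomState(0)
costs = rng.rand(5, 3)               # 5 configurations, 3 objectives

weights = rng.rand(costs.shape[1])
weights = weights / np.sum(weights)  # random convex weights
scalarized = costs @ weights         # shape (5,): one cost per configuration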
Example #12
    def _preprocess_budget(
        self,
        original_rh: RunHistory,
        validated_rh: RunHistory,
        validator: Validator,
        scenario: Scenario,
        default: Configuration,
        incumbent: Configuration,
        param_imp: Union[None, Dict[str, float]],
        output_dir: str,
        cs: ConfigurationSpace,
        runtime: bool = False,
    ):
        """
        Preprocess data and save in self.data to enable fast replots

        Parameters:
        -----------
        original_rh: RunHistory
            runhistory that should contain only runs that were executed during search
        validated_rh: RunHistory
            runhistory that may contain as many runs as possible, including external runs;
            this runhistory will be used to build the EPM
        validator: Validator
            validator to be used to estimate costs for configurations
        scenario: Scenario
            scenario object to take instances from
        default, incumbent: Configuration
            default and incumbent configuration; both are always displayed
        param_imp: Union[None, Dict[str, float]]
            if given, maps parameter names to their importance
        output_dir: str
            output directory for plots
        cs: ConfigurationSpace
            parameter configuration space to be visualized
        runtime: boolean
            if True, runtime is plotted on a log scale
        """
        # Sort parameters by importance, if possible (fall back to the first evaluated parameter-importance method)
        method, importance = "", {}
        if self.pc_sort_by == 'all':
            self.logger.debug("Sorting by average importance")
            method = 'average'
            for m, i in param_imp.items():
                if i:
                    for p, imp in i.items():
                        if p in importance:
                            importance[p].append(imp)
                        else:
                            importance[p] = [imp]
            importance = {k: sum(v) / len(v) for k, v in importance.items()}
        elif self.pc_sort_by in param_imp:
            method, importance = self.pc_sort_by, param_imp[self.pc_sort_by]
        else:
            self.logger.debug("%s not evaluated.. choosing at random from: %s",
                              self.pc_sort_by, str(list(param_imp.keys())))
            for m, i in param_imp.items():
                if i:
                    method, importance = m, i
                    self.logger.debug("Chose %s", method)
                    break

        hp_names = sorted([p for p in cs.get_hyperparameter_names()],
                          key=lambda x: importance.get(x, 0),
                          reverse=True)
        self.logger.debug("Sorted hyperparameters by method \'%s\': %s",
                          method, str(hp_names))

        # Define set of configurations (limiting to max and choosing most interesting ones)
        all_configs = original_rh.get_all_configs()
        # self.max_runs_epm is the maximum total number of runs considered for the EPM; it limits the maximum possible number of configs
        max_configs = int(
            self.max_runs_epm /
            (len(scenario.train_insts) + len(scenario.test_insts)))
        if len(all_configs) > max_configs:
            self.logger.debug(
                "Limiting number of configs to train epm from %d to %d (based on max runs %d) and "
                "choosing the ones with the most runs (for parallel coordinates)",
                len(all_configs), max_configs, self.max_runs_epm)
            all_configs = sorted(all_configs,
                                 key=lambda c: len(
                                     original_rh.get_runs_for_config(
                                         c, only_max_observed_budget=False)))
            all_configs = all_configs[:max_configs]
            if default not in all_configs:
                all_configs = [default] + all_configs
            if incumbent not in all_configs:
                all_configs.append(incumbent)

        # Get costs for those configurations
        epm_rh = RunHistory()
        epm_rh.update(validated_rh)
        if scenario.feature_dict:  # if instances are available
            epm_rh.update(
                timing(validator.validate_epm)(all_configs,
                                               'train+test',
                                               1,
                                               runhistory=validated_rh))
        config_to_cost = OrderedDict(
            {c: epm_rh.get_cost(c)
             for c in all_configs})

        data = OrderedDict()
        data['cost'] = list(config_to_cost.values())
        for hp in self.runscontainer.scenario.cs.get_hyperparameter_names():
            data[hp] = np.array([
                c[hp]  # if hp in c.get_dictionary() and not isinstance(c[hp], str) else np.nan
                for c in config_to_cost.keys()
            ])
        df = pd.DataFrame(data=data)
        return df
Example #13
    def _optimize(self, f, variables, X_init, Y_init, maxiter, maxeval,
                  iter_callback):
        maxeval = get_maxeval_for_bo(maxeval, maxiter)

        # Create helper optimizers with the usual SMAC setup so the same functions can be reused
        help_opt = SMACBayesianOptimizer(kernel="Auto")
        help_opt_log = SMACBayesianOptimizer(kernel="Auto",
                                             acquisition_type="logEI")

        kernel_name1, message1 = choose_kernel_if_needed(
            optimizer=help_opt,
            variables=variables,
            X=X_init,
            Y=Y_init,
            kernels=self.kernels_to_choose)
        help_opt.kernel_name = kernel_name1

        kernel_name2, message2 = choose_kernel_if_needed(
            optimizer=help_opt_log,
            variables=variables,
            X=X_init,
            Y=Y_init,
            kernels=self.kernels_to_choose)
        help_opt_log.kernel_name = kernel_name2

        message = f"For usual Y:\n{message1}For log_transformed Y:\n{message2}"

        x_best = X_init[0]
        y_best = Y_init[0]
        iter_callback(x_best, y_best, X_init, Y_init, message=message)

        # Get config space
        config_space = help_opt.get_config_space(variables)
        # get scenario, runhistory and stats
        scenario = help_opt.get_scenario(maxeval, config_space)
        runhistory = RunHistory()
        stats = Stats(scenario)
        # for acq function optimizer
        rnd_chooser = ChooserProb(rng=help_opt._get_random_state(), prob=0.0)

        # get classes to get valid train data from run history
        rh2epm_no_transform = help_opt.get_runhistory2epm(scenario)
        rh2epm_log = help_opt_log.get_runhistory2epm(scenario)
        acq2rh2epm = {"PI": rh2epm_no_transform, "logEI": rh2epm_log}

        # we will add configs to run history by using the following function
        def add_to_runhistory(config, cost):
            runhistory.add(config=config,
                           cost=cost,
                           time=0,
                           status=StatusType.SUCCESS)

        combinations = []
        for model_name, acq_name in [(kernel_name1, "PI"),
                                     (kernel_name2, "logEI")]:
            gp = self._create_gp_model(config_space, model_name)
            acq = self._create_acquisition_function(model=gp,
                                                    acquisition_name=acq_name)
            acq_opt = help_opt.get_acquisition_function_optimizer(
                config_space, acq)
            mark = f"{model_name}_{acq_name}"
            combinations.append((mark, gp, acq, acq_opt, acq2rh2epm[acq_name]))

        # transform our X_init for valid configurations
        # we create random valid configs and then fill them with our values
        X_init_configs = config_space.sample_configuration(len(X_init))
        for i, x in enumerate(X_init):
            for ind, (var, par) in enumerate(zip(variables, x)):
                if isinstance(variables[ind], ContinuousVariable):
                    par = float(par)
                X_init_configs[i][var.name] = par

        # add our initial design to run history
        for x, y in zip(X_init_configs, Y_init):
            add_to_runhistory(x, y)

        # begin Bayesian optimization
        while self.run_info.result.n_eval < maxeval or \
                (maxiter is not None and
                 self.run_info.result.n_iter * 4 < maxiter):
            do_gp_optim = self.do_gp_optimization()
            message = f"GP was optimized: {do_gp_optim}"

            total_t_start = time.time()

            gp_train_time = 0
            gp_predict_time = 0
            acq_opt_time = 0
            eval_time = 0
            iter_configs = []
            iter_X = []
            iter_y = []

            # shuffle our list in-place
            # random.shuffle(combinations)
            kernel_ind = np.random.choice([0, 1])
            mark, gp, acq, acq_opt, rh2epm = combinations[kernel_ind]
            # for mark, gp, acq, acq_opt, rh2epm in combinations:
            X, y = rh2epm.transform(runhistory)

            # If all are not finite then we return nothing
            if np.all(~np.isfinite(y)):
                return self.run_info.result

            # Safeguard, just in case...
            if np.any(~np.isfinite(y)):
                y[~np.isfinite(y)] = np.max(y[np.isfinite(y)])

            t_start = time.time()
            gp.train(X, y, optimize=do_gp_optim)
            gp_train_time += time.time() - t_start

            t_start = time.time()
            # we do not care what model is used here
            predictions = gp.predict(X)[0]
            best_index = np.argmin(predictions)
            best_observation = y[best_index]
            x_best_array = X[best_index]
            gp_predict_time += time.time() - t_start

            t_start = time.time()
            acq.update(
                model=gp.gp_model,
                eta=best_observation,
                incumbent_array=x_best_array,
                num_data=len(X),
                X=X,
            )
            new_config_iterator = acq_opt.maximize(
                runhistory=runhistory,
                stats=stats,
                num_points=10000,
                random_configuration_chooser=rnd_chooser,
            )
            accept = False
            for next_config in new_config_iterator:
                if next_config in runhistory.get_all_configs():
                    continue
                else:
                    accept = True
                    break
            assert accept
            acq_opt_time += time.time() - t_start

            t_eval = time.time()
            x = [next_config[var.name] for var in variables]
            cost = f(x)
            eval_time += time.time() - t_eval

            iter_configs.append((next_config, cost))

            x = WeightedMetaArray(x)
            x.metadata = mark
            iter_X.append(x)
            iter_y.append(cost)
            if cost < y_best:
                x_best = x
                y_best = cost

            for config, cost in iter_configs:
                add_to_runhistory(config, cost)

            total_iter_time = time.time() - total_t_start
            update_kwargs = {
                "gp_train_time": gp_train_time,
                "gp_predict_time": gp_predict_time,
                "acq_opt_time": acq_opt_time,
                "eval_time": eval_time,
                "iter_time": total_iter_time,
                "message": message
            }
            iter_callback(x_best, y_best, iter_X, iter_y, **update_kwargs)

        return self.run_info.result
Example #14
class CAVE(object):
    """
    """
    def __init__(self,
                 folders: typing.List[str],
                 output: str,
                 ta_exec_dir: Union[str, None] = None,
                 missing_data_method: str = 'epm',
                 max_pimp_samples: int = -1,
                 fanova_pairwise=True):
        """
        Initialize CAVE facade to handle analyzing, plotting and building the
        report-page easily. During initialization, the analysis-infrastructure
        is built and the data is validated, meaning the overall best
        incumbent is found and default+incumbent are evaluated for all
        instances for all runs, by default using an EPM.
        The class holds two runhistories:
            self.original_rh -> only contains runs from the actual data
            self.validated_rh -> contains original runs and epm-predictions for
                                 all incumbents
        The analyze()-method performs an analysis and outputs a report.html.

        Parameters
        ----------
        folders: List[str]
            paths to relevant SMAC runs
        output: str
            output directory for CAVE to write results (figures + report)
        ta_exec_dir: str
            execution directory for target algorithm (to find instance.txt, ..)
        missing_data_method: str
            one of [validation, epm]; how to estimate missing runs
        max_pimp_samples: int
            number of samples used for the parameter-importance analysis (PIMP)
        fanova_pairwise: bool
            whether to compute pairwise marginals in the fANOVA analysis
        """
        self.logger = logging.getLogger("cave.cavefacade")
        self.logger.debug("Folders: %s", str(folders))
        self.ta_exec_dir = ta_exec_dir

        # Create output if necessary
        self.output = output
        self.logger.info("Saving results to %s", self.output)
        if not os.path.exists(output):
            self.logger.debug("Output-dir %s does not exist, creating",
                              self.output)
            os.makedirs(output)
        if not os.path.exists(os.path.join(self.output, "debug")):
            os.makedirs(os.path.join(self.output, "debug"))
        # Log to file
        logger = logging.getLogger()
        handler = logging.FileHandler(
            os.path.join(self.output, "debug", "debug.log"), "w")
        handler.setLevel(logging.DEBUG)
        logger.addHandler(handler)

        # Global runhistory combines all actual runs of individual SMAC-runs
        # We save the combined (unvalidated) runhistory to disk, so we can use it later on.
        # We keep the validated runhistory (with as many runs as possible) in
        # memory. The distinction is made to avoid using runs that are
        # only estimated using an EPM for further EPMs or to handle runs
        # validated on different hardware (depending on validation-method).
        self.original_rh = RunHistory(average_cost)
        self.validated_rh = RunHistory(average_cost)

        # Save all relevant SMAC-runs in a list
        self.runs = []
        for folder in folders:
            try:
                self.logger.debug("Collecting data from %s.", folder)
                self.runs.append(SMACrun(folder, ta_exec_dir))
            except Exception as err:
                self.logger.warning(
                    "Folder %s could not be loaded, failed "
                    "with error message: %s", folder, err)
                continue
        if not len(self.runs):
            raise ValueError(
                "None of the specified SMAC-folders could be loaded.")

        # Use scenario of first run for general purposes (expecting they are all the same anyway!)
        self.scenario = self.runs[0].solver.scenario

        # Update global runhistory with all available runhistories
        self.logger.debug("Update original rh with all available rhs!")
        runhistory_fns = [
            os.path.join(run.folder, "runhistory.json") for run in self.runs
        ]
        for rh_file in runhistory_fns:
            self.original_rh.update_from_json(rh_file, self.scenario.cs)
        self.logger.debug(
            'Combined number of Runhistory data points: %d. '
            '# Configurations: %d. # Runhistories: %d',
            len(self.original_rh.data),
            len(self.original_rh.get_all_configs()), len(runhistory_fns))
        self.original_rh.save_json(
            os.path.join(self.output, "combined_rh.json"))

        # Validator for a) validating with epm, b) plot over time
        # Initialize without trajectory
        self.validator = Validator(self.scenario, None, None)

        # Estimate missing costs for [def, inc1, inc2, ...]
        self.complete_data(method=missing_data_method)
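        # The best run is the one whose final incumbent has the lowest cost
        # according to the validated runhistory.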
        self.best_run = min(
            self.runs,
            key=lambda run: self.validated_rh.get_cost(run.solver.incumbent))

        self.default = self.scenario.cs.get_default_configuration()
        self.incumbent = self.best_run.solver.incumbent

        self.logger.debug("Overall best run: %s, with incumbent: %s",
                          self.best_run.folder, self.incumbent)

        # Following variable determines whether a distinction is made
        # between train and test-instances (e.g. in plotting)
        self.train_test = bool(self.scenario.train_insts != [None]
                               and self.scenario.test_insts != [None])

        self.analyzer = Analyzer(self.original_rh, self.validated_rh,
                                 self.default, self.incumbent, self.train_test,
                                 self.scenario, self.validator, self.output,
                                 max_pimp_samples, fanova_pairwise)

        # Builder for html-website
        self.builder = HTMLBuilder(self.output, "CAVE")
        self.website = OrderedDict([])

    def complete_data(self, method="epm"):
        """Complete missing data of runs to be analyzed. Either using validation
        or EPM.
        """
        with changedir(self.ta_exec_dir if self.ta_exec_dir else '.'):
            self.logger.info("Completing data using %s.", method)

            path_for_validated_rhs = os.path.join(self.output, "validated_rhs")
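            # 'validation' re-runs the target algorithm on the missing
            # instances, whereas 'epm' only predicts the missing costs with an
            # empirical performance model.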
            for run in self.runs:
                self.validator.traj = run.traj
                if method == "validation":
                    # TODO determine # repetitions
                    new_rh = self.validator.validate(
                        'def+inc',
                        'train+test',
                        1,
                        -1,
                        runhistory=self.original_rh)
                elif method == "epm":
                    new_rh = self.validator.validate_epm(
                        'def+inc',
                        'train+test',
                        1,
                        runhistory=self.original_rh)
                else:
                    raise ValueError("Missing data method illegal (%s)" %
                                     method)
                self.validator.traj = None  # Avoid usage-mistakes
                self.validated_rh.update(new_rh)

    def analyze(self,
                performance=True,
                cdf=True,
                scatter=True,
                confviz=True,
                param_importance=['forward_selection', 'ablation', 'fanova'],
                feature_analysis=[
                    "box_violin", "correlation", "importance",
                    "clustering", "feature_cdf"
                ],
                parallel_coordinates=True,
                cost_over_time=True,
                algo_footprint=True):
        """Analyze the available data and build HTML-webpage as dict.
        Save webpage in 'self.output/CAVE/report.html'.
        Analyzing is performed with the analyzer-instance that is initialized in
        the __init__

        Parameters
        ----------
        performance: bool
            whether to calculate par10-values
        cdf: bool
            whether to plot cdf
        scatter: bool
            whether to plot scatter
        confviz: bool
            whether to perform configuration visualization
        param_importance: List[str]
            containing methods for parameter importance
        feature_analysis: List[str]
            containing methods for feature analysis
        parallel_coordinates: bool
            whether to plot parallel coordinates
        cost_over_time: bool
            whether to plot cost over time
        algo_footprint: bool
            whether to plot algorithm footprints
        """

        # Check arguments
        for p in param_importance:
            if p not in [
                    'forward_selection', 'ablation', 'fanova', 'incneighbor'
            ]:
                raise ValueError(
                    "%s not a valid option for parameter "
                    "importance!" % p)
        for f in feature_analysis:
            if f not in [
                    "box_violin", "correlation", "importance", "clustering",
                    "feature_cdf"
            ]:
                raise ValueError(
                    "%s not a valid option for feature analysis!" % f)

        # Start analysis
        overview = self.analyzer.create_overview_table(self.best_run.folder)
        self.website["Meta Data"] = {"table": overview}

        compare_config = self.analyzer.config_to_html(self.default,
                                                      self.incumbent)
        self.website["Best configuration"] = {"table": compare_config}

        ########## PERFORMANCE ANALYSIS
        self.website["Performance Analysis"] = OrderedDict()

        if performance:
            performance_table = self.analyzer.create_performance_table(
                self.default, self.incumbent)
            self.website["Performance Analysis"]["Performance Table"] = {
                "table": performance_table
            }

        if cdf:
            cdf_path = self.analyzer.plot_cdf()
            self.website["Performance Analysis"][
                "empirical Cumulative Distribution Function (eCDF)"] = {
                    "figure": cdf_path
                }

        if scatter and (self.scenario.train_insts != [None]):
            scatter_path = self.analyzer.plot_scatter()
            self.website["Performance Analysis"]["Scatterplot"] = {
                "figure": scatter_path
            }
        elif scatter:
            self.logger.info(
                "Scatter plot desired, but no instances available.")

        # Build report before time-consuming analysis
        self.build_website()

        if algo_footprint and self.scenario.feature_dict:
            algorithms = {self.default: "default", self.incumbent: "incumbent"}
            # Add all available incumbents to test portfolio strategy
            #for r in self.runs:
            #    if not r.get_incumbent() in algorithms:
            #        algorithms[r.get_incumbent()] = str(self.runs.index(r))

            algo_footprint_plots = self.analyzer.plot_algorithm_footprint(
                algorithms)
            self.website["Performance Analysis"][
                "Algorithm Footprints"] = OrderedDict()
            for p in algo_footprint_plots:
                header = os.path.splitext(os.path.split(p)[1])[0]  # algo name
                self.website["Performance Analysis"]["Algorithm Footprints"][
                    header] = {
                        "figure": p,
                        "tooltip":
                        get_tooltip("Algorithm Footprints") + ": " + header
                    }

        self.build_website()

        ########### Configurator's behavior
        self.website["Configurator's behavior"] = OrderedDict()

        if confviz and self.scenario.feature_dict:
            if self.scenario.feature_array is None:
                self.scenario.feature_array = np.array([[]])
            # Sort runhistories and incs wrt cost
            incumbents = [r.solver.incumbent for r in self.runs]
            trajectories = [r.traj for r in self.runs]
            runhistories = [r.runhistory for r in self.runs]
            costs = [self.validated_rh.get_cost(i) for i in incumbents]
            costs, incumbents, runhistories, trajectories = (
                list(t) for t in zip(
                    *sorted(zip(costs, incumbents, runhistories, trajectories),
                            key=lambda x: x[0])))
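            # After sorting, trajectories[0] belongs to the run with the
            # lowest validated incumbent cost; its incumbent-trace is what
            # gets plotted in the configurator footprint.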
            incumbents = list(map(lambda x: x['incumbent'], trajectories[0]))

            confviz_script = self.analyzer.plot_confviz(
                incumbents, runhistories)
            self.website["Configurator's behavior"][
                "Configurator Footprint"] = {
                    "table": confviz_script
                }
        elif confviz:
            self.logger.info("Configuration visualization desired, but no "
                             "instance-features available.")

        self.build_website()

        if cost_over_time:
            cost_over_time_path = self.analyzer.plot_cost_over_time(
                self.best_run.traj, self.validator)
            self.website["Configurator's behavior"]["Cost over time"] = {
                "figure": cost_over_time_path
            }

        self.build_website()

        self.parameter_importance(ablation='ablation' in param_importance,
                                  fanova='fanova' in param_importance,
                                  forward_selection='forward_selection'
                                  in param_importance,
                                  incneighbor='incneighbor'
                                  in param_importance)

        self.build_website()

        if parallel_coordinates:
            # Should be after parameter importance, if performed.
            n_params = 6
            parallel_path = self.analyzer.plot_parallel_coordinates(n_params)
            self.website["Configurator's behavior"]["Parallel Coordinates"] = {
                "figure": parallel_path
            }

        self.build_website()

        if self.scenario.feature_dict:
            self.feature_analysis(box_violin='box_violin' in feature_analysis,
                                  correlation='correlation'
                                  in feature_analysis,
                                  clustering='clustering' in feature_analysis,
                                  importance='importance' in feature_analysis)
        else:
            self.logger.info('No feature analysis possible')

        self.logger.info("CAVE finished. Report is located in %s",
                         os.path.join(self.output, 'report.html'))

        self.build_website()

    def parameter_importance(self,
                             ablation=False,
                             fanova=False,
                             forward_selection=False,
                             incneighbor=False):
        """Perform the specified parameter importance procedures. """
        # PARAMETER IMPORTANCE
        if (ablation or forward_selection or fanova or incneighbor):
            self.website["Parameter Importance"] = OrderedDict()
        sum_ = 0
        if fanova:
            sum_ += 1
            table, plots, pair_plots = self.analyzer.fanova(self.incumbent)

            self.website["Parameter Importance"]["fANOVA"] = OrderedDict()

            self.website["Parameter Importance"]["fANOVA"]["Importance"] = {
                "table": table
            }
            # Insert plots (the received plots is a dict, mapping param -> path)
            self.website["Parameter Importance"]["fANOVA"][
                "Marginals"] = OrderedDict([])
            for param, plot in plots.items():
                self.website["Parameter Importance"]["fANOVA"]["Marginals"][
                    param] = {
                        "figure": plot
                    }
            if pair_plots:
                self.website["Parameter Importance"]["fANOVA"][
                    "PairwiseMarginals"] = OrderedDict([])
                for param, plot in pair_plots.items():
                    self.website["Parameter Importance"]["fANOVA"][
                        "PairwiseMarginals"][param] = {
                            "figure": plot
                        }

        if ablation:
            sum_ += 1
            self.logger.info("Ablation...")
            self.analyzer.parameter_importance("ablation", self.incumbent,
                                               self.output)
            ablationpercentage_path = os.path.join(self.output,
                                                   "ablationpercentage.png")
            ablationperformance_path = os.path.join(self.output,
                                                    "ablationperformance.png")
            self.website["Parameter Importance"]["Ablation"] = {
                "figure": [ablationpercentage_path, ablationperformance_path]
            }

        if forward_selection:
            sum_ += 1
            self.logger.info("Forward Selection...")
            self.analyzer.parameter_importance("forward-selection",
                                               self.incumbent, self.output)
            f_s_barplot_path = os.path.join(self.output,
                                            "forward selection-barplot.png")
            f_s_chng_path = os.path.join(self.output,
                                         "forward selection-chng.png")
            self.website["Parameter Importance"]["Forward Selection"] = {
                "figure": [f_s_barplot_path, f_s_chng_path]
            }

        if incneighbor:
            sum_ += 1
            self.logger.info("Local EPM-predictions around incumbent...")
            plots = self.analyzer.local_epm_plots()
            self.website["Parameter Importance"][
                "Local Parameter Importance (LPI)"] = OrderedDict([])
            for param, plot in plots.items():
                self.website["Parameter Importance"][
                    "Local Parameter Importance (LPI)"][param] = {
                        "figure": plot
                    }

        if sum_:
            of = os.path.join(self.output, 'pimp.tex')
            self.logger.info('Creating pimp latex table at %s', of)
            self.analyzer.pimp.table_for_comparison(self.analyzer.evaluators,
                                                    of,
                                                    style='latex')

    def feature_analysis(self,
                         box_violin=False,
                         correlation=False,
                         clustering=False,
                         importance=False):
        if not (box_violin or correlation or clustering or importance):
            self.logger.debug("No feature analysis.")
            return

        # FEATURE ANALYSIS (ASAPY)
        # TODO make the following line prettier
        # TODO feat-names from scenario?
        in_reader = InputReader()
        feat_fn = self.scenario.feature_fn

        if not self.scenario.feature_names:
            with changedir(self.ta_exec_dir if self.ta_exec_dir else '.'):
                if not feat_fn or not os.path.exists(feat_fn):
                    self.logger.warning(
                        "Feature Analysis needs a valid feature "
                        "file! Either %s is not a valid "
                        "filename or features are not saved in "
                        "the scenario.", feat_fn)
                    self.logger.error("Skipping Feature Analysis.")
                    return
                else:
                    feat_names = in_reader.read_instance_features_file(
                        self.scenario.feature_fn)[0]
        else:
            feat_names = copy.deepcopy(self.scenario.feature_names)

        self.website["Feature Analysis"] = OrderedDict([])

        # feature importance using forward selection
        if importance:
            self.website["Feature Analysis"][
                "Feature Importance"] = OrderedDict()
            imp, plots = self.analyzer.feature_importance()
            imp = DataFrame(data=list(imp.values()),
                            index=list(imp.keys()),
                            columns=["Error"])
            imp = imp.to_html()  # this is a table with the values in html
            self.website["Feature Analysis"]["Feature Importance"]["Table"] = {
                "table": imp
            }
            for p in plots:
                name = os.path.splitext(os.path.basename(p))[0]
                self.website["Feature Analysis"]["Feature Importance"][
                    name] = {
                        "figure": p
                    }

        # box and violin plots
        if box_violin:
            name_plots = self.analyzer.feature_analysis(
                'box_violin', feat_names)
            self.website["Feature Analysis"][
                "Violin and Box Plots"] = OrderedDict()
            for plot_tuple in name_plots:
                key = "%s" % (plot_tuple[0])
                self.website["Feature Analysis"]["Violin and Box Plots"][
                    key] = {
                        "figure": plot_tuple[1]
                    }

        # correlation plot
        if correlation:
            correlation_plot = self.analyzer.feature_analysis(
                'correlation', feat_names)
            if correlation_plot:
                self.website["Feature Analysis"]["Correlation"] = {
                    "figure": correlation_plot
                }

        # cluster instances in feature space
        if clustering:
            cluster_plot = self.analyzer.feature_analysis(
                'clustering', feat_names)
            self.website["Feature Analysis"]["Clustering"] = {
                "figure": cluster_plot
            }

        self.build_website()

    def build_website(self):
        self.builder.generate_html(self.website)
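
For reference, a hedged usage sketch of the facade above; the folder names and
the output path are placeholders and not part of the original example:

# Hypothetical usage of the CAVE facade defined above. Folder names and the
# output directory are placeholders.
cave = CAVE(folders=["smac3-output/run_1", "smac3-output/run_2"],
            output="cave-report",
            ta_exec_dir=".",
            missing_data_method="epm")

# Build a reduced report: skip parameter importance and feature analysis by
# passing empty lists, and disable the more expensive plots.
cave.analyze(performance=True,
             cdf=True,
             scatter=False,
             confviz=False,
             param_importance=[],
             feature_analysis=[],
             parallel_coordinates=True,
             cost_over_time=True,
             algo_footprint=False)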