Example #1
    def test_psmac(self):
        with joblib.parallel_backend('multiprocessing', n_jobs=1):
            optimizer = PSMAC(self.scenario, n_optimizers=3, n_incs=2, validate=False)
            incs = optimizer.optimize()
            self.assertEqual(len(incs), 2)
            optimizer = PSMAC(self.scenario, n_optimizers=1, n_incs=4, validate=False)
            incs = optimizer.optimize()
            self.assertEqual(len(incs), 2)
            optimizer = PSMAC(self.scenario, n_optimizers=5, n_incs=4, validate=False)
            incs = optimizer.optimize()
            self.assertEqual(len(incs), 4)
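The scenario fixture used by this test is not shown. Below is a minimal sketch of a setup it might assume: the quadratic target, the budget numbers, and the PSMAC import path are assumptions, and whether a plain Python callable is accepted as `tae` depends on the SMAC version (the clustering example further below passes one).

from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter
from smac.scenario.scenario import Scenario
from smac.facade.psmac_facade import PSMAC  # assumed module path

def quadratic(cfg):
    # Toy target function with its minimum at x = 0.3
    return (cfg["x"] - 0.3) ** 2

cs = ConfigurationSpace()
cs.add_hyperparameter(UniformFloatHyperparameter("x", 0.0, 1.0))
scenario = Scenario({
    "run_obj": "quality",     # optimize solution quality rather than runtime
    "runcount-limit": 16,     # small budget, enough for a smoke test
    "cs": cs,
    "deterministic": True,
})

optimizer = PSMAC(scenario, tae=quadratic, n_optimizers=2, n_incs=2, validate=False)
incumbents = optimizer.optimize()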
Example #2
class Hydra(object):
    """
    Facade to use Hydra default mode

    Attributes
    ----------
    logger
    stats : Stats
        logs information about used resources
    solver : SMBO
        handles the actual algorithm calls
    rh : RunHistory
        Stores information about previous runs
    portfolio : list
        List of all incumbents

    """
    def __init__(self,
                 scenario: Scenario,
                 n_iterations: int,
                 val_set: str = 'train',
                 incs_per_round: int = 1,
                 n_optimizers: int = 1,
                 rng: typing.Optional[typing.Union[np.random.RandomState,
                                                   int]] = None,
                 run_id: int = 1,
                 tae: typing.Type[ExecuteTARun] = ExecuteTARunOld,
                 tae_kwargs: typing.Union[dict, None] = None,
                 **kwargs):
        """
        Constructor

        Parameters
        ----------
        scenario : ~smac.scenario.scenario.Scenario
            Scenario object
        n_iterations: int
            number of Hydra iterations
        val_set: str
            Set to validate incumbent(s) on. [train, valX].
            train => whole training set,
            valX => X% of the training set, where X in (0, 100)
        incs_per_round: int
            Number of incumbents to keep per round
        n_optimizers: int
            Number of optimizers to run in parallel per round
        rng: int/np.random.RandomState
            The random state/seed to pass to each SMAC run
        run_id: int
            run_id for this hydra run
        tae: ExecuteTARun
            Target Algorithm Runner (supports old and aclib format as well as AbstractTAFunc)
        tae_kwargs: Optional[dict]
            arguments passed to constructor of '~tae'

        """
        self.logger = logging.getLogger(self.__module__ + "." +
                                        self.__class__.__name__)

        self.n_iterations = n_iterations
        self.scenario = scenario
        self.run_id, self.rng = get_rng(rng, run_id, self.logger)
        self.kwargs = kwargs
        self.output_dir = None
        self.top_dir = None
        self.solver = None
        self.portfolio = None
        self.rh = RunHistory(average_cost, file_system=scenario.file_system)
        self._tae = tae
        self._tae_kwargs = tae_kwargs
        if incs_per_round <= 0:
            self.logger.warning('Invalid value in %s: %d. Setting to 1',
                                'incs_per_round', incs_per_round)
        self.incs_per_round = max(incs_per_round, 1)
        if n_optimizers <= 0:
            self.logger.warning('Invalid value in %s: %d. Setting to 1',
                                'n_optimizers', n_optimizers)
        self.n_optimizers = max(n_optimizers, 1)
        self.val_set = self._get_validation_set(val_set)
        self.cost_per_inst = {}
        self.optimizer = None
        self.portfolio_cost = None

    def _get_validation_set(self,
                            val_set: str,
                            delete: bool = True) -> typing.List[str]:
        """
        Create small validation set for hydra to determine incumbent performance

        Parameters
        ----------
        val_set: str
            Set to validate incumbent(s) on. [train, valX, none].
            none => no validation set,
            train => whole training set,
            valX => X% of the training set, where X in (0, 100)
        delete: bool
            Flag to delete all validation instances from the training set

        Returns
        -------
        val: typing.List[str]
            List of instance-ids to validate on

        """
        if val_set == 'none':
            return None
        if val_set == 'train':
            return self.scenario.train_insts
        elif val_set[:3] != 'val':
            self.logger.warning(
                'Cannot determine validation set size. Using full training set!'
            )
            return self.scenario.train_insts
        else:
            size = int(val_set[3:]) / 100
            if size <= 0 or size >= 1:
                raise ValueError(
                    'X invalid in valX, should be between 0 and 100')
            insts = np.array(self.scenario.train_insts)
            # ensure this also works with small example sets: keep at least 3 validation instances
            size = max(np.floor(insts.shape[0] * size).astype(int), 3)
            ids = np.random.choice(insts.shape[0], size, replace=False)
            val = insts[ids].tolist()
            if delete:
                self.scenario.train_insts = np.delete(insts, ids).tolist()
            return val
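        # Illustration (assumed numbers): with val_set='val20' and 100 training
        # instances, size = 20 / 100 = 0.2, so max(floor(100 * 0.2), 3) = 20
        # instances are drawn for validation and, if delete=True, removed from
        # scenario.train_insts.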

    def optimize(self) -> typing.List[Configuration]:
        """
        Optimizes the algorithm provided in scenario (given in constructor)

        Returns
        -------
        portfolio : typing.List[Configuration]
            Portfolio of found configurations

        """
        # Setup output directory
        self.portfolio = []
        portfolio_cost = np.inf
        if self.output_dir is None:
            self.top_dir = "hydra-output_%s" % (
                datetime.datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d_%H:%M:%S_%f'))
            self.scenario.output_dir = os.path.join(
                self.top_dir,
                "psmac3-output_%s" % (datetime.datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d_%H:%M:%S_%f')))
            self.output_dir = create_output_directory(self.scenario,
                                                      run_id=self.run_id,
                                                      logger=self.logger)

        scen = copy.deepcopy(self.scenario)
        scen.output_dir_for_this_run = None
        scen.output_dir = None
        # parent process SMAC only used for validation purposes
        self.solver = SMAC4AC(scenario=scen,
                              tae_runner=self._tae,
                              rng=self.rng,
                              run_id=self.run_id,
                              **self.kwargs)
        for i in range(self.n_iterations):
            self.logger.info("=" * 120)
            self.logger.info("Hydra Iteration: %d", (i + 1))

            if i == 0:
                tae = self._tae
                tae_kwargs = self._tae_kwargs
            else:
                tae = ExecuteTARunHydra
                if self._tae_kwargs:
                    tae_kwargs = self._tae_kwargs
                else:
                    tae_kwargs = {}
                tae_kwargs['cost_oracle'] = self.cost_per_inst
            self.optimizer = PSMAC(
                scenario=self.scenario,
                run_id=self.run_id,
                rng=self.rng,
                tae=tae,
                tae_kwargs=tae_kwargs,
                shared_model=False,
                validate=True if self.val_set else False,
                n_optimizers=self.n_optimizers,
                val_set=self.val_set,
                n_incs=self.n_optimizers,  # return all configurations (unvalidated)
                **self.kwargs)
            self.optimizer.output_dir = self.output_dir
            incs = self.optimizer.optimize()
            cost_per_conf_v, val_ids, cost_per_conf_e, est_ids = self.optimizer.get_best_incumbents_ids(
                incs)
            if self.val_set:
                to_keep_ids = val_ids[:self.incs_per_round]
            else:
                to_keep_ids = est_ids[:self.incs_per_round]
            config_cost_per_inst = {}
            incs = incs[to_keep_ids]
            self.logger.info('Kept incumbents')
            for inc in incs:
                self.logger.info(inc)
                config_cost_per_inst[inc] = cost_per_conf_v[
                    inc] if self.val_set else cost_per_conf_e[inc]

            cur_portfolio_cost = self._update_portfolio(
                incs, config_cost_per_inst)
            if portfolio_cost <= cur_portfolio_cost:
                self.logger.info(
                    "No further progress (%f) --- terminate hydra",
                    portfolio_cost)
                break
            else:
                portfolio_cost = cur_portfolio_cost
                self.logger.info("Current pertfolio cost: %f", portfolio_cost)

            self.scenario.output_dir = os.path.join(
                self.top_dir,
                "psmac3-output_%s" % (datetime.datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d_%H:%M:%S_%f')))
            self.output_dir = create_output_directory(self.scenario,
                                                      run_id=self.run_id,
                                                      logger=self.logger)
        read(self.rh,
             os.path.join(self.top_dir, 'psmac3*', 'run_' + str(MAXINT)),
             self.scenario.cs, self.logger)
        self.rh.save_json(fn=os.path.join(
            self.top_dir, 'all_validated_runs_runhistory.json'),
                          save_external=True)
        with open(os.path.join(self.top_dir, 'portfolio.pkl'), 'wb') as fh:
            pickle.dump(self.portfolio, fh)
        self.logger.info("~" * 120)
        self.logger.info('Resulting Portfolio:')
        for configuration in self.portfolio:
            self.logger.info(str(configuration))
        self.logger.info("~" * 120)

        return self.portfolio

    def _update_portfolio(
            self, incs: np.ndarray, config_cost_per_inst: typing.Dict
    ) -> typing.Union[np.float, float]:
        """
        Validates all configurations (in incs) and determines which ones to add to the portfolio

        Parameters
        ----------
        incs: np.ndarray
            List of Configurations
        config_cost_per_inst: typing.Dict
            Mapping from each kept configuration to its per-instance cost dictionary

        Returns
        -------
        cur_cost: typing.Union[np.float, float]
            The current cost of the portfolio

        """
        if self.val_set:  # we have validated data
            for kept in incs:
                if kept not in self.portfolio:
                    self.portfolio.append(kept)
                    cost_per_inst = config_cost_per_inst[kept]
                    if self.cost_per_inst:
                        if len(self.cost_per_inst) != len(cost_per_inst):
                            raise ValueError(
                                'Num validated Instances mismatch!')
                        else:
                            for key in cost_per_inst:
                                self.cost_per_inst[key] = min(
                                    self.cost_per_inst[key],
                                    cost_per_inst[key])
                    else:
                        self.cost_per_inst = cost_per_inst
            cur_cost = np.mean(list(
                self.cost_per_inst.values()))  # type: np.float
        else:  # No validated data. Set the mean to the approximated mean
            means = []  # may contain NaNs, since not every instance was evaluated; use nanmean to approximate
            for kept in incs:
                means.append(
                    np.nanmean(
                        list(
                            self.optimizer.rh.get_instance_costs_for_config(
                                kept).values())))
                self.portfolio.append(kept)
            if self.portfolio_cost:
                new_mean = self.portfolio_cost * (
                    len(self.portfolio) - len(incs)) / len(self.portfolio)
                new_mean += np.nansum(means)
            else:
                new_mean = np.mean(means)
            self.cost_per_inst = defaultdict(lambda: new_mean)
            cur_cost = new_mean

        self.portfolio_cost = cur_cost
        return cur_cost
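For reference, the facade above is typically driven in just a few lines. A minimal sketch, assuming a `scenario` whose target algorithm and instances are already configured, and assuming the class lives in SMAC's hydra facade module:

from smac.facade.hydra_facade import Hydra  # assumed module path

hydra = Hydra(scenario,          # scenario must define the target algorithm and instances
              n_iterations=3,    # Hydra rounds
              incs_per_round=1,  # configurations added to the portfolio per round
              n_optimizers=2,    # parallel SMAC runs per round
              val_set='train',
              run_id=1)
portfolio = hydra.optimize()     # list of Configuration objects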
Example #3
    def fit(self, dataset: BinaryDataset,
            clustering_algs: List[str],
            feature_selection_algs: List[str],
            n_evaluations: int = 30,
            cutoff_time=20,
            evaluator: Callable = Measures.silhouette,
            experiments_dir: str = "../../experiments",
            n_optimizers=2,
            top_algorithms_count=5) -> List[str]:

        cs = build_config_space(clustering_ls=clustering_algs, feature_selection_ls=feature_selection_algs)

        config_name: str = "mixed"
        if len(clustering_algs) == 1 and len(feature_selection_algs) == 1:
            config_name: str = f"{feature_selection_algs[0]}_{clustering_algs[0]}"
        base_dir_name = _create_smac_directory(experiments_dir, evaluator.__name__, config_name)

        scenario_params: dict = {
            "run_obj": "quality",
            "runcount-limit": n_evaluations,
            "cutoff_time": cutoff_time,
            "cs": cs,
            "deterministic": "false",
            "output_dir": base_dir_name,
            "abort_on_first_run_crash": False,
            "shared_model": True,
            "input_psmac_dirs": _create_psmac_dirs(base_dir_name, n_optimizers)
        }
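        # With shared_model=True, the parallel SMAC workers exchange run histories
        # through the directories matched by input_psmac_dirs (pSMAC-style sharing).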

        scenario = Scenario(scenario_params)
        dataset_content = dataset.load_dataset()

        def fit_models(cfg: dict, data: np.ndarray):
            feature_selection_alg = Mapper.get_class(cfg["feature_selection_choice"])

            cfg_feature_selection: dict = {
                decode_parameter(k, feature_selection_alg.name): v for k, v in cfg.items()
                if decode_parameter(k, feature_selection_alg.name) is not None
            }

            feature_selection_model = feature_selection_alg.model(**cfg_feature_selection)
            selected_data: np.ndarray = feature_selection_model.fit_transform(data)

            clustering_alg = Mapper.get_class(cfg["clustering_choice"])
            cfg_clustering: dict = {
                decode_parameter(k, clustering_alg.name): v for k, v in cfg.items()
                if decode_parameter(k, clustering_alg.name) is not None
            }

            clustering_model = clustering_alg.model(**cfg_clustering)
            clustering_result = clustering_model.fit_predict(selected_data)

            return feature_selection_model, clustering_model, clustering_result

        def cfg_to_dict(cfg):
            cfg = {k: cfg[k] for k in cfg if cfg[k]}
            return {k: v for k, v in cfg.items() if v is not None}

        def evaluate_model(cfg):
            cfg_dict = cfg_to_dict(cfg)
            _, _, y_pred = fit_models(cfg_dict, dataset_content)
            if len(np.unique(y_pred)) < 2:
                return np.inf
            else:
                return evaluator(dataset_content, y_pred)

        optimal_config = None
        smac = SMAC(
            scenario=scenario,
            rng=np.random.RandomState(42),
            tae=evaluate_model,
            n_optimizers=n_optimizers,
            validate=False,
            n_incs=top_algorithms_count
        )

        # For some reason, SMAC sets the output dir to None in its constructor, so we inject it manually
        smac.output_dir = base_dir_name
        scenario.output_dir = base_dir_name

        self._smac = smac
        optimal_configs: List[Configuration] = self._smac.optimize()
        best_algorithms: List[str] = []
        for i, optimal_config in enumerate(optimal_configs):
            dict_config = cfg_to_dict(optimal_config)
            feature_selection_model, clustering_model, clustering_result = \
                fit_models(dict_config, dataset_content)

            if len(np.unique(clustering_result)) < 2:
                measure_value = np.inf
            else:
                measure_value = evaluator(dataset_content, clustering_result)

            result = {
                "optimal_config": dict_config,
                "smac": self._smac,
                "feature_selection_model": feature_selection_model,
                "clustering_model": clustering_model,
                "clustering_result": clustering_result,
                "measure_value": measure_value
            }
            _save_clustering_result(result, f"{base_dir_name}", name=f"config_{i}")
            best_algorithms.append(_encode(dict_config["clustering_choice"], dict_config["feature_selection_choice"]))
        _save_best_algorithms(experiments_dir, best_algorithms, clustering_algs, feature_selection_algs)
        return best_algorithms
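The directory helpers used above (`_create_smac_directory`, `_create_psmac_dirs`) are not shown. A minimal sketch of what `_create_psmac_dirs` might do, under the assumption that pSMAC only needs one existing directory per parallel optimizer; the layout and names here are hypothetical, not the project's actual implementation.

import os
from typing import List

def _create_psmac_dirs(base_dir: str, n_optimizers: int) -> List[str]:
    """Hypothetical helper: create one sub-directory per parallel SMAC run."""
    dirs = []
    for i in range(n_optimizers):
        run_dir = os.path.join(base_dir, f"psmac_run_{i}")
        os.makedirs(run_dir, exist_ok=True)  # directories must exist before SMAC reads them
        dirs.append(run_dir)
    return dirs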
Example #4
    def main_cli(
        self,
        commandline_arguments: typing.Optional[typing.List[str]] = None
    ) -> None:
        """Main function of SMAC for CLI interface"""
        self.logger.info("SMAC call: %s" % (" ".join(sys.argv)))

        cmd_reader = CMDReader()
        kwargs = {}
        if commandline_arguments:
            kwargs['commandline_arguments'] = commandline_arguments
        main_args_, smac_args_, scen_args_ = cmd_reader.read_cmd(**kwargs)

        root_logger = logging.getLogger()
        root_logger.setLevel(main_args_.verbose_level)
        logger_handler = logging.StreamHandler(stream=sys.stdout)
        if root_logger.level >= logging.INFO:
            formatter = logging.Formatter("%(levelname)s:\t%(message)s")
        else:
            formatter = logging.Formatter(
                "%(asctime)s:%(levelname)s:%(name)s:\t%(message)s",
                "%Y-%m-%d %H:%M:%S")
        logger_handler.setFormatter(formatter)
        root_logger.addHandler(logger_handler)
        # remove default handler
        if len(root_logger.handlers) > 1:
            root_logger.removeHandler(root_logger.handlers[0])

        # Create defaults
        rh = None
        initial_configs = None
        stats = None
        incumbent = None

        # Create scenario-object
        scenario = {}
        scenario.update(vars(smac_args_))
        scenario.update(vars(scen_args_))
        scen = Scenario(scenario=scenario)

        # Restore state
        if main_args_.restore_state:
            root_logger.debug("Restoring state from %s...",
                              main_args_.restore_state)
            restore_state = main_args_.restore_state
            rh, stats, traj_list_aclib, traj_list_old = self.restore_state(
                scen, restore_state)

            scen.output_dir_for_this_run = create_output_directory(
                scen,
                main_args_.seed,
                root_logger,
            )
            scen.write()
            incumbent = self.restore_state_after_output_dir(
                scen, stats, traj_list_aclib, traj_list_old)

        if main_args_.warmstart_runhistory:
            rh = RunHistory()

            scen, rh = merge_foreign_data_from_file(
                scenario=scen,
                runhistory=rh,
                in_scenario_fn_list=main_args_.warmstart_scenario,
                in_runhistory_fn_list=main_args_.warmstart_runhistory,
                cs=scen.cs,  # type: ignore[attr-defined] # noqa F821
            )

        if main_args_.warmstart_incumbent:
            initial_configs = [scen.cs.get_default_configuration()]  # type: ignore[attr-defined] # noqa F821
            for traj_fn in main_args_.warmstart_incumbent:
                trajectory = TrajLogger.read_traj_aclib_format(
                    fn=traj_fn,
                    cs=scen.cs,  # type: ignore[attr-defined] # noqa F821
                )
                initial_configs.append(trajectory[-1]["incumbent"])

        if main_args_.mode == "SMAC4AC":
            optimizer = SMAC4AC(scenario=scen,
                                rng=np.random.RandomState(main_args_.seed),
                                runhistory=rh,
                                initial_configurations=initial_configs,
                                stats=stats,
                                restore_incumbent=incumbent,
                                run_id=main_args_.seed)
        elif main_args_.mode == "SMAC4HPO":
            optimizer = SMAC4HPO(scenario=scen,
                                 rng=np.random.RandomState(main_args_.seed),
                                 runhistory=rh,
                                 initial_configurations=initial_configs,
                                 stats=stats,
                                 restore_incumbent=incumbent,
                                 run_id=main_args_.seed)
        elif main_args_.mode == "SMAC4BB":
            optimizer = SMAC4BB(scenario=scen,
                                rng=np.random.RandomState(main_args_.seed),
                                runhistory=rh,
                                initial_configurations=initial_configs,
                                stats=stats,
                                restore_incumbent=incumbent,
                                run_id=main_args_.seed)
        elif main_args_.mode == "ROAR":
            optimizer = ROAR(scenario=scen,
                             rng=np.random.RandomState(main_args_.seed),
                             runhistory=rh,
                             initial_configurations=initial_configs,
                             run_id=main_args_.seed)
        elif main_args_.mode == "Hydra":
            optimizer = Hydra(
                scenario=scen,
                rng=np.random.RandomState(main_args_.seed),
                runhistory=rh,
                initial_configurations=initial_configs,
                stats=stats,
                restore_incumbent=incumbent,
                run_id=main_args_.seed,
                random_configuration_chooser=main_args_.random_configuration_chooser,
                n_iterations=main_args_.hydra_iterations,
                val_set=main_args_.hydra_validation,
                incs_per_round=main_args_.hydra_incumbents_per_round,
                n_optimizers=main_args_.hydra_n_optimizers)
        elif main_args_.mode == "PSMAC":
            optimizer = PSMAC(
                scenario=scen,
                rng=np.random.RandomState(main_args_.seed),
                run_id=main_args_.seed,
                shared_model=smac_args_.shared_model,
                validate=main_args_.psmac_validate,
                n_optimizers=main_args_.hydra_n_optimizers,
                n_incs=main_args_.hydra_incumbents_per_round,
            )
        try:
            optimizer.optimize()
        except (TAEAbortException, FirstRunCrashedException) as err:
            self.logger.error(err)
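main_cli is normally reached through SMAC's command-line entry point. A minimal sketch of driving it programmatically follows; the class name `SMACCLI`, its module path, and the exact flag spellings are assumptions and may differ between SMAC versions.

from smac.smac_cli import SMACCLI  # assumed module path

SMACCLI().main_cli([
    "--scenario", "scenario.txt",  # scenario file defining ta, cs, instances, budgets
    "--mode", "Hydra",             # SMAC4AC / SMAC4HPO / SMAC4BB / ROAR / Hydra / PSMAC
    "--seed", "42",
    "--verbose_level", "INFO",
])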