Example #1
    def _build_matrix(self,
                      run_dict: typing.Mapping[RunKey, RunValue],
                      runhistory: RunHistory,
                      instances: typing.Optional[typing.List[str]] = None,
                      return_time_as_y: bool = False,
                      store_statistics: bool = False):
        """TODO"""
        if return_time_as_y:
            raise NotImplementedError()
        if store_statistics:
            raise NotImplementedError()

        # First build a NaN matrix of size #configs x (#params + #feats)
        n_rows = len(run_dict)
        n_cols = self.num_params
        X = np.ones([n_rows, n_cols + self.n_feats]) * np.nan
        y = np.ones([n_rows, 2])

        # Then populate matrix
        for row, (key, run) in enumerate(run_dict.items()):
            # Scaling is automatically done in configSpace
            conf = runhistory.ids_config[key.config_id]
            conf_vector = convert_configurations_to_array([conf])[0]
            if self.n_feats:
                feats = self.instance_features[key.instance_id]
                X[row, :] = np.hstack((conf_vector, feats))
            else:
                X[row, :] = conf_vector
            y[row, 0] = run.cost
            y[row, 1] = 1 + run.time

        y = self.transform_response_values(values=y)

        return X, y
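
The preallocation pattern above is worth seeing in isolation: the NaN-initialized design matrix makes any row that was never populated easy to detect afterwards. A minimal, self-contained sketch; all values below are hypothetical stand-ins for what the real method reads from the class and the runhistory:

import numpy as np

n_rows, num_params, n_feats = 3, 2, 2
X = np.ones([n_rows, num_params + n_feats]) * np.nan  # NaN marks unfilled rows
y = np.ones([n_rows, 2])

for row in range(n_rows):
    conf_vector = np.random.rand(num_params)  # stands in for the encoded config
    feats = np.random.rand(n_feats)           # stands in for instance features
    X[row, :] = np.hstack((conf_vector, feats))
    y[row, 0] = 0.5   # cost placeholder
    y[row, 1] = 1.7   # 1 + runtime placeholder

assert not np.isnan(X).any()  # every row was populated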
Example #2
    def _build_matrix(self,
                      run_dict: typing.Mapping[RunKey, RunValue],
                      runhistory: RunHistory,
                      instances: typing.Optional[typing.List[str]] = None,
                      return_time_as_y: bool = False,
                      store_statistics: bool = False):
        """"Builds X,y matrixes from selected runs from runhistory

        Parameters
        ----------
        run_dict: typing.Mapping[RunKey, RunValue]
            mapping from RunHistory.RunKey to RunHistory.RunValue
        runhistory: RunHistory
            runhistory object
        instances: list
            list of instances (unused in this implementation)
        return_time_as_y: bool
            Return the time instead of cost as y value. Necessary to access the raw y values for imputation.
        store_statistics: bool
            Whether to store statistics about the data (to be used at subsequent calls)

        Returns
        -------
        X: np.ndarray
        y: np.ndarray
        """

        # First build a NaN matrix of size #configs x (#params + #feats)
        n_rows = len(run_dict)
        n_cols = self.num_params
        X = np.ones([n_rows, n_cols + self.n_feats]) * np.nan
        y = np.ones([n_rows, 1])

        # Then populate matrix
        for row, (key, run) in enumerate(run_dict.items()):
            # Scaling is automatically done in configSpace
            conf = runhistory.ids_config[key.config_id]
            conf_vector = convert_configurations_to_array([conf])[0]
            if self.n_feats:
                feats = self.instance_features[key.instance_id]
                X[row, :] = np.hstack((conf_vector, feats))
            else:
                X[row, :] = conf_vector
            if return_time_as_y:
                y[row, 0] = run.time
            else:
                y[row, 0] = run.cost

        if y.size > 0:
            if store_statistics:
                self.perc = np.percentile(y, self.scale_perc)
                self.min_y = np.min(y)
                self.max_y = np.max(y)
            y = self.transform_response_values(values=y)

        return X, y
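
Callers normally reach _build_matrix through the public transform method. A minimal sketch, mirroring the construction shown in Example #4 below; scen, rng, and runhistory are assumed to already exist in the caller's scope, and the import path may differ between SMAC versions:

from smac.runhistory.runhistory2epm import RunHistory2EPM4Cost

rh2epm = RunHistory2EPM4Cost(
    num_params=len(scen.cs.get_hyperparameters()),
    scenario=scen,
    rng=rng)
X, y = rh2epm.transform(runhistory)  # calls _build_matrix internally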
Example #3
    def _get_initial_points(self, num_points, runhistory,
                            additional_start_points):
        """Collect starting points for the local search: random configurations
        if the runhistory is empty; otherwise previously evaluated
        configurations ranked by acquisition value and by predicted cost,
        plus any additional start points, de-duplicated in order."""

        if runhistory.empty():
            init_points = self.config_space.sample_configuration(
                size=num_points)
        else:
            # Initialize local search from previously evaluated configurations
            configs_previous_runs = runhistory.get_all_configs()

            # configurations with the highest previous EI
            configs_previous_runs_sorted = self._sort_configs_by_acq_value(
                configs_previous_runs)
            configs_previous_runs_sorted = [
                conf[1] for conf in configs_previous_runs_sorted[:num_points]
            ]

            # Configurations with the lowest predicted cost; check the model
            # for None to keep unit tests working
            if self.acquisition_function.model is not None:
                conf_array = convert_configurations_to_array(
                    configs_previous_runs)
                costs = self.acquisition_function.model.predict_marginalized_over_instances(
                    conf_array)[0]
                # Random tie-breaking between equal costs, from
                # http://stackoverflow.com/questions/20197990/how-to-make-argsort-result-to-be-random-between-equal-values
                random = self.rng.rand(len(costs))
                # The last key passed to lexsort is the primary sort key!
                indices = np.lexsort((random.flatten(), costs.flatten()))

                # Cannot use fancy indexing here: configs_previous_runs is a
                # plain Python list, which a NumPy index array cannot index
                # directly, hence the list comprehension.
                configs_previous_runs_sorted_by_cost = [
                    configs_previous_runs[ind] for ind in indices
                ][:num_points]
            else:
                configs_previous_runs_sorted_by_cost = []

            if additional_start_points is not None:
                additional_start_points = [
                    asp[1] for asp in additional_start_points[:num_points]
                ]
            else:
                additional_start_points = []

            init_points = []
            init_points_as_set = set()
            for cand in itertools.chain(
                    configs_previous_runs_sorted,
                    configs_previous_runs_sorted_by_cost,
                    additional_start_points,
            ):
                if cand not in init_points_as_set:
                    init_points.append(cand)
                    init_points_as_set.add(cand)

        return init_points
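
The lexsort trick referenced above can be demonstrated on its own: the last key passed to np.lexsort is the primary sort key, and the random vector only breaks ties between equal costs. A self-contained toy example:

import numpy as np

rng = np.random.RandomState(1)
costs = np.array([0.3, 0.1, 0.3, 0.2])
random = rng.rand(len(costs))          # secondary key: random tie-breaker
indices = np.lexsort((random, costs))  # costs is the primary (last) key
print(indices)  # 1 and 3 come first (lowest costs); the two 0.3 entries tie randomly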
Example #4
    def validate_epm(
        self,
        config_mode: Union[str, typing.List[Configuration]] = 'def',
        instance_mode: Union[str, typing.List[str]] = 'test',
        repetitions: int = 1,
        runhistory: typing.Optional[RunHistory] = None,
        output_fn: str = "",
        reuse_epm: bool = True,
    ) -> RunHistory:
        """
        Use EPM to predict costs/runtimes for unknown config/inst-pairs.

        Side effect: if `output_fn` is specified, saves the validated
        runhistory to that output location.

        Parameters
        ----------
        output_fn: str
            path to runhistory to be saved. if the suffix is not '.json', will
            be interpreted as directory and filename will be
            'validated_runhistory_EPM.json'
        config_mode: str or list<Configuration>
            either a string from [def, inc, def+inc, wallclock_time, cpu_time, all]
            or directly a list of Configuration.
            The time modes evaluate at cpu- or wallclock-timesteps of
            [max_time/2^0, max_time/2^1, max_time/2^2, ..., default],
            with max_time being the highest recorded time
        instance_mode: str or list<str>
            what instances to use for validation, either from
            [train, test, train+test] or directly a list of instances
        repetitions: int
            number of repetitions in nondeterministic algorithms
        runhistory: RunHistory
            optional, RunHistory-object to reuse runs
        reuse_epm: bool
            if True (and if `self.epm` is set), reuse the existing EPM to validate runs

        Returns
        -------
        runhistory: RunHistory
            runhistory with predicted runs
        """
        if not isinstance(runhistory, RunHistory) and (self.epm is None
                                                       or reuse_epm is False):
            raise ValueError(
                "No runhistory specified for validating with EPM!")
        elif reuse_epm is False or self.epm is None:
            # Create RandomForest
            types, bounds = get_types(self.scen.cs, self.scen.feature_array)
            self.epm = RandomForestWithInstances(
                configspace=self.scen.cs,
                types=types,
                bounds=bounds,
                instance_features=self.scen.feature_array,
                seed=self.rng.randint(MAXINT),
                ratio_features=1.0,
            )
            # Use imputor if objective is runtime
            imputor = None
            impute_state = None
            impute_censored_data = False
            if self.scen.run_obj == 'runtime':
                threshold = self.scen.cutoff * self.scen.par_factor
                imputor = RFRImputator(rng=self.rng,
                                       cutoff=self.scen.cutoff,
                                       threshold=threshold,
                                       model=self.epm)
                impute_censored_data = True
                impute_state = [StatusType.CAPPED]
            # Transform training data (from given rh)
            rh2epm = RunHistory2EPM4Cost(
                num_params=len(self.scen.cs.get_hyperparameters()),
                scenario=self.scen,
                rng=self.rng,
                impute_censored_data=impute_censored_data,
                imputor=imputor,
                impute_state=impute_state)
            X, y = rh2epm.transform(runhistory)
            self.logger.debug("Training model with data of shape X: %s, y:%s",
                              str(X.shape), str(y.shape))
            # Train random forest
            self.epm.train(X, y)

        # Predict desired runs
        runs, rh_epm = self._get_runs(config_mode, instance_mode, repetitions,
                                      runhistory)

        feature_array_size = len(self.scen.cs.get_hyperparameters())
        if self.scen.feature_array is not None:
            feature_array_size += self.scen.feature_array.shape[1]

        X_pred = np.empty((len(runs), feature_array_size))
        for idx, run in enumerate(runs):
            if self.scen.feature_array is not None and run.inst is not None:
                X_pred[idx] = np.hstack([
                    convert_configurations_to_array([run.config])[0],
                    self.scen.feature_dict[run.inst]
                ])
            else:
                X_pred[idx] = convert_configurations_to_array([run.config])[0]
        self.logger.debug("Predicting desired %d runs, data has shape %s",
                          len(runs), str(X_pred.shape))

        y_pred = self.epm.predict(X_pred)  # returns (means, variances); only the means are used below

        # Add runs to runhistory
        for run, pred in zip(runs, y_pred[0]):
            rh_epm.add(
                config=run.config,
                cost=float(pred),
                time=float(pred),
                status=StatusType.SUCCESS,
                instance_id=run.inst,
                seed=-1,
                additional_info={"additional_info": "ESTIMATED USING EPM!"})

        if output_fn:
            self._save_results(rh_epm,
                               output_fn,
                               backup_fn="validated_runhistory_EPM.json")
        return rh_epm
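
A minimal usage sketch for validate_epm; the setup is hypothetical (scenario, trajectory, and runhistory are assumed to already exist, and the Validator constructor signature may differ between SMAC versions):

import numpy as np
from smac.utils.validate import Validator

validator = Validator(scenario, trajectory, rng=np.random.RandomState(42))
rh_epm = validator.validate_epm(
    config_mode='def',        # validate the default configuration
    instance_mode='test',     # on the scenario's test instances
    runhistory=runhistory,    # training data for the EPM
    output_fn='validated_runhistory_EPM.json')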