def _build_matrix(self, run_dict: typing.Mapping[RunKey, RunValue],
                  runhistory: RunHistory,
                  instances: typing.List[str] = None,
                  return_time_as_y: bool = False,
                  store_statistics: bool = False):
    """Builds X, y matrices from selected runs, where y carries two
    columns: the cost and the runtime (shifted by +1 so that a
    subsequent log-transform stays well-defined)."""
    if return_time_as_y:
        raise NotImplementedError()
    if store_statistics:
        raise NotImplementedError()

    # First build nan-matrix of size #configs x (#params + #feats)
    n_rows = len(run_dict)
    n_cols = self.num_params
    X = np.ones([n_rows, n_cols + self.n_feats]) * np.nan
    y = np.ones([n_rows, 2])

    # Then populate matrix
    for row, (key, run) in enumerate(run_dict.items()):
        # Scaling is automatically done in configSpace
        conf = runhistory.ids_config[key.config_id]
        conf_vector = convert_configurations_to_array([conf])[0]
        if self.n_feats:
            feats = self.instance_features[key.instance_id]
            X[row, :] = np.hstack((conf_vector, feats))
        else:
            X[row, :] = conf_vector

        y[row, 0] = run.cost
        # +1 so that a log-transform of the runtime column stays finite
        y[row, 1] = 1 + run.time

    y = self.transform_response_values(values=y)

    return X, y
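
# A minimal standalone sketch (illustrative, not part of the library) of the
# two-column target layout built above: column 0 holds costs, column 1 holds
# runtimes shifted by +1 so that a log-transform applied later in
# transform_response_values stays finite for zero runtimes. All names in this
# demo are hypothetical.
def _demo_two_column_targets():
    import numpy as np
    runs = [(0.42, 3.0), (0.13, 0.0)]  # hypothetical (cost, time) pairs
    y = np.ones([len(runs), 2])
    for row, (cost, time) in enumerate(runs):
        y[row, 0] = cost       # objective value
        y[row, 1] = 1 + time   # +1 keeps log(y[:, 1]) finite at time == 0
    return np.log(y[:, 1])     # array([1.38629436, 0.])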
def _build_matrix(self, run_dict: typing.Mapping[RunKey, RunValue],
                  runhistory: RunHistory,
                  instances: list = None,
                  return_time_as_y: bool = False,
                  store_statistics: bool = False):
    """Builds X, y matrices from selected runs in the runhistory

    Parameters
    ----------
    run_dict: dict(RunKey -> RunValue)
        Dictionary from RunHistory.RunKey to RunHistory.RunValue
    runhistory: RunHistory
        Runhistory object
    instances: list
        List of instances
    return_time_as_y: bool
        Return the time instead of cost as y value. Necessary to access
        the raw y values for imputation.
    store_statistics: bool
        Whether to store statistics about the data (to be used at
        subsequent calls)

    Returns
    -------
    X: np.ndarray
    y: np.ndarray
    """
    # First build nan-matrix of size #configs x (#params + #feats)
    n_rows = len(run_dict)
    n_cols = self.num_params
    X = np.ones([n_rows, n_cols + self.n_feats]) * np.nan
    y = np.ones([n_rows, 1])

    # Then populate matrix
    for row, (key, run) in enumerate(run_dict.items()):
        # Scaling is automatically done in configSpace
        conf = runhistory.ids_config[key.config_id]
        conf_vector = convert_configurations_to_array([conf])[0]
        if self.n_feats:
            feats = self.instance_features[key.instance_id]
            X[row, :] = np.hstack((conf_vector, feats))
        else:
            X[row, :] = conf_vector

        if return_time_as_y:
            y[row, 0] = run.time
        else:
            y[row, 0] = run.cost

    if y.size > 0:
        if store_statistics:
            self.perc = np.percentile(y, self.scale_perc)
            self.min_y = np.min(y)
            self.max_y = np.max(y)
    y = self.transform_response_values(values=y)

    return X, y
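
# A minimal standalone sketch (illustrative, not part of the library) of how
# each design-matrix row above is assembled: the vector-encoded configuration
# is concatenated with the features of the instance it ran on. The arrays
# below are hypothetical stand-ins.
def _demo_design_matrix_row():
    import numpy as np
    conf_vector = np.array([0.5, 0.1])         # encoded configuration
    feats = np.array([3.0, 7.0, 1.0])          # instance features
    X = np.ones([1, len(conf_vector) + len(feats)]) * np.nan
    X[0, :] = np.hstack((conf_vector, feats))  # one row per (config, instance)
    return X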
def _get_initial_points(self, num_points, runhistory, additional_start_points):
    if runhistory.empty():
        init_points = self.config_space.sample_configuration(size=num_points)
    else:
        # Initiate local search
        configs_previous_runs = runhistory.get_all_configs()

        # Configurations with the highest previous EI
        configs_previous_runs_sorted = self._sort_configs_by_acq_value(
            configs_previous_runs)
        configs_previous_runs_sorted = [
            conf[1] for conf in configs_previous_runs_sorted[:num_points]
        ]

        # Configurations with the lowest predictive cost; check for None
        # to make unit tests work
        if self.acquisition_function.model is not None:
            conf_array = convert_configurations_to_array(configs_previous_runs)
            costs = self.acquisition_function.model.predict_marginalized_over_instances(
                conf_array)[0]
            # Break ties between equal costs randomly, see
            # http://stackoverflow.com/questions/20197990/how-to-make-argsort-result-to-be-random-between-equal-values
            random = self.rng.rand(len(costs))
            # Last column is primary sort key!
            indices = np.lexsort((random.flatten(), costs.flatten()))
            # Cannot use zip here because the indices array cannot index the
            # configs_previous_runs list, since the latter is a pure Python list
            configs_previous_runs_sorted_by_cost = [
                configs_previous_runs[ind] for ind in indices
            ][:num_points]
        else:
            configs_previous_runs_sorted_by_cost = []

        if additional_start_points is not None:
            additional_start_points = [
                asp[1] for asp in additional_start_points[:num_points]
            ]
        else:
            additional_start_points = []

        # Merge the candidate lists, deduplicating while preserving order
        init_points = []
        init_points_as_set = set()
        for cand in itertools.chain(
                configs_previous_runs_sorted,
                configs_previous_runs_sorted_by_cost,
                additional_start_points,
        ):
            if cand not in init_points_as_set:
                init_points.append(cand)
                init_points_as_set.add(cand)

    return init_points
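
# A minimal standalone sketch (illustrative, not part of the library) of the
# tie-breaking trick used above: np.lexsort treats its LAST key as the primary
# sort key, so sorting by (random, costs) orders candidates by predicted cost
# while shuffling the relative order of equal-cost entries.
def _demo_random_tiebreak_argsort():
    import numpy as np
    rng = np.random.RandomState(1)
    costs = np.array([0.3, 0.1, 0.3, 0.2])
    random = rng.rand(len(costs))
    indices = np.lexsort((random, costs))  # costs is the primary key
    # Indices come back in cost order; the two 0.3 entries appear in a
    # random relative order determined by the secondary key.
    return indices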
def validate_epm(self,
                 config_mode: Union[str, typing.List[Configuration]] = 'def',
                 instance_mode: Union[str, typing.List[str]] = 'test',
                 repetitions: int = 1,
                 runhistory: RunHistory = None,
                 output_fn: str = "",
                 reuse_epm: bool = True,
                 ) -> RunHistory:
    """Use an EPM to predict costs/runtimes for unknown config/instance pairs.

    Side effect: if `output_fn` is specified, saves the runhistory to the
    specified output directory.

    Parameters
    ----------
    output_fn: str
        Path to runhistory to be saved. If the suffix is not '.json', it
        will be interpreted as a directory and the filename will be
        'validated_runhistory_EPM.json'
    config_mode: str or list<Configuration>
        Either directly a list of Configuration, or a string from
        [def, inc, def+inc, wallclock_time, cpu_time, all].
        The time modes evaluate at cpu- or wallclock-timesteps of
        [max_time/2^0, max_time/2^1, max_time/2^2, ..., default],
        with max_time being the highest recorded time
    instance_mode: str or list<str>
        What instances to use for validation, either from
        [train, test, train+test] or directly a list of instances
    repetitions: int
        Number of repetitions for nondeterministic algorithms
    runhistory: RunHistory
        Optional RunHistory-object to reuse runs
    reuse_epm: bool
        If True (and if `self.epm` is set), reuse the EPM to validate runs

    Returns
    -------
    runhistory: RunHistory
        Runhistory with predicted runs
    """
    if not isinstance(runhistory, RunHistory) and (self.epm is None or reuse_epm is False):
        raise ValueError("No runhistory specified for validating with EPM!")
    elif reuse_epm is False or self.epm is None:
        # Create RandomForest
        types, bounds = get_types(self.scen.cs, self.scen.feature_array)
        self.epm = RandomForestWithInstances(
            configspace=self.scen.cs,
            types=types,
            bounds=bounds,
            instance_features=self.scen.feature_array,
            seed=self.rng.randint(MAXINT),
            ratio_features=1.0,
        )
        # Use imputor if objective is runtime
        imputor = None
        impute_state = None
        impute_censored_data = False
        if self.scen.run_obj == 'runtime':
            threshold = self.scen.cutoff * self.scen.par_factor
            imputor = RFRImputator(rng=self.rng,
                                   cutoff=self.scen.cutoff,
                                   threshold=threshold,
                                   model=self.epm)
            impute_censored_data = True
            impute_state = [StatusType.CAPPED]
        # Transform training data (from given rh)
        rh2epm = RunHistory2EPM4Cost(
            num_params=len(self.scen.cs.get_hyperparameters()),
            scenario=self.scen,
            rng=self.rng,
            impute_censored_data=impute_censored_data,
            imputor=imputor,
            impute_state=impute_state)
        X, y = rh2epm.transform(runhistory)
        self.logger.debug("Training model with data of shape X: %s, y: %s",
                          str(X.shape), str(y.shape))
        # Train random forest
        self.epm.train(X, y)

    # Predict desired runs
    runs, rh_epm = self._get_runs(config_mode, instance_mode, repetitions,
                                  runhistory)

    feature_array_size = len(self.scen.cs.get_hyperparameters())
    if self.scen.feature_array is not None:
        feature_array_size += self.scen.feature_array.shape[1]

    X_pred = np.empty((len(runs), feature_array_size))
    for idx, run in enumerate(runs):
        if self.scen.feature_array is not None and run.inst is not None:
            X_pred[idx] = np.hstack([
                convert_configurations_to_array([run.config])[0],
                self.scen.feature_dict[run.inst]
            ])
        else:
            X_pred[idx] = convert_configurations_to_array([run.config])[0]
    self.logger.debug("Predicting desired %d runs, data has shape %s",
                      len(runs), str(X_pred.shape))

    y_pred = self.epm.predict(X_pred)

    # Add runs to runhistory
    for run, pred in zip(runs, y_pred[0]):
        rh_epm.add(config=run.config,
                   cost=float(pred),
                   time=float(pred),
                   status=StatusType.SUCCESS,
                   instance_id=run.inst,
                   seed=-1,
                   additional_info={"additional_info": "ESTIMATED USING EPM!"})

    if output_fn:
        self._save_results(rh_epm, output_fn,
                           backup_fn="validated_runhistory_EPM.json")
    return rh_epm
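
# A usage sketch (illustrative; `validator` and `runhistory` are hypothetical
# objects): given a Validator wired to a scenario and a runhistory of real
# runs, validate_epm trains a random-forest EPM on those runs and returns a
# new runhistory filled with predicted costs for the requested
# config/instance pairs.
def _demo_validate_epm(validator, runhistory):
    return validator.validate_epm(config_mode='def+inc',
                                  instance_mode='test',
                                  repetitions=1,
                                  runhistory=runhistory,
                                  output_fn='validated_runhistory_EPM.json')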