def set_probabilities_in_cs(self, cs: ConfigurationSpace, relied2models: Dict[str, List[str]],
                            relied2AllModels: Dict[str, List[str]], all_models: List[str], **kwargs):
    """Set sampling probabilities on the estimator choice and on each relied-on choice.

    Parameters
    ----------
    cs : ConfigurationSpace
        Space that is modified in place. It must contain
        ``"estimator:__choice__"`` and, for every ``(rely_model, path)`` in
        ``RelyModels.info``, the hyperparameter named
        ``":".join(path[:-1] + ["__choice__"])``.
    relied2models : Dict[str, List[str]]
        For each rely-model key, the estimator names that share the
        probability mass ``kwargs[rely_model]`` equally.
    relied2AllModels : Dict[str, List[str]]
        For each rely-model key, the estimator names that may be combined
        with the relied-on choice ``path[-1]``.
    all_models : List[str]
        All estimator names.
    **kwargs
        Read as ``kwargs[rely_model]``: the probability assigned to that
        rely-model key.

    Notes
    -----
    Side effects on ``cs`` only; nothing is returned.
    """
    estimator = cs.get_hyperparameter("estimator:__choice__")

    # Spread each group's probability evenly over the models of that group.
    model2prob = {}
    n_relied = 0
    for rely_model, cur_models in relied2models.items():
        n_relied += len(cur_models)
        for model in cur_models:
            model2prob[model] = kwargs[rely_model] / len(cur_models)

    # Whatever mass is left is shared by the models not listed in any group.
    # When every model is already covered there is nothing to share (the
    # original divided by zero here).
    n_rest = len(all_models) - n_relied
    p_rest = (1 - sum(model2prob.values())) / n_rest if n_rest else 0.0
    estimator.probabilities = [model2prob.get(model, p_rest) for model in estimator.choices]

    default_estimator_choice = None
    for models in relied2models.values():
        if models:
            # No break: the first model of the last non-empty group is used.
            default_estimator_choice = models[0]
    estimator.default_value = default_estimator_choice

    forbid_in_key = "estimator:__choice__"
    for rely_model, path in RelyModels.info:
        forbid_eq_value = path[-1]
        path = path[:-1]
        forbid_eq_key = ":".join(path + ["__choice__"])
        forbid_eq_key_hp = cs.get_hyperparameter(forbid_eq_key)
        hit = relied2AllModels.get(rely_model)
        if not hit:
            # No estimator can use this relied-on choice: drop it from the
            # choices and make the remaining ones uniform.
            choices = list(forbid_eq_key_hp.choices)
            choices.remove(forbid_eq_value)
            forbid_eq_key_hp.choices = tuple(choices)
            forbid_eq_key_hp.default_value = choices[0]
            forbid_eq_key_hp.probabilities = [1 / len(choices)] * len(choices)
            # FIXME (original author): in the end I gave up modifying this
            # here; the preprocessing is done in the HDL part instead.
            continue
        forbid_in_value = list(set(all_models) - set(hit))
        # Only boost models were selected (original author's note): every
        # estimator is compatible, so nothing has to be forbidden.
        if not forbid_in_value:
            continue
        choices = forbid_eq_key_hp.choices
        p: float = kwargs[rely_model]
        # The other choices share the remaining mass equally, so that the
        # probabilities sum to 1. The original multiplied by
        # (len(choices) - 1), which is only correct for exactly two choices.
        n_others = len(choices) - 1
        p_rest = (1 - p) / n_others if n_others else 0.0
        forbid_eq_key_hp.probabilities = [
            p if choice == forbid_eq_value else p_rest for choice in choices
        ]
        cs.add_forbidden_clause(
            ForbiddenAndConjunction(
                ForbiddenEqualsClause(forbid_eq_key_hp, forbid_eq_value),
                ForbiddenInClause(cs.get_hyperparameter(forbid_in_key), forbid_in_value),
            ))
def _construct_in_condition(
        condition: Dict,
        cs: ConfigurationSpace,
) -> InCondition:
    """Build an ``InCondition`` from its dictionary description.

    ``condition['child']`` and ``condition['parent']`` are hyperparameter
    names resolved through ``cs``; ``condition['values']`` is passed on as
    the condition's values.
    """
    child_hp = cs.get_hyperparameter(condition['child'])
    parent_hp = cs.get_hyperparameter(condition['parent'])
    allowed_values = condition['values']
    return InCondition(child=child_hp, parent=parent_hp, values=allowed_values)
def _construct_lt_condition(
        condition: Dict,
        cs: ConfigurationSpace,
) -> LessThanCondition:
    """Build a ``LessThanCondition`` from its dictionary description.

    ``condition['child']`` and ``condition['parent']`` are hyperparameter
    names resolved through ``cs``; ``condition['value']`` is passed on as
    the condition's value.
    """
    child_hp = cs.get_hyperparameter(condition['child'])
    parent_hp = cs.get_hyperparameter(condition['parent'])
    threshold = condition['value']
    return LessThanCondition(child=child_hp, parent=parent_hp, value=threshold)
def _construct_neq_condition(
        condition: Dict,
        cs: ConfigurationSpace,
) -> NotEqualsCondition:
    """Build a ``NotEqualsCondition`` from its dictionary description.

    ``condition['child']`` and ``condition['parent']`` are hyperparameter
    names resolved through ``cs``; ``condition['value']`` is passed on as
    the condition's value.
    """
    child_hp = cs.get_hyperparameter(condition['child'])
    parent_hp = cs.get_hyperparameter(condition['parent'])
    excluded_value = condition['value']
    return NotEqualsCondition(child=child_hp, parent=parent_hp, value=excluded_value)
def add_params(cs: ConfigurationSpace):
    """Make ``regressor`` conditional on the ``PairwiseRegressor`` selector.

    Both the ``selector`` and ``regressor`` hyperparameters are looked up in
    ``cs``. The condition is only added when ``"PairwiseRegressor"`` is one
    of the selector's choices.
    """
    selector_hp = cs.get_hyperparameter("selector")
    regressor_hp = cs.get_hyperparameter("regressor")

    if "PairwiseRegressor" not in selector_hp.choices:
        return

    cs.add_condition(
        InCondition(child=regressor_hp, parent=selector_hp,
                    values=["PairwiseRegressor"]))
def add_params(cs: ConfigurationSpace):
    """Make ``classifier`` conditional on the ``MultiClassifier`` selector.

    Both the ``selector`` and ``classifier`` hyperparameters are looked up in
    ``cs``. The condition is only added when ``"MultiClassifier"`` is one of
    the selector's choices.
    """
    selector_hp = cs.get_hyperparameter("selector")
    classifier_hp = cs.get_hyperparameter("classifier")

    if "MultiClassifier" not in selector_hp.choices:
        return

    cs.add_condition(
        InCondition(child=classifier_hp, parent=selector_hp,
                    values=["MultiClassifier"]))
def _construct_forbidden_in(
        clause: Dict,
        cs: ConfigurationSpace,
) -> ForbiddenInClause:
    """Build a ``ForbiddenInClause`` from its dictionary description.

    ``clause['name']`` is the hyperparameter name resolved through ``cs``;
    ``clause['values']`` is passed on as the forbidden values.

    The return annotation previously read ``ForbiddenEqualsClause`` although
    a ``ForbiddenInClause`` is what gets constructed and returned.
    """
    hyperparameter = cs.get_hyperparameter(clause['name'])
    return ForbiddenInClause(hyperparameter=hyperparameter,
                             values=clause['values'])
def alternative_configuration_recovery(config_list: typing.List[str], cs: ConfigurationSpace):
    """Rebuild a ``Configuration`` from a list of ``"name='value'"`` strings.

    Used to recover ints and bools as categoricals or constants from a
    trajectory.

    Parameters
    ----------
    config_list : typing.List[str]
        Entries of the form ``name=value``; single quotes around the value
        are stripped.
    cs : ConfigurationSpace
        Space used to look up each hyperparameter and to build the result.

    Returns
    -------
    Configuration
        Configuration with ``origin`` set to ``"External Trajectory"``.
    """
    config_dict = {}
    for param in config_list:
        # Split on the first "=" only, so that a value which itself contains
        # "=" no longer breaks the unpacking.
        k, v = param.split("=", 1)
        v = v.strip("'")
        hp = cs.get_hyperparameter(k)
        if isinstance(hp, FloatHyperparameter):
            v = float(v)
        elif isinstance(hp, IntegerHyperparameter):
            v = int(v)
        elif isinstance(hp, (CategoricalHyperparameter, Constant)):
            # The type of the default value decides how the string is parsed.
            # bool is tested before int because bool is a subclass of int.
            default = hp.default_value
            if isinstance(default, bool):
                v = v == 'True'
            elif isinstance(default, int):
                v = int(v)
            elif isinstance(default, float):
                v = float(v)
        config_dict[k] = v
    config = Configuration(configuration_space=cs, values=config_dict)
    config.origin = "External Trajectory"
    return config
def _convert_dict_to_config(config_list: typing.List[str], cs: ConfigurationSpace):
    # CAN BE DONE IN CONFIGSPACE
    """Rebuild a ``Configuration`` from a list of ``"name='value'"`` strings.

    Since configurations are saved as str->str, the type (int, float, str)
    of each parameter value has to be recovered from the hyperparameter it
    belongs to.

    Parameters
    ----------
    config_list: typing.List[str]
        Configuration as a list of "str='str'"
    cs: ConfigurationSpace
        Configuration Space to translate dict object into Configuration object

    Returns
    -------
    Configuration
        Configuration with ``origin`` set to ``"External Trajectory"``.
    """
    config_dict = {}
    for param in config_list:
        # Split on the first "=" only, so that a value which itself contains
        # "=" no longer breaks the unpacking.
        k, v = param.split("=", 1)
        v = v.strip("'")
        hp = cs.get_hyperparameter(k)
        if isinstance(hp, FloatHyperparameter):
            v = float(v)
        elif isinstance(hp, IntegerHyperparameter):
            v = int(v)
        config_dict[k] = v
    config = Configuration(configuration_space=cs, values=config_dict)
    config.origin = "External Trajectory"
    return config
def _construct_forbidden_in(
        clause: Dict,
        cs: ConfigurationSpace,
) -> ForbiddenInClause:
    """Build a ``ForbiddenInClause`` from its dictionary description.

    ``clause['name']`` is the hyperparameter name resolved through ``cs``;
    ``clause['values']`` is passed on as the forbidden values.

    The return annotation previously read ``ForbiddenEqualsClause`` although
    a ``ForbiddenInClause`` is what gets constructed and returned.
    """
    return ForbiddenInClause(
        hyperparameter=cs.get_hyperparameter(clause['name']),
        values=clause['values'],
    )
def add_params(cs: ConfigurationSpace):
    """Register the ``RandomForest`` classifier and its ``rf:*`` hyperparameters.

    ``"RandomForest"`` is appended to the choices of an existing
    ``classifier`` hyperparameter; when ``cs`` has none (``KeyError``), a new
    categorical ``classifier`` is created and added. Every ``rf:*``
    hyperparameter is then made conditional on ``classifier`` being
    ``"RandomForest"``.
    """
    try:
        classifier = cs.get_hyperparameter("classifier")
        classifier.choices.append("RandomForest")
    except KeyError:
        classifier = CategoricalHyperparameter(
            "classifier", choices=["RandomForest"], default="RandomForest")
        cs.add_hyperparameter(classifier)

    rf_children = []

    def _register(hyperparameter):
        # Add to the space first, then remember it for the condition pass.
        cs.add_hyperparameter(hyperparameter)
        rf_children.append(hyperparameter)

    _register(UniformIntegerHyperparameter(
        name="rf:n_estimators", lower=10, upper=100, default=10, log=True))
    _register(CategoricalHyperparameter(
        name="rf:criterion", choices=["gini", "entropy"], default="gini"))
    _register(CategoricalHyperparameter(
        name="rf:max_features", choices=["sqrt", "log2", None], default="sqrt"))
    _register(UniformIntegerHyperparameter(
        name="rf:max_depth", lower=10, upper=2**31, default=2**31, log=True))
    _register(UniformIntegerHyperparameter(
        name="rf:min_samples_split", lower=2, upper=100, default=2, log=True))
    _register(UniformIntegerHyperparameter(
        name="rf:min_samples_leaf", lower=2, upper=100, default=10, log=True))
    _register(CategoricalHyperparameter(
        name="rf:bootstrap", choices=[True, False], default=True))

    # Conditions are added only after all hyperparameters exist, in the same
    # order in which the hyperparameters were registered.
    for child in rf_children:
        cs.add_condition(
            InCondition(child=child, parent=classifier, values=["RandomForest"]))
def add_params(cs: ConfigurationSpace):
    """Register the ``PairwiseClassifier`` selector and tie ``classifier`` to it.

    ``"PairwiseClassifier"`` is appended to the choices of an existing
    ``selector`` hyperparameter; when ``cs`` has none (``KeyError``), a new
    categorical ``selector`` is created and added. The ``classifier``
    hyperparameter is then made conditional on that selector value.
    """
    try:
        selector_hp = cs.get_hyperparameter("selector")
        selector_hp.choices.append("PairwiseClassifier")
    except KeyError:
        selector_hp = CategoricalHyperparameter(
            "selector",
            choices=["PairwiseClassifier"],
            default="PairwiseClassifier",
        )
        cs.add_hyperparameter(selector_hp)

    classifier_hp = cs.get_hyperparameter("classifier")
    cs.add_condition(
        InCondition(child=classifier_hp, parent=selector_hp,
                    values=["PairwiseClassifier"]))
def _convert_dict_to_config(config_list: typing.List[str], cs: ConfigurationSpace) -> Configuration:
    """Rebuild a ``Configuration`` from a list of ``"name='value'"`` strings.

    Since configurations are saved as str->str, the type (int, float, str,
    bool) of each parameter value has to be recovered from the hyperparameter
    it belongs to.

    Parameters
    ----------
    config_list: typing.List[str]
        Configuration as a list of "str='str'"
    cs: ConfigurationSpace
        Configuration Space to translate dict object into Configuration object

    Returns
    -------
    Configuration
        Configuration with ``origin`` set to ``"External Trajectory"``.
    """
    config_dict = {}
    v = ''  # type: typing.Union[str, float, int, bool]
    for param in config_list:
        # Split on the first "=" only, so that a value which itself contains
        # "=" no longer breaks the unpacking.
        k, v = param.split("=", 1)
        v = v.strip("'")
        hp = cs.get_hyperparameter(k)
        if isinstance(hp, FloatHyperparameter):
            v = float(v)
        elif isinstance(hp, IntegerHyperparameter):
            v = int(v)
        elif isinstance(hp, (CategoricalHyperparameter, Constant)):
            # Checking for the correct type requires jumping some hoops.
            # First, gather the possible interpretations of the string.
            interpretations = [v]  # type: typing.List[typing.Union[str, bool, int, float]]
            if v in ["True", "False"]:
                # Special case for booleans (assuming we support them).
                # This avoids false-positive warnings triggered by
                # 1 == True or "False" == True.
                interpretations.append(True if v == 'True' else False)
            else:
                for t in [int, float]:
                    try:
                        interpretations.append(t(v))
                    except ValueError:
                        continue

            # Second, keep the interpretations the hyperparameter accepts.
            legal = {interpretation for interpretation in interpretations if hp.is_legal(interpretation)}

            # Third, warn when the interpretation is ambiguous or missing;
            # in that case the raw string is passed on unchanged.
            if len(legal) != 1:
                logging.getLogger("smac.trajlogger").warning(
                    "Ambigous or no interpretation of value {} for hp {} found ({} possible interpretations). "
                    "Passing string, but this will likely result in an error".format(v, hp.name, len(legal)))
            else:
                v = legal.pop()

        config_dict[k] = v

    config = Configuration(configuration_space=cs, values=config_dict)
    config.origin = "External Trajectory"
    return config
def add_params(cs: ConfigurationSpace):
    """Register ``PairwiseClassifier`` as a ``selector`` choice.

    ``"PairwiseClassifier"`` is appended to the choices of an existing
    ``selector`` hyperparameter; when ``cs`` has none (``KeyError``), a new
    categorical ``selector`` with that single choice is created and added.
    """
    try:
        selector_hp = cs.get_hyperparameter("selector")
        selector_hp.choices.append("PairwiseClassifier")
    except KeyError:
        selector_hp = CategoricalHyperparameter(
            "selector",
            choices=["PairwiseClassifier"],
            default="PairwiseClassifier",
        )
        cs.add_hyperparameter(selector_hp)
def add_params(cs: ConfigurationSpace):
    """Register ``RandomForestRegressor`` and its ``rfreg:*`` hyperparameters.

    ``"RandomForestRegressor"`` is appended to the choices of an existing
    ``regressor`` hyperparameter (and its ``_num_choices`` counter is
    incremented); when ``cs`` has none (``KeyError``), a new categorical
    ``regressor`` is created and added. Every ``rfreg:*`` hyperparameter is
    then made conditional on ``regressor`` being ``"RandomForestRegressor"``.
    """
    try:
        regressor = cs.get_hyperparameter("regressor")
        regressor.choices.append("RandomForestRegressor")
        regressor._num_choices += 1
    except KeyError:
        regressor = CategoricalHyperparameter(
            "regressor", choices=["RandomForestRegressor"], default="RandomForestRegressor")
        cs.add_hyperparameter(regressor)

    rf_children = []

    def _register(hyperparameter):
        # Add to the space first, then remember it for the condition pass.
        cs.add_hyperparameter(hyperparameter)
        rf_children.append(hyperparameter)

    _register(UniformIntegerHyperparameter(
        name="rfreg:n_estimators", lower=10, upper=100, default=10, log=True))
    _register(CategoricalHyperparameter(
        name="rfreg:max_features", choices=["sqrt", "log2", None], default="sqrt"))
    _register(UniformIntegerHyperparameter(
        name="rfreg:max_depth", lower=10, upper=2 ** 31, default=2 ** 31, log=True))
    _register(UniformIntegerHyperparameter(
        name="rfreg:min_samples_split", lower=2, upper=100, default=2, log=True))
    _register(UniformIntegerHyperparameter(
        name="rfreg:min_samples_leaf", lower=2, upper=100, default=10, log=True))
    _register(CategoricalHyperparameter(
        name="rfreg:bootstrap", choices=[True, False], default=True))

    # Conditions are added only after all hyperparameters exist, in the same
    # order in which the hyperparameters were registered.
    for child in rf_children:
        cs.add_condition(
            InCondition(child=child, parent=regressor, values=["RandomForestRegressor"]))
def get_default_initial_configs(phps: ConfigurationSpace, n_configs) -> List[Configuration]:
    """Return one default configuration per estimator choice, padded with samples.

    Works on a deep copy of ``phps``, so the caller's space is not modified.

    Parameters
    ----------
    phps : ConfigurationSpace
        Space containing ``"estimator:__choice__"``.
    n_configs : int
        Minimum number of configurations wanted. When there are more
        estimator choices than ``n_configs``, all defaults are still
        returned (the list is not truncated).

    Returns
    -------
    List[Configuration]
        The default configuration for every estimator choice, followed by
        randomly sampled configurations if fewer than ``n_configs`` defaults
        were produced.
    """
    None_name = "None:NoneType"
    phps = deepcopy(phps)

    # Prefer the "None" option as default for every preprocessing choice
    # that offers it.
    for config in phps.get_hyperparameters():
        name: str = config.name
        if name.startswith("preprocessing") and name.endswith("__choice__") \
                and (None_name in config.choices):
            config.default_value = None_name

    model_choice = phps.get_hyperparameter("estimator:__choice__")
    ans = []
    for choice in model_choice.choices:
        cur_phps = deepcopy(phps)
        cur_phps.get_hyperparameter("estimator:__choice__").default_value = choice
        ans.append(cur_phps.get_default_configuration())

    n_missing = n_configs - len(ans)
    if n_missing > 0:
        sampled = phps.sample_configuration(n_missing)
        # sample_configuration returns a bare Configuration (not a list) when
        # asked for exactly one sample; extending with it directly would add
        # its keys instead of the configuration itself.
        if not isinstance(sampled, list):
            sampled = [sampled]
        ans.extend(sampled)
    return ans
def add_forbidden(
    conf_space: ConfigurationSpace,
    pipeline: List[Tuple[str, autoPyTorchChoice]],
    matches: np.ndarray,
    dataset_properties: Dict[str, Any],
    include: Optional[Dict[str, Any]] = None,
    exclude: Optional[Dict[str, Any]] = None
) -> ConfigurationSpace:
    """Add forbidden clauses for component combinations that ``matches`` rules out.

    Consecutive pipeline nodes that expose ``get_available_components`` form
    a "chain of choices". For every sub-chain of length >= 2, each
    combination of component choices whose slice of ``matches`` sums to zero
    is forbidden via a ``ForbiddenAndConjunction`` of
    ``ForbiddenEqualsClause`` objects on the ``<node_name>:__choice__``
    hyperparameters.

    Parameters
    ----------
    conf_space : ConfigurationSpace
        Space the forbidden clauses are added to (modified in place).
    pipeline : list of (name, node)
        Pipeline steps in order.
    matches : np.ndarray
        Indexed with one axis per pipeline position
        (``len(matches.shape)`` slices are built).
        NOTE(review): presumably non-zero entries mark compatible
        combinations - confirm with the producer of this array.
    dataset_properties : dict
        Forwarded to ``get_available_components``.
    include, exclude : dict, optional
        Per-node include/exclude lists, looked up by node name.

    Returns
    -------
    ConfigurationSpace
        The same ``conf_space`` object that was passed in.
    """
    # Not sure if this works for 3D
    node_i_is_choice = []
    node_i_choices_names: List[List[str]] = []
    node_i_choices: List[List[Union[autoPyTorchComponent, autoPyTorchChoice]]] = []
    all_nodes = []
    for node_name, node in pipeline:
        all_nodes.append(node)
        # A node counts as a "choice" when it can enumerate its components.
        is_choice = hasattr(node, "get_available_components")
        node_i_is_choice.append(is_choice)

        node_include = include.get(
            node_name) if include is not None else None
        node_exclude = exclude.get(
            node_name) if exclude is not None else None

        if is_choice:
            node_i_choices_names.append(
                [str(element) for element in
                 node.get_available_components(
                     dataset_properties, include=node_include,
                     exclude=node_exclude).keys()]
            )
            node_i_choices.append(
                list(node.get_available_components(
                    dataset_properties, include=node_include,
                    exclude=node_exclude
                ).values()))

        else:
            # A plain component is its own single "choice".
            node_i_choices_names.append([node_name])
            node_i_choices.append([node])

    # Find out all chains of choices. Only in such a chain its possible to
    # have several forbidden constraints
    choices_chains = []
    idx = 0
    while idx < len(pipeline):
        if node_i_is_choice[idx]:
            chain_start = idx
            idx += 1
            while idx < len(pipeline) and node_i_is_choice[idx]:
                idx += 1
            # chain_stop is exclusive: first index after the chain.
            chain_stop = idx
            choices_chains.append((chain_start, chain_stop))
        idx += 1

    for choices_chain in choices_chains:
        # Constraints already turned into forbidden clauses for this chain.
        constraints: Set[Tuple] = set()

        chain_start = choices_chain[0]
        chain_stop = choices_chain[1]
        chain_length = chain_stop - chain_start

        # Add one to have also have chain_length in the range
        for sub_chain_length in range(2, chain_length + 1):
            for start_idx in range(chain_start, chain_stop - sub_chain_length + 1):
                indices = range(start_idx, start_idx + sub_chain_length)
                node_names = [pipeline[idx][0] for idx in indices]

                num_node_choices = []
                node_choice_names = []
                skip_array_shape = []

                for idx in indices:
                    node = all_nodes[idx]
                    # Restrict to the component names collected above.
                    available_components = node.get_available_components(
                        dataset_properties,
                        include=node_i_choices_names[idx])
                    assert len(available_components) > 0, len(available_components)
                    skip_array_shape.append(len(available_components))
                    num_node_choices.append(range(len(available_components)))
                    node_choice_names.append([name for name in available_components])

                # Figure out which choices were already abandoned
                # (a single choice whose whole slice of ``matches`` is zero).
                skip_array = np.zeros(skip_array_shape)
                for product in itertools.product(*num_node_choices):
                    for node_idx, choice_idx in enumerate(product):
                        # Translate the sub-chain position to a pipeline index.
                        node_idx += start_idx
                        slices_ = tuple(
                            slice(None) if idx != node_idx else
                            slice(choice_idx, choice_idx + 1) for idx in
                            range(len(matches.shape)))

                        if np.sum(matches[slices_]) == 0:
                            skip_array[product] = 1

                for product in itertools.product(*num_node_choices):
                    if skip_array[product]:
                        continue

                    # Fix the sub-chain's axes to this combination and leave
                    # all other axes unconstrained.
                    slices = tuple(
                        slice(None) if idx not in indices else
                        slice(product[idx - start_idx],
                              product[idx - start_idx] + 1) for idx in
                        range(len(matches.shape)))

                    if np.sum(matches[slices]) == 0:
                        constraint = tuple([(node_names[i],
                                             node_choice_names[i][product[i]])
                                            for i in range(len(product))])

                        # Check if a more general constraint/forbidden clause
                        # was already added
                        continue_ = False
                        for constraint_length in range(2, len(constraint)):
                            constr_starts = len(constraint) - constraint_length + 1
                            for constraint_start_idx in range(constr_starts):
                                constraint_end_idx = constraint_start_idx + constraint_length
                                sub_constraint = constraint[constraint_start_idx:constraint_end_idx]
                                if sub_constraint in constraints:
                                    continue_ = True
                                    break
                            if continue_:
                                break
                        if continue_:
                            continue

                        constraints.add(constraint)

                        forbiddens = []
                        for i in range(len(product)):
                            forbiddens.append(
                                ForbiddenEqualsClause(conf_space.get_hyperparameter(
                                    node_names[i] + ":__choice__"),
                                    node_choice_names[i][product[i]]))
                        forbidden = ForbiddenAndConjunction(*forbiddens)
                        conf_space.add_forbidden_clause(forbidden)

    return conf_space
def read(pcs_string, debug=False):
    """
    Reads in a :py:class:`~ConfigSpace.configuration_space.ConfigurationSpace`
    definition from a pcs file.

    Example
    -------

    >>> from ConfigSpace.read_and_write import pcs_new
    >>> with open('configspace.pcs', 'r') as fh:
    >>>     restored_conf = pcs_new.read(fh)

    Parameters
    ----------
    pcs_string : iterable of str
        ConfigSpace definition in pcs format. It is iterated line by line
        (the example above passes an open file handle).
    debug : bool
        Provides debug information. Defaults to False.
        Not referenced anywhere in the function body.

    Returns
    -------
    :py:class:`~ConfigSpace.configuration_space.ConfigurationSpace`
        The restored ConfigurationSpace object

    Raises
    ------
    NotImplementedError
        If a condition line or a parameter line cannot be parsed, or if a
        forbidden clause uses an operator other than ``=``.
    """

    configuration_space = ConfigurationSpace()
    conditions = []
    forbidden = []

    # some statistics (only incremented here, never read in this function)
    ct = 0
    cont_ct = 0
    cat_ct = 0
    ord_ct = 0
    line_ct = 0

    # First pass: hyperparameters are added immediately; conditions and
    # forbidden clauses are only collected and handled after the loop.
    for line in pcs_string:
        line_ct += 1

        if "#" in line:
            # It contains a comment
            pos = line.find("#")
            line = line[:pos]

        # Remove quotes and whitespaces at beginning and end
        line = line.replace('"', "").replace("'", "")
        line = line.strip()
        if "|" in line:
            # It's a condition
            try:
                c = pp_condition.parseString(line)
                conditions.append(c)
            except pyparsing.ParseException:
                raise NotImplementedError("Could not parse condition: %s" % line)
            continue
        # Lines without a closing brace or bracket are silently ignored.
        if "}" not in line and "]" not in line:
            continue
        if line.startswith("{") and line.endswith("}"):
            # Forbidden clause; parsed once all hyperparameters are known.
            forbidden.append(line)
            continue
        if len(line.strip()) == 0:
            continue

        ct += 1
        param = None

        create = {"int": UniformIntegerHyperparameter,
                  "float": UniformFloatHyperparameter,
                  "categorical": CategoricalHyperparameter,
                  "ordinal": OrdinalHyperparameter
                  }

        # Try the line as a continuous (integer/real) parameter first.
        try:
            param_list = pp_cont_param.parseString(line)
            name = param_list[0]
            if param_list[1] == 'integer':
                paramtype = 'int'
            elif param_list[1] == 'real':
                paramtype = 'float'
            else:
                paramtype = None

            if paramtype in ['int', 'float']:
                # Tokens from position 10 on carry the optional "log" marker.
                log = param_list[10:]
                param_list = param_list[:10]
                if len(log) > 0:
                    log = log[0]
                lower = float(param_list[3])
                upper = float(param_list[5])
                log_on = True if "log" in log else False
                default_value = float(param_list[8])
                param = create[paramtype](name=name, lower=lower, upper=upper,
                                          q=None, log=log_on, default_value=default_value)
                cont_ct += 1

        except pyparsing.ParseException:
            pass

        # Then try it as a categorical or ordinal parameter.
        try:
            if "categorical" in line:
                param_list = pp_cat_param.parseString(line)
                name = param_list[0]
                # Every second token of the slice is kept as a choice.
                choices = [choice for choice in param_list[3:-4:2]]
                default_value = param_list[-2]
                param = create["categorical"](name=name, choices=choices,
                                              default_value=default_value)
                cat_ct += 1

            elif "ordinal" in line:
                param_list = pp_ord_param.parseString(line)
                name = param_list[0]
                sequence = [seq for seq in param_list[3:-4:2]]
                default_value = param_list[-2]
                param = create["ordinal"](name=name, sequence=sequence,
                                          default_value=default_value)
                ord_ct += 1

        except pyparsing.ParseException:
            pass

        if param is None:
            raise NotImplementedError("Could not parse: %s" % line)

        configuration_space.add_hyperparameter(param)

    for clause in forbidden:
        param_list = pp_forbidden_clause.parseString(clause)
        tmp_list = []
        clause_list = []
        # tmp_list collects three tokens (name, operator, value); the token
        # that follows them triggers the clause creation and is itself
        # discarded.
        # NOTE(review): a triple with no following token would never be
        # flushed - presumably the grammar emits a separator/terminator after
        # each triple; confirm against pp_forbidden_clause.
        for value in param_list[1:]:
            if len(tmp_list) < 3:
                tmp_list.append(value)
            else:
                # So far, only equals is supported by SMAC
                if tmp_list[1] == '=':
                    # TODO maybe add a check if the hyperparameter is
                    # actually in the configuration space
                    clause_list.append(ForbiddenEqualsClause(
                        configuration_space.get_hyperparameter(tmp_list[0]),
                        tmp_list[2]))
                else:
                    raise NotImplementedError()
                tmp_list = []
        configuration_space.add_forbidden_clause(ForbiddenAndConjunction(
            *clause_list))

    # Group the parsed conditions by child name, keeping first-seen order.
    conditions_per_child = OrderedDict()
    for condition in conditions:
        child_name = condition[0]
        if child_name not in conditions_per_child:
            conditions_per_child[child_name] = list()
        conditions_per_child[child_name].append(condition)

    for child_name in conditions_per_child:
        for condition in conditions_per_child[child_name]:
            # Drop the first two tokens and re-join the rest into one string
            # that is then split on "||" / "&&".
            condition = condition[2:]
            condition = ' '.join(condition)
            if '||' in str(condition):
                ors = []
                # 1st case we have a mixture of || and &&
                if '&&' in str(condition):
                    ors_combis = []
                    for cond_parts in str(condition).split('||'):
                        condition = str(cond_parts).split('&&')
                        # if length is 1 it must be or
                        if len(condition) == 1:
                            element_list = condition[0].split()
                            ors_combis.append(condition_specification(child_name, element_list, configuration_space))
                        else:
                            # now taking care of ands
                            ands = []
                            for and_part in condition:
                                # NOTE(review): the outer ``for part in
                                # condition`` is unused, so and_part's tokens
                                # are repeated len(condition) times -
                                # presumably condition_specification only
                                # reads the leading tokens; confirm.
                                element_list = [element for part in condition for element in and_part.split()]
                                ands.append(condition_specification(child_name, element_list, configuration_space))
                            ors_combis.append(AndConjunction(*ands))
                    mixed_conjunction = OrConjunction(*ors_combis)
                    configuration_space.add_condition(mixed_conjunction)
                else:
                    # 2nd case: we only have ors
                    for cond_parts in str(condition).split('||'):
                        element_list = [element for element in cond_parts.split()]
                        ors.append(condition_specification(child_name, element_list, configuration_space))
                    or_conjunction = OrConjunction(*ors)
                    configuration_space.add_condition(or_conjunction)
            else:
                # 3rd case: we only have ands
                if '&&' in str(condition):
                    ands = []
                    for cond_parts in str(condition).split('&&'):
                        element_list = [element for element in cond_parts.split()]
                        ands.append(condition_specification(child_name, element_list, configuration_space))
                    and_conjunction = AndConjunction(*ands)
                    configuration_space.add_condition(and_conjunction)
                else:
                    # 4th case: we have a normal condition
                    element_list = [element for element in condition.split()]
                    normal_condition = condition_specification(child_name, element_list, configuration_space)
                    configuration_space.add_condition(normal_condition)

    return configuration_space
def _get_hyperparameter_search_space(self, include=None, exclude=None,
                                     dataset_properties=None):
    """Create the hyperparameter configuration space.

    Builds the base space from ``self.steps`` and then forbids a fixed set
    of classifier/preprocessor combinations. The result is also stored in
    ``self.configuration_space_``, and the (possibly amended)
    ``dataset_properties`` in ``self.dataset_properties_``.

    Parameters
    ----------
    include : dict (optional, default=None)
        Forwarded to ``self._get_base_search_space``.
    exclude : dict (optional, default=None)
        Forwarded to ``self._get_base_search_space``.
    dataset_properties : dict (optional, default=None)
        Replaced by an empty dict when it is ``None`` or not a dict. Its
        ``'target_type'`` entry is always set to ``'classification'``;
        a dict passed by the caller is modified in place.

    Returns
    -------
    cs : ConfigurationSpace
        The configuration space with the forbidden clauses added.

    Raises
    ------
    ValueError
        If no remaining classifier can serve as a legal default.
    """
    cs = ConfigurationSpace()

    if dataset_properties is None or not isinstance(dataset_properties, dict):
        dataset_properties = dict()
    # Both checks together force target_type to 'classification'.
    if not 'target_type' in dataset_properties:
        dataset_properties['target_type'] = 'classification'
    if dataset_properties['target_type'] != 'classification':
        dataset_properties['target_type'] = 'classification'

    pipeline = self.steps
    cs = self._get_base_search_space(
        cs=cs, dataset_properties=dataset_properties,
        exclude=exclude, include=include, pipeline=pipeline)

    classifiers = cs.get_hyperparameter('classifier:__choice__').choices
    preprocessors = cs.get_hyperparameter('preprocessor:__choice__').choices
    # The last pipeline step provides the classifiers, the one before it the
    # preprocessors.
    available_classifiers = pipeline[-1][1].get_available_components(
        dataset_properties)
    # Assigned but not used further in this method.
    available_preprocessors = pipeline[-2][1].get_available_components(
        dataset_properties)

    # Fallback defaults: every available classifier except the current
    # default.
    possible_default_classifier = copy.copy(list(
        available_classifiers.keys()))
    default = cs.get_hyperparameter('classifier:__choice__').default
    del possible_default_classifier[possible_default_classifier.index(default)]

    # A classifier which can handle sparse data after the densifier is
    # forbidden for memory issues
    for key in classifiers:
        if SPARSE in available_classifiers[key].get_properties()['input']:
            if 'densifier' in preprocessors:
                # Retry until the clause is accepted. A ValueError from
                # add_forbidden_clause is answered by switching the
                # classifier default (presumably because the current default
                # would violate the new clause - confirm).
                while True:
                    try:
                        cs.add_forbidden_clause(
                            ForbiddenAndConjunction(
                                ForbiddenEqualsClause(
                                    cs.get_hyperparameter(
                                        'classifier:__choice__'), key),
                                ForbiddenEqualsClause(
                                    cs.get_hyperparameter(
                                        'preprocessor:__choice__'), 'densifier')
                            ))
                        # Success
                        break
                    except ValueError:
                        # Change the default and try again
                        try:
                            default = possible_default_classifier.pop()
                        except IndexError:
                            raise ValueError("Cannot find a legal default configuration.")
                        cs.get_hyperparameter(
                            'classifier:__choice__').default = default

    # which would take too long
    # Combinations of non-linear models with feature learning:
    # ("decision_tree" is listed twice in the original list.)
    classifiers_ = ["adaboost", "decision_tree", "extra_trees",
                    "gradient_boosting", "k_nearest_neighbors",
                    "libsvm_svc", "random_forest", "gaussian_nb",
                    "decision_tree", "xgradient_boosting"]
    feature_learning = ["kitchen_sinks", "nystroem_sampler"]

    for c, f in product(classifiers_, feature_learning):
        if c not in classifiers:
            continue
        if f not in preprocessors:
            continue
        while True:
            try:
                cs.add_forbidden_clause(ForbiddenAndConjunction(
                    ForbiddenEqualsClause(cs.get_hyperparameter(
                        "classifier:__choice__"), c),
                    ForbiddenEqualsClause(cs.get_hyperparameter(
                        "preprocessor:__choice__"), f)))
                break
            except KeyError:
                # Unlike the densifier loop above, a KeyError skips this
                # combination.
                break
            except ValueError as e:
                # Change the default and try again
                try:
                    default = possible_default_classifier.pop()
                except IndexError:
                    raise ValueError(
                        "Cannot find a legal default configuration.")
                cs.get_hyperparameter(
                    'classifier:__choice__').default = default

    # Won't work
    # Multinomial NB etc don't use with features learning, pca etc
    classifiers_ = ["multinomial_nb"]
    preproc_with_negative_X = ["kitchen_sinks", "pca", "truncatedSVD",
                               "fast_ica", "kernel_pca", "nystroem_sampler"]

    for c, f in product(classifiers_, preproc_with_negative_X):
        if c not in classifiers:
            continue
        if f not in preprocessors:
            continue
        while True:
            try:
                cs.add_forbidden_clause(ForbiddenAndConjunction(
                    ForbiddenEqualsClause(cs.get_hyperparameter(
                        "preprocessor:__choice__"), f),
                    ForbiddenEqualsClause(cs.get_hyperparameter(
                        "classifier:__choice__"), c)))
                break
            except KeyError:
                break
            except ValueError:
                # Change the default and try again
                try:
                    default = possible_default_classifier.pop()
                except IndexError:
                    raise ValueError(
                        "Cannot find a legal default configuration.")
                cs.get_hyperparameter(
                    'classifier:__choice__').default = default

    self.configuration_space_ = cs
    self.dataset_properties_ = dataset_properties
    return cs
def add_params(cs: ConfigurationSpace):
    """Add the ``xgb:*`` hyperparameters when ``XGBoost`` is a classifier choice.

    Nothing is added when ``cs`` has no ``classifier`` hyperparameter or when
    ``"XGBoost"`` is not among its choices. Each ``xgb:*`` hyperparameter is
    made conditional on ``classifier`` being ``"XGBoost"``.

    The function is best-effort: any ordinary exception raised while
    extending ``cs`` makes it return silently, possibly leaving ``cs``
    partially extended. The handler used to be a bare ``except:``, which also
    swallowed ``KeyboardInterrupt`` and ``SystemExit``; it now catches
    ``Exception`` only.
    """
    try:
        classifier = cs.get_hyperparameter("classifier")
        if "XGBoost" not in classifier.choices:
            return

        xgb_children = []

        def _register(hyperparameter):
            # Add to the space first, then remember it for the condition pass.
            cs.add_hyperparameter(hyperparameter)
            xgb_children.append(hyperparameter)

        _register(UniformIntegerHyperparameter(
            name="xgb:num_round", lower=10, upper=100, default_value=50, log=True))
        _register(UniformFloatHyperparameter(
            name="xgb:alpha", lower=0, upper=10, default_value=1))
        _register(UniformFloatHyperparameter(
            name="xgb:lambda", lower=1, upper=10, default_value=1))
        _register(UniformFloatHyperparameter(
            name="xgb:colsample_bylevel", lower=0.5, upper=1, default_value=1))
        _register(UniformFloatHyperparameter(
            name="xgb:colsample_bytree", lower=0.5, upper=1, default_value=1))
        _register(UniformFloatHyperparameter(
            name="xgb:subsample", lower=0.01, upper=1, default_value=1))
        _register(UniformFloatHyperparameter(
            name="xgb:max_delta_step", lower=0, upper=10, default_value=0))
        _register(UniformFloatHyperparameter(
            name="xgb:min_child_weight", lower=0, upper=20, default_value=1))
        _register(UniformIntegerHyperparameter(
            name="xgb:max_depth", lower=1, upper=10, default_value=6))
        _register(UniformFloatHyperparameter(
            name="xgb:gamma", lower=0, upper=10, default_value=0))
        _register(UniformFloatHyperparameter(
            name="xgb:eta", lower=0, upper=1, default_value=0.3))

        # Conditions are added only after all hyperparameters exist, in the
        # same order in which the hyperparameters were registered.
        for child in xgb_children:
            cs.add_condition(
                InCondition(child=child, parent=classifier, values=["XGBoost"]))
    except Exception:
        # Deliberate best-effort: give up quietly on any regular error.
        return
def read(pcs_string, debug=False):
    """
    Read in a :py:class:`~ConfigSpace.configuration_space.ConfigurationSpace`
    definition from a pcs file.

    Example
    -------

    .. testsetup:: pcs_new_test

        from ConfigSpace import ConfigurationSpace
        import ConfigSpace.hyperparameters as CSH
        from ConfigSpace.read_and_write import pcs_new
        cs = ConfigurationSpace()
        cs.add_hyperparameter(CSH.CategoricalHyperparameter('a', choices=[1, 2, 3]))
        with open('configspace.pcs_new', 'w') as f:
             f.write(pcs_new.write(cs))

    .. doctest:: pcs_new_test

        >>> from ConfigSpace.read_and_write import pcs_new
        >>> with open('configspace.pcs_new', 'r') as fh:
        ...     deserialized_conf = pcs_new.read(fh)

    Parameters
    ----------
    pcs_string : iterable of str
        ConfigSpace definition in pcs format, iterated line by line (for
        example an open file handle).
    debug : bool
        Accepted for interface compatibility; not used by this function.
        Defaults to False.

    Returns
    -------
    :py:class:`~ConfigSpace.configuration_space.ConfigurationSpace`
        The deserialized ConfigurationSpace object

    Raises
    ------
    NotImplementedError
        If a condition or parameter line cannot be parsed, or a forbidden
        clause uses an operator other than ``=``.
    ValueError
        If a forbidden value lies outside the bounds / allowed values of its
        hyperparameter.
    """
    configuration_space = ConfigurationSpace()
    conditions = []
    forbidden = []

    # Constructor lookup; hoisted out of the line loop because it is constant.
    create = {
        "int": UniformIntegerHyperparameter,
        "float": UniformFloatHyperparameter,
        "categorical": CategoricalHyperparameter,
        "ordinal": OrdinalHyperparameter,
    }

    def _to_condition(child_name, text):
        # Tokenise one atomic condition and hand it to the shared builder.
        return condition_specification(child_name, text.split(),
                                       configuration_space)

    for line in pcs_string:
        if "#" in line:
            # Drop the trailing comment
            line = line[:line.find("#")]

        # Remove quotes and whitespaces at beginning and end
        line = line.replace('"', "").replace("'", "")
        line = line.strip()

        if "|" in line:
            # It's a condition
            try:
                conditions.append(pp_condition.parseString(line))
            except pyparsing.ParseException as err:
                raise NotImplementedError(
                    "Could not parse condition: %s" % line) from err
            continue
        if "}" not in line and "]" not in line:
            # Neither a parameter nor a forbidden clause; this also skips
            # blank lines.
            continue
        if line.startswith("{") and line.endswith("}"):
            forbidden.append(line)
            continue

        param = None

        # First attempt: continuous (integer / real) parameter.
        try:
            param_list = pp_cont_param.parseString(line)
            name = param_list[0]
            if param_list[1] == 'integer':
                paramtype = 'int'
            elif param_list[1] == 'real':
                paramtype = 'float'
            else:
                paramtype = None

            if paramtype in ['int', 'float']:
                # Anything after the tenth token is the optional "log" flag.
                log = param_list[10:]
                param_list = param_list[:10]
                if len(log) > 0:
                    log = log[0]
                lower = float(param_list[3])
                upper = float(param_list[5])
                log_on = "log" in log
                default_value = float(param_list[8])
                param = create[paramtype](name=name, lower=lower,
                                          upper=upper, q=None, log=log_on,
                                          default_value=default_value)
        except pyparsing.ParseException:
            pass

        # Second attempt: categorical / ordinal parameter.
        try:
            if "categorical" in line:
                param_list = pp_cat_param.parseString(line)
                name = param_list[0]
                choices = list(param_list[3:-4:2])
                default_value = param_list[-2]
                param = create["categorical"](
                    name=name,
                    choices=choices,
                    default_value=default_value,
                )
            elif "ordinal" in line:
                param_list = pp_ord_param.parseString(line)
                name = param_list[0]
                sequence = list(param_list[3:-4:2])
                default_value = param_list[-2]
                param = create["ordinal"](
                    name=name,
                    sequence=sequence,
                    default_value=default_value,
                )
        except pyparsing.ParseException:
            pass

        if param is None:
            raise NotImplementedError("Could not parse: %s" % line)

        configuration_space.add_hyperparameter(param)

    for clause in forbidden:
        param_list = pp_forbidden_clause.parseString(clause)
        tmp_list = []
        clause_list = []
        for value in param_list[1:]:
            if len(tmp_list) < 3:
                tmp_list.append(value)
                continue

            # tmp_list now holds [name, operator, value]; the current token
            # is the separator / closing brace and is dropped.
            # So far, only equals is supported by SMAC
            if tmp_list[1] != '=':
                raise NotImplementedError()

            hp = configuration_space.get_hyperparameter(tmp_list[0])
            if isinstance(hp, NumericalHyperparameter):
                if isinstance(hp, IntegerHyperparameter):
                    forbidden_value = int(tmp_list[2])
                elif isinstance(hp, FloatHyperparameter):
                    forbidden_value = float(tmp_list[2])
                else:
                    raise NotImplementedError

                if forbidden_value < hp.lower or forbidden_value > hp.upper:
                    raise ValueError(
                        f'forbidden_value is set out of the bound, it needs to'
                        f' be set between [{hp.lower}, {hp.upper}]'
                        f' but its value is {forbidden_value}')
            elif isinstance(hp, (CategoricalHyperparameter,
                                 OrdinalHyperparameter)):
                hp_values = hp.choices \
                    if isinstance(hp, CategoricalHyperparameter) \
                    else hp.sequence
                if tmp_list[2] in hp_values:
                    forbidden_value = tmp_list[2]
                else:
                    # Report the offending token itself: ``forbidden_value``
                    # is unassigned (or stale from an earlier clause) here.
                    raise ValueError(
                        f'forbidden_value is set out of the allowed value '
                        f'sets, it needs to be one member from {hp_values} '
                        f'but its value is {tmp_list[2]}')
            else:
                raise ValueError('Unsupported Hyperparamter sorts')

            clause_list.append(ForbiddenEqualsClause(hp, forbidden_value))
            tmp_list = []

        configuration_space.add_forbidden_clause(
            ForbiddenAndConjunction(*clause_list))

    # Group the parsed conditions by the child they constrain, keeping the
    # order in which children first appear.
    conditions_per_child = OrderedDict()
    for condition in conditions:
        conditions_per_child.setdefault(condition[0], []).append(condition)

    for child_name, child_conditions in conditions_per_child.items():
        for condition in child_conditions:
            # Tokens after "<child> |" re-joined into one expression string.
            expression = ' '.join(condition[2:])

            if '||' in expression:
                if '&&' in expression:
                    # 1st case: a mixture of || and &&
                    ors_combis = []
                    for or_part in expression.split('||'):
                        and_parts = or_part.split('&&')
                        if len(and_parts) == 1:
                            # A lone term between two ||
                            ors_combis.append(
                                _to_condition(child_name, and_parts[0]))
                        else:
                            # Each && term is tokenised on its own (the
                            # tokens used to be repeated once per && term).
                            ands = [_to_condition(child_name, and_part)
                                    for and_part in and_parts]
                            ors_combis.append(AndConjunction(*ands))
                    configuration_space.add_condition(
                        OrConjunction(*ors_combis))
                else:
                    # 2nd case: we only have ors
                    ors = [_to_condition(child_name, or_part)
                           for or_part in expression.split('||')]
                    configuration_space.add_condition(OrConjunction(*ors))
            elif '&&' in expression:
                # 3rd case: we only have ands
                ands = [_to_condition(child_name, and_part)
                        for and_part in expression.split('&&')]
                configuration_space.add_condition(AndConjunction(*ands))
            else:
                # 4th case: we have a normal condition
                configuration_space.add_condition(
                    _to_condition(child_name, expression))

    return configuration_space
def get_hyperparameter_search_space(cls, include=None, exclude=None,
                                    dataset_properties=None):
    """Assemble the configuration space of the regression pipeline.

    The space built by ``cls._get_hyperparameter_search_space`` is extended
    with forbidden clauses for regressor / preprocessor pairs that must not
    be combined.  Whenever adding a clause fails with ``ValueError`` the
    default regressor is swapped for another candidate and the clause is
    retried.

    Parameters
    ----------
    include : optional
        Forwarded unchanged to ``cls._get_hyperparameter_search_space``.
    exclude : optional
        Forwarded unchanged to ``cls._get_hyperparameter_search_space``.
    dataset_properties : dict, optional
        Anything that is not a dict is replaced by a new dict.  The dict is
        updated in place: ``'target_type'`` is forced to ``'regression'``
        and ``'sparse'`` defaults to ``False``.

    Returns
    -------
    cs
        The configuration space with the forbidden clauses added.

    Raises
    ------
    ValueError
        "Cannot find a legal default configuration." once every candidate
        default regressor has been tried without success.
    """
    regressor_key = 'regressor:__choice__'
    preprocessor_key = 'preprocessor:__choice__'

    cs = ConfigurationSpace()

    if dataset_properties is None or not isinstance(dataset_properties, dict):
        dataset_properties = dict()
    # Whatever the caller passed, this pipeline always targets regression.
    dataset_properties['target_type'] = 'regression'
    if 'sparse' not in dataset_properties:
        # This dataset is probably dense
        dataset_properties['sparse'] = False

    pipeline = cls._get_pipeline()
    cs = cls._get_hyperparameter_search_space(cs, dataset_properties,
                                              exclude, include, pipeline)

    regressors = cs.get_hyperparameter(regressor_key).choices
    preprocessors = cs.get_hyperparameter(preprocessor_key).choices
    available_regressors = pipeline[-1][1].get_available_components(
        dataset_properties)
    # The result is not used below; the call itself is kept as in the
    # original in case the lookup matters to the pipeline.
    available_preprocessors = pipeline[-2][1].get_available_components(
        dataset_properties)

    # Candidate replacement defaults: every available regressor except the
    # current default.
    fallback_defaults = list(available_regressors.keys())
    fallback_defaults.remove(cs.get_hyperparameter(regressor_key).default)

    def _forbid(first, second, tolerate_missing):
        # Forbid the (key, value) pair combination; on ValueError switch the
        # default regressor and try again.
        while True:
            try:
                cs.add_forbidden_clause(ForbiddenAndConjunction(
                    ForbiddenEqualsClause(
                        cs.get_hyperparameter(first[0]), first[1]),
                    ForbiddenEqualsClause(
                        cs.get_hyperparameter(second[0]), second[1]),
                ))
                return
            except KeyError:
                if tolerate_missing:
                    return
                raise
            except ValueError:
                # Change the default and try again
                if not fallback_defaults:
                    raise ValueError(
                        "Cannot find a legal default configuration.")
                default = fallback_defaults.pop()
                cs.get_hyperparameter(regressor_key).default = default

    # A regressor which can handle sparse data is never placed after the
    # densifier.
    for key in regressors:
        properties = available_regressors[key].get_properties(
            dataset_properties=None)
        if SPARSE not in properties['input']:
            continue
        if 'densifier' not in preprocessors:
            continue
        _forbid((regressor_key, key), (preprocessor_key, 'densifier'),
                tolerate_missing=False)

    # Combinations of tree-based models with feature learning, which would
    # take too long.
    regressors_ = ["adaboost", "decision_tree", "extra_trees",
                   "gaussian_process", "gradient_boosting",
                   "k_nearest_neighbors", "random_forest",
                   "xgradient_boosting"]
    feature_learning_ = ["kitchen_sinks", "kernel_pca", "nystroem_sampler"]

    for r, f in product(regressors_, feature_learning_):
        if r not in regressors or f not in preprocessors:
            continue
        _forbid(("regressor:__choice__", r), ("preprocessor:__choice__", f),
                tolerate_missing=True)

    return cs
def get_hyperspace(data_info, include_estimators=None,
                   include_preprocessors=None):
    """Build the configuration space for the task described by ``data_info``.

    Parameters
    ----------
    data_info : dict
        Dataset description.  Anything that is not a dict is replaced by a
        new dict.  ``data_info['task']`` is required.  The dict is updated in
        place (``'is_sparse'``, ``'signed'``, ``'target_type'`` and, for
        classification tasks, ``'multilabel'`` / ``'multiclass'``).
    include_estimators : optional
        If given, stored under the ``'classifier'`` / ``'regressor'`` key of
        the include mapping handed to the pipeline helpers.
    include_preprocessors : optional
        If given, stored under the ``'preprocessor'`` key of the include
        mapping.

    Returns
    -------
    cs
        The configuration space including all forbidden clauses.

    Raises
    ------
    KeyError
        If ``data_info`` has no ``'task'`` entry.
    NotImplementedError
        If the task is neither a classification nor a regression task.
    ValueError
        For an include key that is not a pipeline step, or when no legal
        default estimator is left.
    """
    if data_info is None or not isinstance(data_info, dict):
        data_info = dict()

    # Without an explicit flag the dataset is probably dense.
    data_info.setdefault('is_sparse', False)
    task_type = data_info['task']

    if task_type in CLASSIFICATION_TASKS:
        data_info['multilabel'] = (task_type == MULTILABEL_CLASSIFICATION)
        data_info['multiclass'] = (task_type == MULTICLASS_CLASSIFICATION)
        data_info['target_type'] = 'classification'
        pipe_type = 'classifier'

        # Components match to be forbidden
        # NOTE: "decision_tree" is listed twice, exactly as in the original.
        components_ = ["adaboost", "decision_tree", "extra_trees",
                       "gradient_boosting", "k_nearest_neighbors",
                       "libsvm_svc", "random_forest", "gaussian_nb",
                       "decision_tree"]
        feature_learning_ = ["kitchen_sinks", "nystroem_sampler"]
    elif task_type in REGRESSION_TASKS:
        data_info['target_type'] = 'regression'
        pipe_type = 'regressor'

        # Components match to be forbidden
        components_ = ["adaboost", "decision_tree", "extra_trees",
                       "gaussian_process", "gradient_boosting",
                       "k_nearest_neighbors", "random_forest"]
        feature_learning_ = ["kitchen_sinks", "kernel_pca",
                             "nystroem_sampler"]
    else:
        raise NotImplementedError()

    include = {}
    exclude = {}
    if include_preprocessors is not None:
        include["preprocessor"] = include_preprocessors
    if include_estimators is not None:
        include[pipe_type] = include_estimators

    cs = ConfigurationSpace()

    # Construct pipeline
    # FIXME OrderedDict?
    pipeline = get_pipeline(task_type)

    # TODO include, exclude, pipeline
    keys = [step[0] for step in pipeline]
    for key in include:
        if key not in keys:
            raise ValueError('Invalid key in include: %s; should be one '
                             'of %s' % (key, keys))
    for key in exclude:
        if key not in keys:
            raise ValueError('Invalid key in exclude: %s; should be one '
                             'of %s' % (key, keys))

    # Construct hyperspace
    # TODO: clarify what 'signed' stands for.
    # Without an explicit flag the dataset probably contains unsigned data.
    data_info.setdefault('signed', False)

    match = check_pipeline(pipeline, data_info,
                           include=include, exclude=exclude)

    # Now we have only legal combinations at this step of the pipeline
    # Simple sanity checks
    n_legal = np.sum(match)
    assert n_legal != 0, "No valid pipeline found."
    assert n_legal <= np.size(match), \
        "'matches' is not binary; %s <= %d, %s" % \
        (str(n_legal), np.size(match), str(match.shape))

    # Walk over the pipeline steps and add the sub-space of each one.
    for node_idx, (node_name, node) in enumerate(pipeline):
        if hasattr(node, "get_available_components"):
            # A choice node: only its legal choices are offered.
            choices_list = find_active_choices(
                match, node, node_idx, data_info,
                include=include.get(node_name),
                exclude=exclude.get(node_name))
            sub_space = node.get_hyperparameter_search_space(
                data_info, include=choices_list)
        else:
            # A plain node is always active, so it is added as is.
            sub_space = node.get_hyperparameter_search_space(data_info)
        cs.add_configuration_space(node_name, sub_space)

    # And now add forbidden parameter configurations according to matches
    if np.sum(match) < np.size(match):
        cs = add_forbidden(conf_space=cs, pipeline=pipeline, matches=match,
                           dataset_properties=data_info, include=include,
                           exclude=exclude)

    estimator_key = '%s:__choice__' % pipe_type
    components = cs.get_hyperparameter(estimator_key).choices
    availables = pipeline[-1][1].get_available_components(data_info)
    preprocessors = cs.get_hyperparameter('preprocessor:__choice__').choices

    # Candidate replacement defaults: every available estimator except the
    # current default.
    fallback_defaults = list(availables.keys())
    fallback_defaults.remove(cs.get_hyperparameter(estimator_key).default)

    def _forbid(first, second, tolerate_missing):
        # Forbid the (key, value) pair combination; on ValueError switch the
        # default estimator and try again.
        while True:
            try:
                cs.add_forbidden_clause(ForbiddenAndConjunction(
                    ForbiddenEqualsClause(
                        cs.get_hyperparameter(first[0]), first[1]),
                    ForbiddenEqualsClause(
                        cs.get_hyperparameter(second[0]), second[1]),
                ))
                return
            except KeyError:
                if tolerate_missing:
                    return
                raise
            except ValueError:
                # Change the default and try again
                if not fallback_defaults:
                    raise ValueError(
                        "Cannot find a legal default configuration.")
                default = fallback_defaults.pop()
                cs.get_hyperparameter(estimator_key).default = default

    # An estimator which can handle sparse data after the densifier is
    # forbidden for memory issues
    for key in components:
        # TODO regression dataset_properties=None
        if SPARSE not in availables[key].get_properties()['input']:
            continue
        if 'densifier' not in preprocessors:
            continue
        _forbid((estimator_key, key),
                ('preprocessor:__choice__', 'densifier'),
                tolerate_missing=False)

    # Combinations of non-linear models with feature learning, which would
    # take too long.
    for c, f in itertools.product(components_, feature_learning_):
        if c not in components or f not in preprocessors:
            continue
        _forbid((estimator_key, c), ("preprocessor:__choice__", f),
                tolerate_missing=True)

    if task_type in CLASSIFICATION_TASKS:
        # Won't work
        # Multinomial NB etc don't use with features learning, pca etc
        components_ = ["multinomial_nb"]
        preproc_with_negative_X = ["kitchen_sinks", "pca", "truncatedSVD",
                                   "fast_ica", "kernel_pca",
                                   "nystroem_sampler"]
        for c, f in itertools.product(components_, preproc_with_negative_X):
            if c not in components or f not in preprocessors:
                continue
            # Here the preprocessor clause comes first, as in the original.
            _forbid(("preprocessor:__choice__", f),
                    ("classifier:__choice__", c),
                    tolerate_missing=True)

    return cs
def get_hyperparameter_search_space(cls, include=None, exclude=None,
                                    dataset_properties=None):
    """Create the hyperparameter configuration space for classification.

    The space built by ``cls._get_hyperparameter_search_space`` is extended
    with forbidden clauses for classifier / preprocessor pairs that must not
    be combined.  Whenever adding a clause fails with ``ValueError`` the
    default classifier is swapped for another candidate and the clause is
    retried.

    Parameters
    ----------
    include : dict (optional, default=None)
        Forwarded unchanged to ``cls._get_hyperparameter_search_space``.
    exclude : optional
        Forwarded unchanged to ``cls._get_hyperparameter_search_space``.
    dataset_properties : dict, optional
        Anything that is not a dict is replaced by a new dict.  The dict is
        updated in place: ``'target_type'`` is forced to
        ``'classification'``.

    Returns
    -------
    cs
        The configuration space with the forbidden clauses added.

    Raises
    ------
    ValueError
        "Cannot find a legal default configuration." once every candidate
        default classifier has been tried without success.
    """
    classifier_key = 'classifier:__choice__'
    preprocessor_key = 'preprocessor:__choice__'

    cs = ConfigurationSpace()

    if dataset_properties is None or not isinstance(dataset_properties, dict):
        dataset_properties = dict()
    # Whatever the caller passed, this pipeline always targets
    # classification.
    dataset_properties['target_type'] = 'classification'

    pipeline = cls._get_pipeline()
    cs = cls._get_hyperparameter_search_space(cs, dataset_properties,
                                              exclude, include, pipeline)

    classifiers = cs.get_hyperparameter(classifier_key).choices
    preprocessors = cs.get_hyperparameter(preprocessor_key).choices
    available_classifiers = pipeline[-1][1].get_available_components(
        dataset_properties)
    # The result is not used below; the call itself is kept as in the
    # original in case the lookup matters to the pipeline.
    available_preprocessors = pipeline[-2][1].get_available_components(
        dataset_properties)

    # Candidate replacement defaults: every available classifier except the
    # current default.
    fallback_defaults = list(available_classifiers.keys())
    fallback_defaults.remove(cs.get_hyperparameter(classifier_key).default)

    def _forbid(first, second, tolerate_missing):
        # Forbid the (key, value) pair combination; on ValueError switch the
        # default classifier and try again.
        while True:
            try:
                cs.add_forbidden_clause(ForbiddenAndConjunction(
                    ForbiddenEqualsClause(
                        cs.get_hyperparameter(first[0]), first[1]),
                    ForbiddenEqualsClause(
                        cs.get_hyperparameter(second[0]), second[1]),
                ))
                return
            except KeyError:
                if tolerate_missing:
                    return
                raise
            except ValueError:
                # Change the default and try again
                if not fallback_defaults:
                    raise ValueError(
                        "Cannot find a legal default configuration.")
                default = fallback_defaults.pop()
                cs.get_hyperparameter(classifier_key).default = default

    # A classifier which can handle sparse data after the densifier is
    # forbidden for memory issues
    for key in classifiers:
        if SPARSE not in available_classifiers[key].get_properties()['input']:
            continue
        if 'densifier' not in preprocessors:
            continue
        _forbid((classifier_key, key), (preprocessor_key, 'densifier'),
                tolerate_missing=False)

    # Combinations of non-linear models with feature learning, which would
    # take too long.
    # NOTE: "decision_tree" is listed twice, exactly as in the original.
    classifiers_ = ["adaboost", "decision_tree", "extra_trees",
                    "gradient_boosting", "k_nearest_neighbors",
                    "libsvm_svc", "random_forest", "gaussian_nb",
                    "decision_tree", "xgradient_boosting"]
    feature_learning = ["kitchen_sinks", "nystroem_sampler"]

    for c, f in product(classifiers_, feature_learning):
        if c not in classifiers or f not in preprocessors:
            continue
        _forbid(("classifier:__choice__", c), ("preprocessor:__choice__", f),
                tolerate_missing=True)

    # Won't work
    # Multinomial NB etc don't use with features learning, pca etc
    classifiers_ = ["multinomial_nb"]
    preproc_with_negative_X = ["kitchen_sinks", "pca", "truncatedSVD",
                               "fast_ica", "kernel_pca", "nystroem_sampler"]

    for c, f in product(classifiers_, preproc_with_negative_X):
        if c not in classifiers or f not in preprocessors:
            continue
        # Here the preprocessor clause comes first, as in the original.
        _forbid(("preprocessor:__choice__", f), ("classifier:__choice__", c),
                tolerate_missing=True)

    return cs
def read(pcs_string, debug=False):
    """
    Reads in a :py:class:`~ConfigSpace.configuration_space.ConfigurationSpace`
    definition from a pcs file.

    Example
    -------

    >>> from ConfigSpace.read_and_write import pcs
    >>> with open('configspace.pcs', 'r') as fh:
    ...     restored_conf = pcs.read(fh)

    Parameters
    ----------
    pcs_string : iterable of str
        ConfigSpace definition in pcs format, iterated line by line (for
        example an open file handle).
    debug : bool
        Accepted for interface compatibility; not used by this function.
        Defaults to False.

    Returns
    -------
    :py:class:`~ConfigSpace.configuration_space.ConfigurationSpace`
        The restored ConfigurationSpace object

    Raises
    ------
    NotImplementedError
        If a condition or parameter line cannot be parsed, or a forbidden
        clause uses an operator other than ``=``.
    """
    configuration_space = ConfigurationSpace()
    condition_tokens = []
    forbidden_lines = []

    for raw_line in pcs_string:
        # Cut off a trailing comment, then remove quotes and the whitespace
        # at both ends.
        line = raw_line.partition("#")[0]
        line = line.replace('"', "").replace("'", "").strip()

        if "|" in line:
            # It's a condition
            try:
                condition_tokens.append(pp_condition.parseString(line))
            except pyparsing.ParseException:
                raise NotImplementedError(
                    "Could not parse condition: %s" % line)
            continue
        if "}" not in line and "]" not in line:
            continue
        if line.startswith("{") and line.endswith("}"):
            forbidden_lines.append(line)
            continue
        if not line:
            continue

        param = None

        # First attempt: continuous parameter.
        try:
            tokens = pp_cont_param.parseString(line)
            # An optional trailing flag token: "i" marks an integer, "l" a
            # log scale.
            flags = tokens[9:]
            if len(flags) > 0:
                flags = flags[0]
            tokens = tokens[:9]
            if "i" in flags:
                hp_type = UniformIntegerHyperparameter
            else:
                hp_type = UniformFloatHyperparameter
            param = hp_type(name=tokens[0],
                            lower=float(tokens[2]),
                            upper=float(tokens[4]),
                            q=None,
                            log="l" in flags,
                            default_value=float(tokens[7]))
        except pyparsing.ParseException:
            pass

        # Second attempt: categorical parameter.
        try:
            tokens = pp_cat_param.parseString(line)
            param = CategoricalHyperparameter(
                name=tokens[0],
                choices=list(tokens[2:-4:2]),
                default_value=tokens[-2])
        except pyparsing.ParseException:
            pass

        if param is None:
            raise NotImplementedError("Could not parse: %s" % line)

        configuration_space.add_hyperparameter(param)

    for clause in forbidden_lines:
        # TODO test this properly!
        # TODO Add a try/catch here!
        tokens = pp_forbidden_clause.parseString(clause)
        triple = []
        equals_clauses = []
        for token in tokens[1:]:
            if len(triple) < 3:
                triple.append(token)
                continue
            # ``triple`` is [name, operator, value]; the current token is the
            # separator / closing brace and is dropped.
            hp_name, operator, hp_value = triple
            # So far, only equals is supported by SMAC
            if operator != '=':
                raise NotImplementedError()
            # TODO maybe add a check if the hyperparameter is
            # actually in the configuration space
            equals_clauses.append(ForbiddenEqualsClause(
                configuration_space.get_hyperparameter(hp_name), hp_value))
            triple = []
        configuration_space.add_forbidden_clause(
            ForbiddenAndConjunction(*equals_clauses))

    # Now handle conditions
    # Several conditions on the same child form an AND-conjunction, so they
    # are grouped by child first.
    conditions_per_child = OrderedDict()
    for tokens in condition_tokens:
        conditions_per_child.setdefault(tokens[0], []).append(tokens)

    for child_name, child_conditions in conditions_per_child.items():
        child = configuration_space.get_hyperparameter(child_name)
        condition_objects = []
        for tokens in child_conditions:
            parent = configuration_space.get_hyperparameter(tokens[2])
            restrictions = tokens[5:-1:2]

            # TODO: cast the type of the restriction!
            if len(restrictions) == 1:
                built = EqualsCondition(child, parent, restrictions[0])
            else:
                built = InCondition(child, parent, values=restrictions)
            condition_objects.append(built)

        # A single condition is added as is; two or more are wrapped in one
        # AND-conjunction.
        if len(condition_objects) > 1:
            configuration_space.add_condition(
                AndConjunction(*condition_objects))
        else:
            configuration_space.add_condition(condition_objects[0])

    return configuration_space
def _get_hyperparameter_search_space(self, include=None, exclude=None,
                                     dataset_properties=None):
    """Assemble the configuration space of the regression pipeline.

    The space built by ``self._get_base_search_space`` is extended with
    forbidden clauses for regressor / feature-preprocessor pairs that must
    not be combined.  Whenever adding a clause fails with ``ValueError`` the
    default regressor is swapped for another candidate and the clause is
    retried.  The result is also stored on the instance.

    Parameters
    ----------
    include : optional
        Forwarded unchanged to ``self._get_base_search_space``.
    exclude : optional
        Forwarded unchanged to ``self._get_base_search_space``.
    dataset_properties : dict, optional
        Anything that is not a dict is replaced by a new dict.  The dict is
        updated in place: ``'target_type'`` is forced to ``'regression'``
        and ``'sparse'`` defaults to ``False``.

    Returns
    -------
    cs
        The configuration space with the forbidden clauses added; also
        saved as ``self.configuration_space_`` (with the properties saved
        as ``self.dataset_properties_``).

    Raises
    ------
    ValueError
        "Cannot find a legal default configuration." once every candidate
        default regressor has been tried without success.
    """
    regressor_key = 'regressor:__choice__'
    preprocessor_key = 'feature_preprocessor:__choice__'

    cs = ConfigurationSpace()

    if dataset_properties is None or not isinstance(dataset_properties, dict):
        dataset_properties = dict()
    # Whatever the caller passed, this pipeline always targets regression.
    dataset_properties['target_type'] = 'regression'
    if 'sparse' not in dataset_properties:
        # This dataset is probably dense
        dataset_properties['sparse'] = False

    cs = self._get_base_search_space(
        cs=cs, dataset_properties=dataset_properties,
        exclude=exclude, include=include, pipeline=self.steps)

    regressors = cs.get_hyperparameter(regressor_key).choices
    preprocessors = cs.get_hyperparameter(preprocessor_key).choices
    available_regressors = self._final_estimator.get_available_components(
        dataset_properties)

    # Candidate replacement defaults: every available regressor except the
    # current default.
    fallback_defaults = list(available_regressors.keys())
    fallback_defaults.remove(
        cs.get_hyperparameter(regressor_key).default_value)

    def _forbid(first, second, tolerate_missing):
        # Forbid the (key, value) pair combination; on ValueError switch the
        # default regressor and try again.
        while True:
            try:
                cs.add_forbidden_clause(ForbiddenAndConjunction(
                    ForbiddenEqualsClause(
                        cs.get_hyperparameter(first[0]), first[1]),
                    ForbiddenEqualsClause(
                        cs.get_hyperparameter(second[0]), second[1]),
                ))
                return
            except KeyError:
                if tolerate_missing:
                    return
                raise
            except ValueError:
                # Change the default and try again
                if not fallback_defaults:
                    raise ValueError(
                        "Cannot find a legal default configuration.")
                default = fallback_defaults.pop()
                cs.get_hyperparameter(regressor_key).default_value = default

    # A regressor which can handle sparse data after the densifier is
    # forbidden for memory issues
    for key in regressors:
        properties = available_regressors[key].get_properties(
            dataset_properties=None)
        if SPARSE not in properties['input']:
            continue
        if 'densifier' not in preprocessors:
            continue
        _forbid((regressor_key, key), (preprocessor_key, 'densifier'),
                tolerate_missing=False)

    # Combinations of tree-based models with feature learning, which would
    # take too long.
    regressors_ = ["adaboost", "decision_tree", "extra_trees",
                   "gaussian_process", "gradient_boosting",
                   "k_nearest_neighbors", "random_forest",
                   "xgradient_boosting"]
    feature_learning_ = ["kitchen_sinks", "kernel_pca", "nystroem_sampler"]

    for r, f in product(regressors_, feature_learning_):
        if r not in regressors or f not in preprocessors:
            continue
        _forbid(("regressor:__choice__", r),
                ("feature_preprocessor:__choice__", f),
                tolerate_missing=True)

    self.configuration_space_ = cs
    self.dataset_properties_ = dataset_properties
    return cs
class AutoFolio(object):
    '''
    Command-line driven algorithm-selection pipeline.

    The class builds a configuration space for a scenario, optionally tunes
    it with SMAC, fits feature preprocessing / pre-solving / a selector and
    predicts per-instance algorithm schedules.
    '''

    def __init__(self, random_seed: int = 12345):
        '''
        Constructor

        Arguments
        ---------
        random_seed: int
            random seed for numpy and random packages
        '''
        np.random.seed(random_seed)  # fix seed
        random.seed(random_seed)

        # I don't know the reason, but without an initial print with
        # logging.info we don't get any output
        logging.info("Init AutoFolio")
        self._root_logger = logging.getLogger()
        self.logger = logging.getLogger("AutoFolio")
        # configuration space; set by get_cs()
        self.cs = None

        # extra command line arguments used to overwrite configurations
        self.overwrite_args = None

    def run_cli(self):
        '''
        main method of AutoFolio based on command line interface

        Raises
        ------
        ValueError
            if neither a saved model, an ASlib scenario nor a pair of
            performance/feature csv files was given
        '''
        cmd_parser = CMDParser()
        args_, self.overwrite_args = cmd_parser.parse()

        self._root_logger.setLevel(args_.verbose)

        if args_.load:
            # prediction mode: no training data needed
            self.read_model_and_predict(
                model_fn=args_.load,
                feature_vec=list(map(float, args_.feature_vec)))
            return

        scenario = ASlibScenario()
        if args_.scenario:
            scenario.read_scenario(args_.scenario)
        elif args_.performance_csv and args_.feature_csv:
            scenario.read_from_csv(perf_fn=args_.performance_csv,
                                   feat_fn=args_.feature_csv,
                                   objective=args_.objective,
                                   runtime_cutoff=args_.runtime_cutoff,
                                   maximize=args_.maximize)
        else:
            # fail early instead of working on an empty scenario
            raise ValueError("Missing inputs to read scenario data.")

        self.cs = self.get_cs(scenario)

        if args_.tune:
            config = self.get_tuned_config(scenario)
        else:
            config = self.cs.get_default_configuration()
        self.logger.debug(config)

        if args_.save:
            feature_pre_pipeline, pre_solver, selector = self.fit(
                scenario=scenario, config=config)
            self._save_model(
                args_.save, scenario, feature_pre_pipeline, pre_solver,
                selector, config)
        else:
            self.run_cv(config=config, scenario=scenario, folds=10)

    def _save_model(self, out_fn: str, scenario: ASlibScenario,
                    feature_pre_pipeline: list, pre_solver: Aspeed,
                    selector, config: Configuration):
        '''
        save all pipeline objects for predictions

        Arguments
        ---------
        out_fn: str
            filename of output file
        scenario: AslibScenario
            ASlib scenario with all the data
        feature_pre_pipeline: list
            list of preprocessing objects
        pre_solver: Aspeed
            aspeed object with pre-solving schedule
        selector: autofolio.selector.*
            fitted selector object
        config: Configuration
            parameter setting configuration
        '''
        # loggers are detached before pickling; read_model_and_predict()
        # attaches fresh ones after loading
        scenario.logger = None
        for fpp in feature_pre_pipeline:
            fpp.logger = None
        if pre_solver:
            pre_solver.logger = None
        selector.logger = None
        model = [scenario, feature_pre_pipeline, pre_solver, selector, config]
        with open(out_fn, "bw") as fp:
            pickle.dump(model, fp)

    def read_model_and_predict(self, model_fn: str, feature_vec: list):
        '''
        reads saved model from disk and predicts (and prints) the selected
        algorithm schedule for a given feature vector

        Arguments
        --------
        model_fn: str
            file name of saved model
        feature_vec: list
            instance feature vector as a list of floats
        '''
        # NOTE: pickle.load executes arbitrary code from the file;
        # only load model files from trusted sources.
        with open(model_fn, "br") as fp:
            (scenario, feature_pre_pipeline, pre_solver,
             selector, config) = pickle.load(fp)

        for fpp in feature_pre_pipeline:
            fpp.logger = logging.getLogger("Feature Preprocessing")
        if pre_solver:
            pre_solver.logger = logging.getLogger("Aspeed PreSolving")
        selector.logger = logging.getLogger("Selector")

        # saved scenario is adapted to given feature vector
        feature_vec = np.array([feature_vec])
        scenario.feature_data = pd.DataFrame(
            feature_vec, index=["pseudo_instance"],
            columns=scenario.feature_names)
        scenario.instances = ["pseudo_instance"]
        pred = self.predict(scenario=scenario, config=config,
                            feature_pre_pipeline=feature_pre_pipeline,
                            pre_solver=pre_solver, selector=selector)

        print("Selected Schedule [(algorithm, budget)]: %s" % (
            pred["pseudo_instance"]))

    def get_cs(self, scenario: ASlibScenario):
        '''
        returns the parameter configuration space of AutoFolio
        (based on the automl config space:
        https://github.com/automl/ConfigSpace)

        The space is also stored in self.cs.

        Arguments
        ---------
        scenario: autofolio.data.aslib_scenario.ASlibScenario
            aslib scenario at hand
        '''
        self.cs = ConfigurationSpace()

        # add feature steps as binary parameters
        for fs in scenario.feature_steps:
            fs_param = CategoricalHyperparameter(
                name="fgroup_%s" % (fs), choices=[True, False],
                default=fs in scenario.feature_steps_default)
            self.cs.add_hyperparameter(fs_param)

        # preprocessing
        PCAWrapper.add_params(self.cs)
        ImputerWrapper.add_params(self.cs)
        StandardScalerWrapper.add_params(self.cs)

        # Pre-Solving
        if scenario.performance_type[0] == "runtime":
            Aspeed.add_params(
                cs=self.cs, cutoff=scenario.algorithm_cutoff_time)

        # classifiers
        RandomForest.add_params(self.cs)

        # selectors
        PairwiseClassifier.add_params(self.cs)

        return self.cs

    def get_tuned_config(self, scenario: ASlibScenario):
        '''
        uses SMAC3 to determine a well-performing configuration in the
        configuration space self.cs on the given scenario

        Arguments
        ---------
        scenario: ASlibScenario
            ASlib Scenario at hand

        Returns
        -------
        Configuration
            best incumbent configuration found by SMAC
        '''
        taf = ExecuteTAFunc(functools.partial(self.run_cv, scenario=scenario))

        ac_scenario = Scenario({"run_obj": "quality",  # we optimize quality
                                # at most 10 function evaluations
                                "runcount-limit": 10,
                                "cs": self.cs,  # configuration space
                                "deterministic": "true"
                                })

        # necessary to use stats options related to scenario information
        AC_Stats.scenario = ac_scenario

        # Optimize
        banner = ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
        self.logger.info(banner)
        self.logger.info("Start Configuration")
        self.logger.info(banner)
        smbo = SMBO(scenario=ac_scenario, tae_runner=taf,
                    rng=np.random.RandomState(42))
        smbo.run(max_iters=999)

        AC_Stats.print_stats()
        self.logger.info("Final Incumbent: %s" % (smbo.incumbent))

        return smbo.incumbent

    def run_cv(self, config: Configuration, scenario: ASlibScenario, folds=10):
        '''
        run a cross fold validation based on the given data from cv.arff

        Arguments
        ---------
        scenario: autofolio.data.aslib_scenario.ASlibScenario
            aslib scenario at hand
        config: Configuration
            parameter configuration to use for preprocessing
        folds: int
            number of cv-splits

        Returns
        -------
        value returned by Stats.show() over all folds, sign-flipped if the
        scenario is maximized; on a ValueError a penalty of 10 times the
        algorithm cutoff time is returned instead
        '''
        maximize = scenario.maximize[0]
        try:
            if scenario.performance_type[0] == "runtime":
                cv_stat = Stats(runtime_cutoff=scenario.algorithm_cutoff_time)
            else:
                cv_stat = Stats(runtime_cutoff=0)

            for i in range(1, folds + 1):
                self.logger.info("CV-Iteration: %d" % (i))
                test_scenario, training_scenario = scenario.get_split(indx=i)

                feature_pre_pipeline, pre_solver, selector = self.fit(
                    scenario=training_scenario, config=config)

                schedules = self.predict(
                    test_scenario, config, feature_pre_pipeline,
                    pre_solver, selector)

                val = Validator()
                if scenario.performance_type[0] == "runtime":
                    stats = val.validate_runtime(
                        schedules=schedules, test_scenario=test_scenario)
                elif scenario.performance_type[0] == "solution_quality":
                    stats = val.validate_quality(
                        schedules=schedules, test_scenario=test_scenario)
                else:
                    raise ValueError("Unknown performance_type[0]")
                cv_stat.merge(stat=stats)

            self.logger.info(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
            self.logger.info("CV Stats")
            par10 = cv_stat.show()
        except ValueError:
            traceback.print_exc()
            # pre-compensate the sign flip below so that the returned
            # penalty is always +10 * cutoff
            if not maximize:
                par10 = scenario.algorithm_cutoff_time * 10
            else:
                par10 = scenario.algorithm_cutoff_time * -10

        if maximize:
            par10 *= -1

        return par10

    def fit(self, scenario: ASlibScenario, config: Configuration):
        '''
        fit AutoFolio on given ASlib Scenario

        Arguments
        ---------
        scenario: autofolio.data.aslib_scenario.ASlibScenario
            aslib scenario at hand
        config: Configuration
            parameter configuration to use for preprocessing

        Returns
        -------
        list of fitted feature preproccessing objects
        pre-solving object
        fitted selector
        '''
        self.logger.info("Given Configuration: %s" % (config))

        if self.overwrite_args:
            config = self._overwrite_configuration(
                config=config, overwrite_args=self.overwrite_args)
            self.logger.info("Overwritten Configuration: %s" % (config))

        scenario, feature_pre_pipeline = \
            self.fit_transform_feature_preprocessing(scenario, config)

        pre_solver = self.fit_pre_solving(scenario, config)

        selector = self.fit_selector(scenario, config)

        return feature_pre_pipeline, pre_solver, selector

    def _overwrite_configuration(self, config: Configuration,
                                 overwrite_args: list):
        '''
        overwrites a given configuration with some new settings

        Arguments
        ---------
        config: Configuration
            initial configuration to be adapted
        overwrite_args: list
            new parameter settings as a list of strings

        Returns
        -------
        Configuration
        '''

        def pairwise(iterable):
            # sliding window: (s0, s1), (s1, s2), ...
            a, b = tee(iterable)
            next(b, None)
            return zip(a, b)

        dict_conf = config.get_dictionary()
        # NOTE(review): the window slides by one, so (value, next_param)
        # pairs are also visited and end up in the warning branch --
        # confirm the expected layout of overwrite_args.
        for param, value in pairwise(overwrite_args):
            # membership test instead of truthiness: parameters currently
            # set to False or 0 must remain overwritable
            if param in dict_conf:
                hyperparameter = self.cs.get_hyperparameter(param)
                if isinstance(hyperparameter, UniformIntegerHyperparameter):
                    dict_conf[param] = int(value)
                elif isinstance(hyperparameter, UniformFloatHyperparameter):
                    dict_conf[param] = float(value)
                elif value == "True":
                    dict_conf[param] = True
                elif value == "False":
                    dict_conf[param] = False
                else:
                    dict_conf[param] = value
            else:
                # Logger.warn is deprecated (removed in Python 3.13)
                self.logger.warning(
                    "Unknown given parameter: %s %s" % (param, value))
        config = Configuration(self.cs, values=dict_conf)

        return config

    def fit_transform_feature_preprocessing(self, scenario: ASlibScenario,
                                            config: Configuration):
        '''
        performs feature preprocessing on a given ASlib scenario wrt to a
        given configuration

        Arguments
        ---------
        scenario: autofolio.data.aslib_scenario.ASlibScenario
            aslib scenario at hand
        config: Configuration
            parameter configuration to use for preprocessing

        Returns
        -------
        transformed scenario
        list of fitted feature preproccessing objects
        '''
        # order matters: each step works on the output of the previous one
        fgf = FeatureGroupFiltering()
        scenario = fgf.fit_transform(scenario, config)

        imputer = ImputerWrapper()
        scenario = imputer.fit_transform(scenario, config)

        scaler = StandardScalerWrapper()
        scenario = scaler.fit_transform(scenario, config)

        pca = PCAWrapper()
        scenario = pca.fit_transform(scenario, config)

        return scenario, [fgf, imputer, scaler, pca]

    def fit_pre_solving(self, scenario: ASlibScenario, config: Configuration):
        '''
        fits an pre-solving schedule using Aspeed [Hoos et al, 2015 TPLP)

        Arguments
        ---------
        scenario: autofolio.data.aslib_scenario.ASlibScenario
            aslib scenario at hand
        config: Configuration
            parameter configuration to use for preprocessing

        Returns
        -------
        instance of Aspeed() with a fitted pre-solving schedule if
        performance_type of scenario is runtime; else None
        '''
        if scenario.performance_type[0] != "runtime":
            return None

        aspeed = Aspeed()
        aspeed.fit(scenario=scenario, config=config)
        return aspeed

    def fit_selector(self, scenario: ASlibScenario, config: Configuration):
        '''
        fits an algorithm selector for a given scenario wrt a given
        configuration

        Arguments
        ---------
        scenario: autofolio.data.aslib_scenario.ASlibScenario
            aslib scenario at hand
        config: Configuration
            parameter configuration

        Returns
        -------
        fitted selector object

        Raises
        ------
        NotImplementedError
            if config selects a selector other than "PairwiseClassifier"
        '''
        selector_name = config.get("selector")
        if selector_name != "PairwiseClassifier":
            # Previously this ended in an UnboundLocalError. Deliberately
            # not a ValueError: run_cv() turns ValueErrors into a penalty
            # score, which would hide a broken configuration.
            raise NotImplementedError(
                "Unknown selector: %s" % (selector_name))

        # an unknown classifier name is passed on as None (as before)
        clf_class = None
        if config.get("classifier") == "RandomForest":
            clf_class = RandomForest

        selector = PairwiseClassifier(classifier_class=clf_class)
        selector.fit(scenario=scenario, config=config)

        return selector

    def predict(self, scenario: ASlibScenario, config: Configuration,
                feature_pre_pipeline: list, pre_solver: Aspeed, selector):
        '''
        predicts algorithm schedules wrt a given config and given pipelines

        Arguments
        ---------
        scenario: autofolio.data.aslib_scenario.ASlibScenario
            aslib scenario at hand
        config: Configuration
            parameter configuration
        feature_pre_pipeline: list
            list of fitted feature preprocessors
        pre_solver: Aspeed
            pre solver object with a saved static schedule
        selector: autofolio.selector.*
            fitted selector object

        Returns
        -------
        dict: instance -> schedule; the pre-solving schedule (if any) is
        prepended to the schedule predicted by the selector
        '''
        self.logger.info("Predict on Test")
        for f_pre in feature_pre_pipeline:
            scenario = f_pre.transform(scenario)

        if pre_solver:
            pre_solving_schedule = pre_solver.predict(scenario=scenario)
        else:
            pre_solving_schedule = {}

        pred_schedules = selector.predict(scenario=scenario)

        if not pre_solving_schedule:
            return pred_schedules

        # combine schedules
        return {inst: pre_solving_schedule.get(inst, []) + schedule
                for inst, schedule in pred_schedules.items()}
class AutoFolio(object):
    '''
    Command-line driven algorithm-selection pipeline.

    The class builds a configuration space for a scenario, optionally tunes
    it with SMAC (plain or inside an outer cross validation), fits feature
    preprocessing / pre-solving / a selector and predicts per-instance
    algorithm schedules.
    '''

    def __init__(self, random_seed: int = 12345):
        '''
        Constructor

        Arguments
        ---------
        random_seed: int
            random seed for numpy and random packages
        '''
        np.random.seed(random_seed)  # fix seed
        random.seed(random_seed)

        # I don't know the reason, but without an initial print with
        # logging.info we don't get any output
        logging.info("Init AutoFolio")
        self._root_logger = logging.getLogger()
        self.logger = logging.getLogger("AutoFolio")
        # configuration space; set by get_cs()
        self.cs = None

        # extra command line arguments used to overwrite configurations
        self.overwrite_args = None

    def run_cli(self):
        '''
        main method of AutoFolio based on command line interface

        Returns
        -------
        0 after an outer cross validation run; None otherwise

        Raises
        ------
        ValueError
            if neither an ASlib scenario nor a pair of performance/feature
            csv files was given
        '''
        cmd_parser = CMDParser()
        args_, self.overwrite_args = cmd_parser.parse()

        self._root_logger.setLevel(args_.verbose)

        if args_.load:
            pred = self.read_model_and_predict(
                model_fn=args_.load,
                feature_vec=list(map(float, args_.feature_vec.split(" "))))
            print("Selected Schedule [(algorithm, budget)]: %s" % (pred))
            return None

        scenario = ASlibScenario()
        if args_.scenario:
            scenario.read_scenario(args_.scenario)
        elif args_.performance_csv and args_.feature_csv:
            scenario.read_from_csv(perf_fn=args_.performance_csv,
                                   feat_fn=args_.feature_csv,
                                   objective=args_.objective,
                                   runtime_cutoff=args_.runtime_cutoff,
                                   maximize=args_.maximize,
                                   cv_fn=args_.cv_csv)
        else:
            raise ValueError("Missing inputs to read scenario data.")

        test_scenario = None
        if args_.performance_test_csv and args_.feature_test_csv:
            test_scenario = ASlibScenario()
            test_scenario.read_from_csv(perf_fn=args_.performance_test_csv,
                                        feat_fn=args_.feature_test_csv,
                                        objective=args_.objective,
                                        runtime_cutoff=args_.runtime_cutoff,
                                        maximize=args_.maximize,
                                        cv_fn=None)

        autofolio_config = {}
        if args_.config is not None:
            self.logger.info("Reading yaml config file")
            # safe_load: yaml.load without a Loader is rejected by recent
            # PyYAML versions and can construct arbitrary objects; the
            # context manager closes the file handle again.
            with open(args_.config) as fp:
                # an empty yaml file is parsed to None
                autofolio_config = yaml.safe_load(fp) or {}
        # command line values are only defaults for the yaml config
        if not autofolio_config.get("wallclock_limit"):
            autofolio_config["wallclock_limit"] = args_.wallclock_limit
        if not autofolio_config.get("runcount_limit"):
            autofolio_config["runcount_limit"] = args_.runcount_limit
        if not autofolio_config.get("output-dir"):
            autofolio_config["output-dir"] = args_.output_dir

        self.cs = self.get_cs(scenario, autofolio_config)

        if args_.outer_cv:
            self._outer_cv(scenario, autofolio_config, args_.outer_cv_fold,
                           args_.out_template, smac_seed=args_.smac_seed)
            return 0

        if args_.tune:
            config = self.get_tuned_config(
                scenario,
                wallclock_limit=args_.wallclock_limit,
                runcount_limit=args_.runcount_limit,
                autofolio_config=autofolio_config,
                seed=args_.smac_seed)
        else:
            config = self.cs.get_default_configuration()
        self.logger.debug(config)

        if args_.save:
            feature_pre_pipeline, pre_solver, selector = self.fit(
                scenario=scenario, config=config)
            self._save_model(
                args_.save, scenario, feature_pre_pipeline, pre_solver,
                selector, config)
        else:
            self.run_cv(config=config, scenario=scenario,
                        folds=int(scenario.cv_data.max().max()))

        if test_scenario is not None:
            # result is not used here
            self.run_fold(config=config,
                          fold=0,
                          return_fit=False,
                          scenario=scenario,
                          test_scenario=test_scenario)
        return None

    def _outer_cv(self, scenario: ASlibScenario, autofolio_config: dict = None,
                  outer_cv_fold: int = None, out_template: str = None,
                  smac_seed: int = 42):
        '''
        Evaluate on a scenario using an "outer" cross-fold validation
        scheme. In particular, this ensures that SMAC does not use the test
        set during hyperparameter optimization.

        Arguments
        ---------
        scenario: ASlibScenario
            ASlib Scenario at hand
        autofolio_config: dict, or None
            An optional dictionary of configuration options
        outer_cv_fold: int, or None
            If given, then only the single outer-cv fold is processed
        out_template: str, or None
            If given, the learned configurations are written to the
            specified locations. The string is a string.Template, i.e.
            "$fold" (or "${fold}") is replaced with the fold and "$type"
            with "pkl" (model) or "csv" (solver choices).
        smac_seed: int
            random seed for SMAC

        Returns
        -------
        stats: validate.Stats
            Performance over all outer-cv folds
        '''
        import string

        outer_stats = None

        # For each outer split
        outer_cv_folds = range(1, 11)
        if outer_cv_fold is not None:
            outer_cv_folds = range(outer_cv_fold, outer_cv_fold + 1)

        for cv_fold in outer_cv_folds:
            # Use 'ASlibScenario.get_split()' to get the outer split
            outer_testing, outer_training = scenario.get_split(cv_fold)

            msg = ">>>>> Outer CV fold: {} <<<<<".format(cv_fold)
            self.logger.info(msg)

            # Use ASlibScenario.create_cv_splits() to get an inner-cv
            outer_training.create_cv_splits(n_folds=10)

            # Use 'AutoFolio.get_tuned_config()' to tune on inner-cv
            config = self.get_tuned_config(
                outer_training,
                autofolio_config=autofolio_config,
                seed=smac_seed
            )

            # Use 'AutoFolio.run_fold()' to get the performance on the
            # outer split
            stats, fit, schedule = self.run_fold(
                config,
                scenario,
                cv_fold,
                return_fit=True
            )

            feature_pre_pipeline, pre_solver, selector = fit

            if outer_stats is None:
                outer_stats = stats
            else:
                outer_stats.merge(stats)

            # save the model, if given an output location
            if out_template is not None:
                out_template_ = string.Template(out_template)
                model_fn = out_template_.substitute(fold=cv_fold, type="pkl")

                msg = "Writing model to: {}".format(model_fn)
                self.logger.info(msg)

                self._save_model(
                    model_fn,
                    scenario,
                    feature_pre_pipeline,
                    pre_solver,
                    selector,
                    config
                )

                # convert the schedule to a data frame
                schedule_df = pd.Series(schedule, name="solver")
                schedule_df.index.name = "instance"
                schedule_df = schedule_df.reset_index()

                # just keep the solver name; we don't care about the time
                # x[0] gets the first pair in the schedule list
                # and x[0][0] gets the name of the solver from that pair
                schedule_df['solver'] = schedule_df['solver'].apply(
                    lambda x: x[0][0])

                selections_fn = out_template_.substitute(
                    fold=cv_fold, type="csv")

                msg = "Writing solver choices to: {}".format(selections_fn)
                self.logger.info(msg)

                schedule_df.to_csv(selections_fn, index=False)

        self.logger.info(">>>>> Final Stats <<<<<")
        outer_stats.show()

        # the docstring always promised the stats; actually return them
        return outer_stats

    def _save_model(self, out_fn: str, scenario: ASlibScenario,
                    feature_pre_pipeline: list, pre_solver: Aspeed,
                    selector, config: Configuration):
        '''
        save all pipeline objects for predictions

        Arguments
        ---------
        out_fn: str
            filename of output file
        scenario: AslibScenario
            ASlib scenario with all the data
        feature_pre_pipeline: list
            list of preprocessing objects
        pre_solver: Aspeed
            aspeed object with pre-solving schedule
        selector: autofolio.selector.*
            fitted selector object
        config: Configuration
            parameter setting configuration
        '''
        # loggers are detached before pickling; read_model_and_predict()
        # attaches fresh ones after loading
        scenario.logger = None
        for fpp in feature_pre_pipeline:
            fpp.logger = None
        if pre_solver:
            pre_solver.logger = None
        selector.logger = None
        model = [scenario, feature_pre_pipeline, pre_solver, selector, config]
        with open(out_fn, "bw") as fp:
            pickle.dump(model, fp)

    def read_model_and_predict(self, model_fn: str, feature_vec: list):
        '''
        reads saved model from disk and predicts the selected algorithm
        schedule for a given feature vector

        Arguments
        --------
        model_fn: str
            file name of saved model
        feature_vec: list
            instance feature vector as a list of floats

        Returns
        -------
        list of tuple
            Selected schedule [(algorithm, budget)]
        '''
        # NOTE: pickle.load executes arbitrary code from the file;
        # only load model files from trusted sources.
        with open(model_fn, "br") as fp:
            (scenario, feature_pre_pipeline, pre_solver,
             selector, config) = pickle.load(fp)

        for fpp in feature_pre_pipeline:
            fpp.logger = logging.getLogger("Feature Preprocessing")
        if pre_solver:
            pre_solver.logger = logging.getLogger("Aspeed PreSolving")
        selector.logger = logging.getLogger("Selector")

        # saved scenario is adapted to given feature vector
        feature_vec = np.array([feature_vec])
        scenario.feature_data = pd.DataFrame(
            feature_vec, index=["pseudo_instance"],
            columns=scenario.features)
        scenario.instances = ["pseudo_instance"]
        pred = self.predict(scenario=scenario, config=config,
                            feature_pre_pipeline=feature_pre_pipeline,
                            pre_solver=pre_solver, selector=selector)

        return pred["pseudo_instance"]

    def get_cs(self, scenario: ASlibScenario, autofolio_config: dict = None):
        '''
        returns the parameter configuration space of AutoFolio
        (based on the automl config space:
        https://github.com/automl/ConfigSpace)

        The space is also stored in self.cs.

        Arguments
        ---------
        scenario: aslib_scenario.aslib_scenario.ASlibScenario
            aslib scenario at hand
        autofolio_config: dict, or None
            An optional dictionary of configuration options

        Raises
        ------
        ValueError
            if no feature group is allowed
        '''
        # the documented default None used to crash on .get()
        if autofolio_config is None:
            autofolio_config = {}

        self.cs = ConfigurationSpace()

        # only allow the feature groups specified in the config file
        # by default, though, all of the feature groups are allowed.
        allowed_feature_groups = autofolio_config.get(
            "allowed_feature_groups", scenario.feature_steps)

        if len(allowed_feature_groups) == 0:
            msg = "Please ensure at least one feature group is allowed"
            raise ValueError(msg)

        if len(allowed_feature_groups) == 1:
            # if we only have one feature group, it has to be active
            choices = [True]
        else:
            choices = [True, False]
        default = True

        for fs in allowed_feature_groups:
            fs_param = CategoricalHyperparameter(
                name="fgroup_%s" % (fs), choices=choices,
                default_value=default)
            self.cs.add_hyperparameter(fs_param)

        # preprocessing
        if autofolio_config.get("pca", True):
            PCAWrapper.add_params(self.cs)

        if autofolio_config.get("impute", True):
            ImputerWrapper.add_params(self.cs)

        if autofolio_config.get("scale", True):
            StandardScalerWrapper.add_params(self.cs)

        # Pre-Solving
        if scenario.performance_type[0] == "runtime":
            if autofolio_config.get("presolve", True):
                Aspeed.add_params(
                    cs=self.cs, cutoff=scenario.algorithm_cutoff_time)

        # classifiers
        if autofolio_config.get("classifier"):
            # fix parameter
            cls_choices = [autofolio_config["classifier"]]
            cls_def = autofolio_config["classifier"]
        else:
            cls_choices = ["RandomForest", "XGBoost"]
            cls_def = "RandomForest"
        classifier = CategoricalHyperparameter(
            "classifier", choices=cls_choices, default_value=cls_def)
        self.cs.add_hyperparameter(classifier)

        RandomForest.add_params(self.cs)
        XGBoost.add_params(self.cs)

        # regressors
        if autofolio_config.get("regressor"):
            # fix parameter
            reg_choices = [autofolio_config["regressor"]]
            reg_def = autofolio_config["regressor"]
        else:
            reg_choices = ["RandomForestRegressor"]
            reg_def = "RandomForestRegressor"
        regressor = CategoricalHyperparameter(
            "regressor", choices=reg_choices, default_value=reg_def)
        self.cs.add_hyperparameter(regressor)

        RandomForestRegressor.add_params(self.cs)

        # selectors
        if autofolio_config.get("selector"):
            # fix parameter
            sel_choices = [autofolio_config["selector"]]
            sel_def = autofolio_config["selector"]
        else:
            sel_choices = ["PairwiseClassifier", "PairwiseRegressor"]
            sel_def = "PairwiseClassifier"
        selector = CategoricalHyperparameter(
            "selector", choices=sel_choices, default_value=sel_def)
        self.cs.add_hyperparameter(selector)

        PairwiseClassifier.add_params(self.cs)
        PairwiseRegression.add_params(self.cs)

        self.logger.debug(self.cs)

        return self.cs

    def get_tuned_config(self, scenario: ASlibScenario,
                         runcount_limit: int = 42,
                         wallclock_limit: int = 300,
                         autofolio_config: dict = None,
                         seed: int = 42):
        '''
        uses SMAC3 to determine a well-performing configuration in the
        configuration space self.cs on the given scenario

        Arguments
        ---------
        scenario: ASlibScenario
            ASlib Scenario at hand
        runcount_limit: int
            runcount_limit for SMAC scenario
            (overwritten by autofolio_config)
        wallclock_limit: int
            wallclock limit in sec for SMAC scenario
            (overwritten by autofolio_config)
        autofolio_config: dict, or None
            An optional dictionary of configuration options
        seed: int
            random seed for SMAC

        Returns
        -------
        Configuration
            best incumbent configuration found by SMAC
        '''
        # None is the documented "no options" value (and what _outer_cv
        # passes by default); also avoids a shared mutable default dict
        if autofolio_config is None:
            autofolio_config = {}

        wallclock_limit = autofolio_config.get(
            "wallclock_limit", wallclock_limit)
        runcount_limit = autofolio_config.get(
            "runcount_limit", runcount_limit)

        taf = functools.partial(self.called_by_smac, scenario=scenario)
        # every cv fold is presented to SMAC as one "instance"
        max_fold = int(scenario.cv_data.max().max())

        ac_scenario = Scenario({
            "run_obj": "quality",  # we optimize quality
            "runcount-limit": runcount_limit,
            "cs": self.cs,  # configuration space
            "deterministic": "true",
            "instances": [[str(i)] for i in range(1, max_fold + 1)],
            "wallclock-limit": wallclock_limit,
            "output-dir": autofolio_config.get("output-dir") or "",
        })

        # necessary to use stats options related to scenario information
        AC_Stats.scenario = ac_scenario

        # Optimize
        banner = ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
        self.logger.info(banner)
        self.logger.info("Start Configuration")
        self.logger.info(banner)
        smac = SMAC(scenario=ac_scenario, tae_runner=taf,
                    rng=np.random.RandomState(seed))
        incumbent = smac.optimize()

        self.logger.info("Final Incumbent: %s" % (incumbent))

        return incumbent

    def called_by_smac(self, config: Configuration, scenario: ASlibScenario,
                       instance: str = None, seed: int = 1):
        '''
        target function for SMAC: evaluates a configuration on a single
        cv fold, or on the full cross validation if no fold is given

        Arguments
        ---------
        config: Configuration
            parameter configuration to use for preprocessing
        scenario: aslib_scenario.aslib_scenario.ASlibScenario
            aslib scenario at hand
        instance: str
            cv-fold index
        seed: int
            random seed (not used)

        Returns
        -------
        float: average performance
        '''
        if instance is None:
            # run_cv() already flips the sign for maximized scenarios
            return self.run_cv(config=config, scenario=scenario)

        try:
            stats = self.run_fold(config=config, scenario=scenario,
                                  fold=int(instance))
            perf = stats.show()
        except ValueError:
            if scenario.performance_type[0] == "runtime":
                perf = scenario.algorithm_cutoff_time * 20
            else:
                # try to impute a worst case perf
                perf = scenario.performance_data.max().max()

        # NOTE(review): applied to the single-fold path only so that the
        # sign is flipped exactly once -- confirm against upstream.
        if scenario.maximize[0]:
            perf *= -1

        return perf

    def run_cv(self, config: Configuration, scenario: ASlibScenario,
               folds: int = 10):
        '''
        run a cross fold validation based on the given data from cv.arff

        Arguments
        ---------
        scenario: aslib_scenario.aslib_scenario.ASlibScenario
            aslib scenario at hand
        config: Configuration
            parameter configuration to use for preprocessing
        folds: int
            number of cv-splits

        Returns
        -------
        value returned by Stats.show() over all folds (on a ValueError:
        10 times the algorithm cutoff time), sign-flipped if the scenario
        is maximized
        '''
        # TODO: use seed and instance in an appropriate way
        try:
            if scenario.performance_type[0] == "runtime":
                cv_stat = Stats(runtime_cutoff=scenario.algorithm_cutoff_time)
            else:
                cv_stat = Stats(runtime_cutoff=0)

            for i in range(1, folds + 1):
                self.logger.info("CV-Iteration: %d" % (i))
                stats = self.run_fold(config=config,
                                      scenario=scenario,
                                      fold=i)
                cv_stat.merge(stat=stats)

            self.logger.info(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
            self.logger.info("CV Stats")
            par10 = cv_stat.show()
        except ValueError:
            traceback.print_exc()
            par10 = scenario.algorithm_cutoff_time * 10

        if scenario.maximize[0]:
            par10 *= -1

        return par10

    def run_fold(self, config: Configuration, scenario: ASlibScenario,
                 fold: int, test_scenario=None, return_fit: bool = False):
        '''
        run a given fold of cross validation

        Arguments
        ---------
        scenario: aslib_scenario.aslib_scenario.ASlibScenario
            aslib scenario at hand
        config: Configuration
            parameter configuration to use for preprocessing
        fold: int
            fold id
        test_scenario: aslib_scenario.aslib_scenario.ASlibScenario
            aslib scenario with test data for validation
            generated from <scenario> if None
        return_fit: bool
            optionally, the learned preprocessing options, presolver and
            selector can be returned

        Returns
        -------
        Stats()
        (pre_pipeline, pre_solver, selector):
            only present if return_fit is True
            the pipeline components fit with the configuration options
        schedule: dict of string -> list of (solver, cutoff) pairs
            only present if return_fit is True
            the solver choices for each instance

        Raises
        ------
        ValueError
            if scenario.performance_type[0] is neither "runtime" nor
            "solution_quality"
        '''
        if test_scenario is None:
            self.logger.info("CV-Iteration: %d" % (fold))
            test_scenario, training_scenario = scenario.get_split(indx=fold)
        else:
            self.logger.info("Validation on test data")
            training_scenario = scenario

        feature_pre_pipeline, pre_solver, selector = self.fit(
            scenario=training_scenario, config=config)

        schedules = self.predict(
            test_scenario, config, feature_pre_pipeline, pre_solver, selector)

        val = Validator()
        if scenario.performance_type[0] == "runtime":
            stats = val.validate_runtime(
                schedules=schedules, test_scenario=test_scenario,
                train_scenario=training_scenario)
        elif scenario.performance_type[0] == "solution_quality":
            stats = val.validate_quality(
                schedules=schedules, test_scenario=test_scenario,
                train_scenario=training_scenario)
        else:
            raise ValueError("Unknown: %s" % (scenario.performance_type[0]))

        if return_fit:
            return stats, (feature_pre_pipeline, pre_solver, selector), schedules
        return stats

    def fit(self, scenario: ASlibScenario, config: Configuration):
        '''
        fit AutoFolio on given ASlib Scenario

        Arguments
        ---------
        scenario: aslib_scenario.aslib_scenario.ASlibScenario
            aslib scenario at hand
        config: Configuration
            parameter configuration to use for preprocessing

        Returns
        -------
        list of fitted feature preproccessing objects
        pre-solving object
        fitted selector
        '''
        self.logger.info("Given Configuration: %s" % (config))

        if self.overwrite_args:
            config = self._overwrite_configuration(
                config=config, overwrite_args=self.overwrite_args)
            self.logger.info("Overwritten Configuration: %s" % (config))

        scenario, feature_pre_pipeline = \
            self.fit_transform_feature_preprocessing(scenario, config)

        pre_solver = self.fit_pre_solving(scenario, config)

        selector = self.fit_selector(scenario, config)

        return feature_pre_pipeline, pre_solver, selector

    def _overwrite_configuration(self, config: Configuration,
                                 overwrite_args: list):
        '''
        overwrites a given configuration with some new settings

        Arguments
        ---------
        config: Configuration
            initial configuration to be adapted
        overwrite_args: list
            new parameter settings as a list of strings

        Returns
        -------
        Configuration
        '''

        def pairwise(iterable):
            # sliding window: (s0, s1), (s1, s2), ...
            a, b = tee(iterable)
            next(b, None)
            return zip(a, b)

        dict_conf = config.get_dictionary()
        # NOTE(review): the window slides by one, so (value, next_param)
        # pairs are also visited and end up in the warning branch --
        # confirm the expected layout of overwrite_args.
        for param, value in pairwise(overwrite_args):
            try:
                hyperparameter = self.cs.get_hyperparameter(param)
            except KeyError:
                hyperparameter = None

            if hyperparameter is None:
                # Logger.warn is deprecated (removed in Python 3.13)
                self.logger.warning(
                    "Unknown given parameter: %s %s" % (param, value))
            elif isinstance(hyperparameter, UniformIntegerHyperparameter):
                dict_conf[param] = int(value)
            elif isinstance(hyperparameter, UniformFloatHyperparameter):
                dict_conf[param] = float(value)
            elif value == "True":
                dict_conf[param] = True
            elif value == "False":
                dict_conf[param] = False
            else:
                dict_conf[param] = value

        # overwritten values may belong to currently inactive parameters
        config = Configuration(self.cs, values=dict_conf,
                               allow_inactive_with_values=True)

        return config

    def fit_transform_feature_preprocessing(self, scenario: ASlibScenario,
                                            config: Configuration):
        '''
        performs feature preprocessing on a given ASlib scenario wrt to a
        given configuration

        Arguments
        ---------
        scenario: aslib_scenario.aslib_scenario.ASlibScenario
            aslib scenario at hand
        config: Configuration
            parameter configuration to use for preprocessing

        Returns
        -------
        transformed scenario
        list of fitted feature preproccessing objects
        '''
        # order matters: each step works on the output of the previous one
        fgf = FeatureGroupFiltering()
        scenario = fgf.fit_transform(scenario, config)

        imputer = ImputerWrapper()
        scenario = imputer.fit_transform(scenario, config)

        scaler = StandardScalerWrapper()
        scenario = scaler.fit_transform(scenario, config)

        pca = PCAWrapper()
        scenario = pca.fit_transform(scenario, config)

        return scenario, [fgf, imputer, scaler, pca]

    def fit_pre_solving(self, scenario: ASlibScenario, config: Configuration):
        '''
        fits an pre-solving schedule using Aspeed [Hoos et al, 2015 TPLP)

        Arguments
        ---------
        scenario: aslib_scenario.aslib_scenario.ASlibScenario
            aslib scenario at hand
        config: Configuration
            parameter configuration to use for preprocessing

        Returns
        -------
        instance of Aspeed() with a fitted pre-solving schedule if
        performance_type of scenario is runtime; else None
        '''
        if scenario.performance_type[0] != "runtime":
            return None

        aspeed = Aspeed()
        aspeed.fit(scenario=scenario, config=config)
        return aspeed

    def fit_selector(self, scenario: ASlibScenario, config: Configuration):
        '''
        fits an algorithm selector for a given scenario wrt a given
        configuration

        Supported selectors: "PairwiseClassifier", "MultiClassifier"
        (using the configured "classifier") and "IndRegressor",
        "JointRegressor", "PairwiseRegressor" (using the configured
        "regressor").

        Arguments
        ---------
        scenario: aslib_scenario.aslib_scenario.ASlibScenario
            aslib scenario at hand
        config: Configuration
            parameter configuration

        Returns
        -------
        fitted selector object

        Raises
        ------
        NotImplementedError
            if config selects an unknown selector
        '''
        selector_name = config.get("selector")

        # classes are resolved lazily so that only the selected selector
        # and model have to be available
        if selector_name in ("PairwiseClassifier", "MultiClassifier"):
            classifier_name = config.get("classifier")
            # an unknown classifier name is passed on as None (as before)
            clf_class = None
            if classifier_name == "RandomForest":
                clf_class = RandomForest
            elif classifier_name == "XGBoost":
                clf_class = XGBoost

            if selector_name == "PairwiseClassifier":
                selector = PairwiseClassifier(classifier_class=clf_class)
            else:
                selector = MultiClassifier(classifier_class=clf_class)
        elif selector_name in ("IndRegressor", "JointRegressor",
                               "PairwiseRegressor"):
            # an unknown regressor name is passed on as None (as before)
            reg_class = None
            if config.get("regressor") == "RandomForestRegressor":
                reg_class = RandomForestRegressor

            if selector_name == "IndRegressor":
                selector = IndRegression(regressor_class=reg_class)
            elif selector_name == "JointRegressor":
                selector = JointRegression(regressor_class=reg_class)
            else:
                selector = PairwiseRegression(regressor_class=reg_class)
        else:
            # Previously this ended in an UnboundLocalError. Deliberately
            # not a ValueError: run_cv()/called_by_smac() turn ValueErrors
            # into a penalty score, which would hide a broken config.
            raise NotImplementedError(
                "Unknown selector: %s" % (selector_name))

        selector.fit(scenario=scenario, config=config)

        return selector

    def predict(self, scenario: ASlibScenario, config: Configuration,
                feature_pre_pipeline: list, pre_solver: Aspeed, selector):
        '''
        predicts algorithm schedules wrt a given config and given pipelines

        Arguments
        ---------
        scenario: aslib_scenario.aslib_scenario.ASlibScenario
            aslib scenario at hand
        config: Configuration
            parameter configuration
        feature_pre_pipeline: list
            list of fitted feature preprocessors
        pre_solver: Aspeed
            pre solver object with a saved static schedule
        selector: autofolio.selector.*
            fitted selector object

        Returns
        -------
        dict: instance -> schedule; the pre-solving schedule (if any) is
        prepended to the schedule predicted by the selector
        '''
        self.logger.info("Predict on Test")
        for f_pre in feature_pre_pipeline:
            scenario = f_pre.transform(scenario)

        if pre_solver:
            pre_solving_schedule = pre_solver.predict(scenario=scenario)
        else:
            pre_solving_schedule = {}

        pred_schedules = selector.predict(scenario=scenario)

        if not pre_solving_schedule:
            return pred_schedules

        # combine schedules
        return {inst: pre_solving_schedule.get(inst, []) + schedule
                for inst, schedule in pred_schedules.items()}
from copy import copy, deepcopy
from pickle import dumps, loads

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter

# Print the probabilities of a weighted categorical hyperparameter next to
# the weights it was created with, after the owning configuration space has
# been deep-copied, shallow-copied or pickled -- first for the plain space,
# then for a space that embeds it under the prefix "A".
weights = [0.25, 0.5, 0.25]
hp = CategoricalHyperparameter("B", ["1", "2", "3"], weights=weights)

sub_cs = ConfigurationSpace()
sub_cs.add_hyperparameter(hp)

cs = ConfigurationSpace()
cs.add_configuration_space("A", sub_cs)


def _pickle_roundtrip(space):
    """Return ``space`` after a pickle dump/load cycle."""
    return loads(dumps(space))


_CLONERS = (deepcopy, copy, _pickle_roundtrip)

for _clone in _CLONERS:
    print(_clone(sub_cs).get_hyperparameter("B").probabilities, weights)

# The nested space is also inspected directly, without any copy.
print(cs.get_hyperparameter("A:B").probabilities, weights)
for _clone in _CLONERS:
    print(_clone(cs).get_hyperparameter("A:B").probabilities, weights)
def generate_csv_data(NUM_EVALUATIONS, NUM_BUDGETS, ALLINONE, SEPARATE):
    """Generate random run data and write it to disk in two CSV layouts.

    ``ALLINONE`` receives one ``runhistory.csv`` whose rows carry the
    parameter values inline. ``SEPARATE`` receives a ``runhistory.csv`` whose
    rows point into ``configurations.csv`` through a ``config_id`` column.
    Both directories additionally get ``configspace.json``,
    ``trajectory.csv`` and ``scenario.txt``.

    Args:
        NUM_EVALUATIONS: Number of runs to generate.
        NUM_BUDGETS: Number of budget levels; if ``<= 1`` every run gets
            budget 0, otherwise budgets grow in steps of 50 starting at 50.
        ALLINONE: Output directory for the single-file layout.
        SEPARATE: Output directory for the split layout.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    for out_dir in (ALLINONE, SEPARATE):
        os.makedirs(out_dir, exist_ok=True)

    param_names = ['random_parameter_1', 'random_parameter_2', 'random_parameter_3']

    config_space = ConfigurationSpace()
    config_space.add_hyperparameters([UniformFloatHyperparameter('random_parameter_1', 0, 1.2),
                                      UniformIntegerHyperparameter('random_parameter_2', -10, 10),
                                      UniformIntegerHyperparameter('random_parameter_3', 1, 1000)])

    trajectory = []
    runhistory = []
    lowest_cost = np.inf
    start_time = time.time()

    if NUM_BUDGETS <= 1:
        budgets = [0] * NUM_EVALUATIONS
    else:
        budgets = [50 + 50 * (i // (NUM_EVALUATIONS / NUM_BUDGETS)) for i in range(NUM_EVALUATIONS)]

    for i, budget in enumerate(budgets):
        if i == 0:
            # The very first run uses the default value of every parameter.
            random1 = config_space.get_hyperparameter('random_parameter_1').default_value
            random2 = config_space.get_hyperparameter('random_parameter_2').default_value
            random3 = config_space.get_hyperparameter('random_parameter_3').default_value
        else:
            random1 = np.random.uniform(0.1, 1.1)
            random2 = np.random.randint(-10, 10)
            random3 = np.random.randint(1, 1000)
        cost = np.random.uniform(np.abs(NUM_EVALUATIONS - i - np.random.randint(50)),
                                 10 * np.log(NUM_EVALUATIONS - i)) * random1
        new_time = time.time() - start_time
        status = 'SUCCESS'
        seed = 42  # should be: np.random.randint(1, 10000000) but seeds are currently not supported with budgets.
        if lowest_cost > cost:
            # Only runs that improve on the best cost so far enter the trajectory.
            lowest_cost = cost
            trajectory.append([new_time, new_time, i, cost, random1, random2, random3])
        runhistory.append([cost, new_time, status, budget, seed, random1, random2, random3])

    # Layout 1: parameter values stored inline in the run history.
    with open(os.path.join(ALLINONE, 'runhistory.csv'), 'w', newline='') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(['cost', 'time', 'status', 'budget', 'seed'] + param_names)
        writer.writerows(runhistory)

    # Layout 2: run history and configurations in separate files, joined by id.
    with open(os.path.join(SEPARATE, 'runhistory.csv'), 'w', newline='') as rh,\
            open(os.path.join(SEPARATE, 'configurations.csv'), 'w', newline='') as configs:
        rh_writer = csv.writer(rh, delimiter=',')
        configs_writer = csv.writer(configs, delimiter=',')
        rh_writer.writerow(['cost', 'time', 'status', 'budget', 'seed', 'config_id'])
        configs_writer.writerow(['CONFIG_ID'] + param_names)
        for idx, run in enumerate(runhistory):
            rh_writer.writerow(run[:5] + [idx])
            configs_writer.writerow([idx] + run[5:])

    # Files that are identical for both layouts.
    for path in [ALLINONE, SEPARATE]:
        with open(os.path.join(path, 'configspace.json'), 'w') as f:
            f.write(pcs_json.write(config_space))

        with open(os.path.join(path, 'trajectory.csv'), 'w', newline='') as f:
            writer = csv.writer(f, delimiter=',')
            writer.writerow(['cpu_time', 'wallclock_time', 'evaluations', 'cost'] + param_names)
            writer.writerows(trajectory)

        with open(os.path.join(path, 'scenario.txt'), 'w') as f:
            f.write('paramfile = {}\nrun_obj = quality'.format(os.path.join(os.path.basename(path.rstrip('/')),
                                                                            'configspace.json')))
def read(pcs_string, debug=False):
    """Build a ConfigurationSpace from the lines of a PCS description.

    Lines are classified after stripping ``#`` comments and quote characters:
    a line containing ``|`` is a condition, a line wrapped in ``{...}`` is a
    forbidden clause, and any other line containing ``}`` or ``]`` is a
    hyperparameter (``integer``/``real``, ``categorical`` or ``ordinal``).
    Hyperparameters are added first, then forbidden clauses, then conditions.

    Args:
        pcs_string: Iterable of text lines.
        debug: Not used by this function; kept for interface compatibility.

    Returns:
        The populated ConfigurationSpace.

    Raises:
        NotImplementedError: If a condition or hyperparameter line cannot be
            parsed, or a forbidden clause uses an operator other than ``=``.
    """
    configuration_space = ConfigurationSpace()
    conditions = []
    forbidden = []

    # Maps the internal type tag to the hyperparameter class to instantiate.
    create = {"int": UniformIntegerHyperparameter,
              "float": UniformFloatHyperparameter,
              "categorical": CategoricalHyperparameter,
              "ordinal": OrdinalHyperparameter}

    for line in pcs_string:
        if "#" in line:
            # It contains a comment
            line = line[:line.find("#")]

        # Remove quotes and whitespaces at beginning and end
        line = line.replace('"', "").replace("'", "").strip()

        if "|" in line:
            # It's a condition
            try:
                conditions.append(pp_condition.parseString(line))
            except pyparsing.ParseException:
                raise NotImplementedError("Could not parse condition: %s" % line)
            continue
        if "}" not in line and "]" not in line:
            continue
        if line.startswith("{") and line.endswith("}"):
            forbidden.append(line)
            continue
        if not line:
            continue

        param = None

        # First attempt: numerical (integer / real) hyperparameter.
        try:
            param_list = pp_cont_param.parseString(line)
            name = param_list[0]
            if param_list[1] == 'integer':
                paramtype = 'int'
            elif param_list[1] == 'real':
                paramtype = 'float'
            else:
                paramtype = None

            if paramtype in ['int', 'float']:
                # Anything past the tenth token is the optional "log" marker.
                log = param_list[10:]
                param_list = param_list[:10]
                if len(log) > 0:
                    log = log[0]
                lower = float(param_list[3])
                upper = float(param_list[5])
                log_on = "log" in log
                default = float(param_list[8])
                param = create[paramtype](name=name, lower=lower, upper=upper,
                                          q=None, log=log_on, default=default)
        except pyparsing.ParseException:
            pass

        # Second attempt: categorical / ordinal hyperparameter.
        try:
            if "categorical" in line:
                param_list = pp_cat_param.parseString(line)
                name = param_list[0]
                choices = list(param_list[3:-4:2])
                default = param_list[-2]
                param = create["categorical"](name=name, choices=choices,
                                              default=default)
            elif "ordinal" in line:
                param_list = pp_ord_param.parseString(line)
                name = param_list[0]
                sequence = list(param_list[3:-4:2])
                default = param_list[-2]
                param = create["ordinal"](name=name, sequence=sequence,
                                          default=default)
        except pyparsing.ParseException:
            pass

        if param is None:
            raise NotImplementedError("Could not parse: %s" % line)

        configuration_space.add_hyperparameter(param)

    for clause in forbidden:
        param_list = pp_forbidden_clause.parseString(clause)
        tmp_list = []
        clause_list = []
        for value in param_list[1:]:
            if len(tmp_list) < 3:
                tmp_list.append(value)
            else:
                # A full (name, operator, value) triple has been collected;
                # the current token only separates it from the next one.
                # So far, only equals is supported by SMAC
                if tmp_list[1] == '=':
                    # TODO maybe add a check if the hyperparameter is
                    # actually in the configuration space
                    clause_list.append(ForbiddenEqualsClause(
                        configuration_space.get_hyperparameter(tmp_list[0]),
                        tmp_list[2]))
                else:
                    raise NotImplementedError()
                tmp_list = []
        configuration_space.add_forbidden_clause(ForbiddenAndConjunction(
            *clause_list))

    # Group the parsed conditions by the child they apply to.
    conditions_per_child = OrderedDict()
    for condition in conditions:
        child_name = condition[0]
        if child_name not in conditions_per_child:
            conditions_per_child[child_name] = list()
        conditions_per_child[child_name].append(condition)

    for child_name in conditions_per_child:
        for condition in conditions_per_child[child_name]:
            # Drop the child name and the separator token, keep the expression.
            expression = ' '.join(condition[2:])
            has_or = '||' in expression
            has_and = '&&' in expression

            if has_or and has_and:
                # 1st case: a mixture of || and &&
                ors_combis = []
                for or_part in expression.split('||'):
                    and_parts = or_part.split('&&')
                    if len(and_parts) == 1:
                        # A single operand: it is a plain OR member.
                        ors_combis.append(condition_specification(
                            child_name, and_parts[0].split(),
                            configuration_space))
                    else:
                        # Each AND operand is specified from its own tokens
                        # only. The previous implementation repeated the
                        # tokens once per operand, handing
                        # condition_specification a duplicated token list.
                        ands = [condition_specification(child_name,
                                                        and_part.split(),
                                                        configuration_space)
                                for and_part in and_parts]
                        ors_combis.append(AndConjunction(*ands))
                configuration_space.add_condition(OrConjunction(*ors_combis))
            elif has_or:
                # 2nd case: we only have ors
                ors = [condition_specification(child_name, or_part.split(),
                                               configuration_space)
                       for or_part in expression.split('||')]
                configuration_space.add_condition(OrConjunction(*ors))
            elif has_and:
                # 3rd case: we only have ands
                ands = [condition_specification(child_name, and_part.split(),
                                                configuration_space)
                        for and_part in expression.split('&&')]
                configuration_space.add_condition(AndConjunction(*ands))
            else:
                # 4th case: we have a normal condition
                configuration_space.add_condition(condition_specification(
                    child_name, expression.split(), configuration_space))

    return configuration_space
def read(pcs_string, debug=False):
    """Turn the lines of a PCS description into a ConfigurationSpace.

    After comments (``#``) and quote characters are stripped, a line that
    contains ``|`` is parsed as a condition, a line wrapped in ``{...}`` is
    kept as a forbidden clause, and every other line containing ``}`` or
    ``]`` must parse as a numerical or categorical hyperparameter.
    Several conditions on the same child are combined with an
    AND-conjunction.

    Args:
        pcs_string: Iterable of text lines.
        debug: Not used by this function.

    Returns:
        The populated ConfigurationSpace.

    Raises:
        NotImplementedError: If a condition or hyperparameter line cannot be
            parsed, or a forbidden clause uses an operator other than ``=``.
    """
    configuration_space = ConfigurationSpace()
    parsed_conditions = []
    forbidden_lines = []

    constructors = {
        "int": UniformIntegerHyperparameter,
        "float": UniformFloatHyperparameter,
        "categorical": CategoricalHyperparameter,
    }

    for raw_line in pcs_string:
        # Cut off a trailing comment, then drop quotes and outer whitespace.
        text = raw_line.split("#", 1)[0]
        text = text.replace('"', "").replace("'", "").strip()

        if "|" in text:
            # Condition line
            try:
                parsed_conditions.append(pp_condition.parseString(text))
            except pyparsing.ParseException:
                raise NotImplementedError("Could not parse condition: %s" % text)
            continue
        if not ("}" in text or "]" in text):
            continue
        if text.startswith("{") and text.endswith("}"):
            forbidden_lines.append(text)
            continue
        if not text:
            continue

        param = None

        # Numerical hyperparameter: tokens past the ninth carry the flags,
        # "i" selecting an integer and "l" a log scale.
        try:
            tokens = pp_cont_param.parseString(text)
        except pyparsing.ParseException:
            pass
        else:
            flags = tokens[9:]
            if len(flags) > 0:
                flags = flags[0]
            tokens = tokens[:9]
            kind = "int" if "i" in flags else "float"
            param = constructors[kind](
                name=tokens[0],
                lower=float(tokens[2]),
                upper=float(tokens[4]),
                q=None,
                log="l" in flags,
                default=float(tokens[7]),
            )

        # Categorical hyperparameter; a successful parse overrides the above.
        try:
            tokens = pp_cat_param.parseString(text)
        except pyparsing.ParseException:
            pass
        else:
            param = constructors["categorical"](
                name=tokens[0],
                choices=list(tokens[2:-4:2]),
                default=tokens[-2],
            )

        if param is None:
            raise NotImplementedError("Could not parse: %s" % text)

        configuration_space.add_hyperparameter(param)

    for clause in forbidden_lines:
        # TODO test this properly!
        # TODO Add a try/catch here!
        parsed = pp_forbidden_clause.parseString(clause)
        triple = []
        equals_clauses = []
        for token in parsed[1:]:
            if len(triple) < 3:
                triple.append(token)
                continue
            # A complete (name, operator, value) triple is available.
            # So far, only equals is supported by SMAC
            if triple[1] != '=':
                raise NotImplementedError()
            # TODO maybe add a check if the hyperparameter is
            # actually in the configuration space
            equals_clauses.append(
                ForbiddenEqualsClause(
                    configuration_space.get_hyperparameter(triple[0]),
                    triple[2]))
            triple = []
        configuration_space.add_forbidden_clause(
            ForbiddenAndConjunction(*equals_clauses))

    # Conditions sharing a child form an AND-conjunction, so collect them
    # per child before creating any condition object.
    conditions_per_child = OrderedDict()
    for tokens in parsed_conditions:
        conditions_per_child.setdefault(tokens[0], []).append(tokens)

    for child_name, child_conditions in conditions_per_child.items():
        condition_objects = []
        for tokens in child_conditions:
            child = configuration_space.get_hyperparameter(child_name)
            parent = configuration_space.get_hyperparameter(tokens[2])
            restrictions = tokens[5:-1:2]

            # TODO: cast the type of the restriction!
            if len(restrictions) == 1:
                built = EqualsCondition(child, parent, restrictions[0])
            else:
                built = InCondition(child, parent, values=restrictions)
            condition_objects.append(built)

        if len(condition_objects) > 1:
            configuration_space.add_condition(
                AndConjunction(*condition_objects))
        else:
            configuration_space.add_condition(condition_objects[0])

    return configuration_space
def add_params(cs: ConfigurationSpace):
    '''
        registers the "RandomForest" classifier choice and its ``rf:*``
        hyperparameters in the ConfigurationSpace; every ``rf:*``
        hyperparameter is only active while ``classifier`` is "RandomForest"
    '''
    try:
        classifier = cs.get_hyperparameter("classifier")
        classifier.choices.append("RandomForest")
    except KeyError:
        # No "classifier" hyperparameter yet: create it with a single choice.
        classifier = CategoricalHyperparameter(
            "classifier", choices=["RandomForest"], default="RandomForest")
        cs.add_hyperparameter(classifier)

    # (constructor, keyword arguments) in the order they are registered.
    rf_specs = (
        (UniformIntegerHyperparameter,
         dict(name="rf:n_estimators", lower=10, upper=100, default=10, log=True)),
        (CategoricalHyperparameter,
         dict(name="rf:criterion", choices=["gini", "entropy"], default="gini")),
        (CategoricalHyperparameter,
         dict(name="rf:max_features", choices=["sqrt", "log2", None], default="sqrt")),
        (UniformIntegerHyperparameter,
         dict(name="rf:max_depth", lower=10, upper=2**31, default=2**31, log=True)),
        (UniformIntegerHyperparameter,
         dict(name="rf:min_samples_split", lower=2, upper=100, default=2, log=True)),
        (UniformIntegerHyperparameter,
         dict(name="rf:min_samples_leaf", lower=2, upper=100, default=10, log=True)),
        (CategoricalHyperparameter,
         dict(name="rf:bootstrap", choices=[True, False], default=True)),
    )

    rf_params = []
    for factory, kwargs in rf_specs:
        rf_param = factory(**kwargs)
        cs.add_hyperparameter(rf_param)
        rf_params.append(rf_param)

    for rf_param in rf_params:
        cs.add_condition(InCondition(child=rf_param, parent=classifier,
                                     values=["RandomForest"]))
def _get_hyperparameter_search_space(
    self,
    dataset_properties: Dict[str, BaseDatasetPropertiesType],
    include: Optional[Dict[str, Any]] = None,
    exclude: Optional[Dict[str, Any]] = None,
) -> ConfigurationSpace:
    """Create the hyperparameter configuration space.

    For the given steps, and the Choices within that steps,
    this procedure returns a configuration space object to
    explore.

    The resulting space and the (possibly corrected) dataset properties are
    also stored on ``self.configuration_space`` and
    ``self.dataset_properties``.

    Args:
        dataset_properties (Dict[str, BaseDatasetPropertiesType]):
            Characteristics of the dataset to guide the pipeline choices of
            components. A non-dict value is replaced by an empty dict, and
            ``target_type`` is forced to ``'tabular_classification'``; when
            a dict is passed it is modified in place.
        include (Optional[Dict[str, Any]]): What hyper-parameter configurations
            to honor when creating the configuration space
        exclude (Optional[Dict[str, Any]]): What hyper-parameter configurations
            to remove from the configuration space

    Returns:
        cs (ConfigurationSpace): The configuration space describing the
            TabularClassificationPipeline.

    Raises:
        ValueError: If no default embedding can be found that is compatible
            with the forbidden embedding/encoder combinations.
    """
    cs = ConfigurationSpace()

    if not isinstance(dataset_properties, dict):
        # The two literals are concatenated, so the separating space must be
        # part of the text.
        warnings.warn(
            'The given dataset_properties argument contains an illegal value. '
            'Proceeding with the default value')
        dataset_properties = dict()

    if 'target_type' not in dataset_properties:
        dataset_properties['target_type'] = 'tabular_classification'
    if dataset_properties['target_type'] != 'tabular_classification':
        warnings.warn(
            'Tabular classification is being used, however the target_type '
            'is not given as "tabular_classification". Overriding it.')
        dataset_properties['target_type'] = 'tabular_classification'

    # get the base search space given this
    # dataset properties. Then overwrite with custom
    # classification requirements
    cs = self._get_base_search_space(cs=cs,
                                     dataset_properties=dataset_properties,
                                     exclude=exclude,
                                     include=include,
                                     pipeline=self.steps)

    # Here we add custom code, that is used to ensure valid configurations, For example
    # Learned Entity Embedding is only valid when encoder is one hot encoder
    if 'network_embedding' in self.named_steps.keys(
    ) and 'encoder' in self.named_steps.keys():
        embeddings = cs.get_hyperparameter(
            'network_embedding:__choice__').choices
        if 'LearnedEntityEmbedding' in embeddings:
            encoders = cs.get_hyperparameter('encoder:__choice__').choices
            # Fallback defaults to try when the current default embedding
            # collides with a forbidden clause.
            possible_default_embeddings = list(embeddings)
            possible_default_embeddings.remove('LearnedEntityEmbedding')

            for encoder in encoders:
                if encoder == 'OneHotEncoder':
                    continue
                while True:
                    try:
                        cs.add_forbidden_clause(
                            ForbiddenAndConjunction(
                                ForbiddenEqualsClause(
                                    cs.get_hyperparameter(
                                        'network_embedding:__choice__'),
                                    'LearnedEntityEmbedding'),
                                ForbiddenEqualsClause(
                                    cs.get_hyperparameter(
                                        'encoder:__choice__'), encoder)))
                        break
                    except ValueError:
                        # change the default and try again
                        try:
                            default = possible_default_embeddings.pop()
                        except IndexError:
                            raise ValueError(
                                "Cannot find a legal default configuration"
                            )
                        cs.get_hyperparameter(
                            'network_embedding:__choice__'
                        ).default_value = default

    self.configuration_space = cs
    self.dataset_properties = dataset_properties
    return cs
def __rely_model(self, cs: ConfigurationSpace):
    """Tune and apply estimator probabilities for the rely modes in place.

    For every rely mode listed in ``RelyModels.info`` the matching estimator
    choices are looked up, overlaps between modes are removed, and hyperopt
    searches for one probability per mode such that configurations sampled
    from ``cs`` spread evenly over the estimator choices. The best
    probabilities found are then written into ``cs`` through
    ``set_probabilities_in_cs``.

    Returns early without touching ``cs`` when ``RelyModels.info`` is empty
    or no rely mode matches any estimator.

    Args:
        cs: Configuration space containing ``estimator:__choice__``;
            modified in place.
    """
    if not RelyModels.info:
        return
    all_models = list(
        cs.get_hyperparameter("estimator:__choice__").choices)
    rely_model_counter = Counter([x[0] for x in RelyModels.info])
    # rely mode -> all models it applies to
    relied2AllModels = {}
    # rely mode -> models left after removing overlaps with other modes
    relied2models = {}
    for rely_model in rely_model_counter.keys():
        _, hit = self.get_forbid_hit_in_models_by_rely(
            all_models, rely_model)
        relied2AllModels[rely_model] = hit
    # Drop every rely mode that does not match any model.
    for k, v in list(relied2AllModels.items()):
        if not v:
            relied2AllModels.pop(k)
            rely_model_counter.pop(k)
    has_any_hit = any(relied2AllModels.values())
    if not has_any_hit:
        return
    # Compute relied2models (the overlap-free model lists), handling the
    # rely modes in ascending order of their occurrence count.
    relied_cnts_tuples = sorted(rely_model_counter.items(),
                                key=lambda x: x[-1])
    visited = set()
    for rely_model, _ in relied_cnts_tuples:
        models = relied2AllModels[rely_model]
        for other in set(rely_model_counter.keys()) - {rely_model}:
            if (rely_model, other) in visited:
                continue
            other_models = relied2AllModels[other]
            if len(other_models) <= len(models):
                models = list(set(models) - set(other_models))
                # NOTE(review): the pair is marked as resolved only once the
                # subtraction was applied, so the larger side still gets its
                # turn later -- confirm this nesting against the upstream
                # file, the flattened source does not show it.
                visited.add((rely_model, other))
                visited.add((other, rely_model))
        relied2models[rely_model] = models

    # The keys of relyModel2prob follow rely_model_counter.keys().
    def objective(relyModel2prob, debug=False):
        """Score one probability assignment; lower is better.

        The score is the variance of the per-estimator sample counts plus a
        penalty of 100 for every estimator choice that was never sampled.
        Any failure while sampling yields ``np.inf``.
        """
        cur_cs = deepcopy(cs)
        self.set_probabilities_in_cs(cur_cs, relied2models,
                                     relied2AllModels, all_models,
                                     **relyModel2prob)
        cur_cs.seed(42)
        try:
            counter = Counter([
                _hp.get("estimator:__choice__")
                for _hp in cur_cs.sample_configuration(
                    len(all_models) * 15)
            ])
            if debug:
                print(counter)
        except Exception:
            # Best effort: an assignment that cannot be sampled is simply
            # the worst possible candidate.
            return np.inf
        vl = list(counter.values())
        # Penalise against the full estimator list. The previous code used
        # `models`, a loop variable leaked from the overlap removal above;
        # that only shifted every score by the same constant.
        return np.var(vl) + 100 * (len(all_models) - len(vl))

    space = {}
    eps = 0.001
    N_rely_model = len(rely_model_counter.keys())
    for rely_model in rely_model_counter.keys():
        space[rely_model] = hp.uniform(rely_model, eps,
                                       (1 / N_rely_model) - eps)
    best = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=100,
        rstate=np.random.RandomState(42),
        show_progressbar=False,
    )
    print("best =", best)
    objective(best, debug=True)
    self.set_probabilities_in_cs(cs, relied2models, relied2AllModels,
                                 all_models, **best)