def dont_test_smac_choice(self):
    """Run a one-evaluation SMAC search over a pipeline containing operator choices.

    The planned pipeline chains an optional one-hot encoder, a choice of
    transformer (PCA, Nystroem or NoOp) and a choice of classifier
    (LogisticRegression or KNeighborsClassifier).  Its SMAC configuration
    space is built with ``lale_num_grids=1`` and optimized with a
    run-count limit of 1; the incumbent's cost is then printed.

    NOTE(review): the ``dont_test_`` prefix presumably keeps test runners
    from collecting this method -- confirm before renaming.
    """
    import numpy as np

    # Import ConfigSpace and different types of parameters
    from smac.configspace import ConfigurationSpace
    from smac.facade.smac_facade import SMAC as orig_SMAC
    from smac.scenario.scenario import Scenario

    # Import SMAC-utilities (imported locally, like in test_smac, so the
    # name is guaranteed to be in scope when this method runs)
    from lale.search.lale_smac import get_smac_space

    tfm = PCA() | Nystroem() | NoOp()
    planned_pipeline1 = (
        (OneHotEncoder(handle_unknown="ignore", sparse=False) | NoOp())
        >> tfm
        >> (LogisticRegression() | KNeighborsClassifier())
    )

    cs: ConfigurationSpace = get_smac_space(planned_pipeline1, lale_num_grids=1)

    # Scenario object
    scenario = Scenario(
        {
            "run_obj": "quality",  # we optimize quality (alternatively runtime)
            "runcount-limit": 1,  # maximum function evaluations
            "cs": cs,  # configuration space
            "deterministic": "true",
        }
    )

    # Optimize, using a SMAC-object
    tae = iris_fmin_tae(planned_pipeline1, num_folds=2)
    print("Optimizing! Depending on your machine, this might take a few minutes.")
    smac = orig_SMAC(
        scenario=scenario, rng=np.random.RandomState(42), tae_runner=tae
    )

    incumbent = smac.optimize()

    # Re-evaluate the incumbent with the same target function to report its cost.
    inc_value = tae(incumbent)

    print("Optimized Value: %.2f" % (inc_value))
def test_smac(self):
    """Smoke-test a single-evaluation SMAC run over LogisticRegression.

    Builds the SMAC configuration space for a LogisticRegression operator,
    runs the optimizer with a run-count limit of 1 and a fixed random
    state, then prints the cost of the returned incumbent.
    """
    import numpy as np

    # ConfigSpace type plus the SMAC facade and scenario classes
    from smac.configspace import ConfigurationSpace
    from smac.facade.smac_facade import SMAC as orig_SMAC
    from smac.scenario.scenario import Scenario

    # Lale helper that turns an operator into a SMAC configuration space
    from lale.search.lale_smac import get_smac_space

    classifier = LogisticRegression()
    config_space: ConfigurationSpace = get_smac_space(classifier)

    # Scenario settings: optimize quality, one evaluation, and keep going
    # even if the first run crashes.
    scenario_settings = {
        "run_obj": "quality",
        "runcount-limit": 1,
        "cs": config_space,
        "deterministic": "true",
        "abort_on_first_run_crash": False,
    }
    smac_scenario = Scenario(scenario_settings)

    # Target function evaluated by SMAC for every configuration
    target_runner = iris_fmin_tae(classifier, num_folds=2)

    print("Optimizing! Depending on your machine, this might take a few minutes.")
    random_state = np.random.RandomState(42)
    optimizer = orig_SMAC(
        scenario=smac_scenario, rng=random_state, tae_runner=target_runner
    )

    best_config = optimizer.optimize()

    # Evaluate the incumbent once more to report its cost.
    best_cost = target_runner(best_config)

    print("Optimized Value: %.2f" % (best_cost))
def __init__(
    self,
    estimator=None,
    max_evals=50,
    cv=5,
    handle_cv_failure=False,
    scoring='accuracy',
    best_score=0.0,
    max_opt_time=None,
    lale_num_grids=None,
):
    """
    Instantiate the SMAC that will use the given estimator and other parameters to select the
    best performing trainable instantiation of the estimator.

    Parameters
    ----------
    estimator : lale.operators.IndividualOp or lale.operators.Pipeline, optional
        A valid Lale individual operator or pipeline, by default LogisticRegression
    max_evals : int, optional
        Number of trials of SMAC search, i.e. the "runcount-limit" of the SMAC scenario,
        by default 50
    cv : an integer or an object that has a split function as a generator yielding (train, test) splits as arrays of indices.
        Integer value is used as number of folds in sklearn.model_selection.StratifiedKFold, default is 5.
        Note that any of the iterators from https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators can be used here.
        The fit method performs cross validation on the input dataset for each trial,
        and uses the mean cross validation performance for optimization. This behavior is also impacted by the
        handle_cv_failure flag, by default 5
    handle_cv_failure : bool, optional
        A boolean flag indicating how to deal with cross validation failure for a trial.
        If True, the trial is continued by doing an 80-20 percent train-validation split of the dataset input to fit
        and reporting the score on the validation part.
        If False, the error is logged and re-raised so that the trial fails, by default False
    scoring : string or a scorer object created using
        https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html#sklearn.metrics.make_scorer.
        A string from sklearn.metrics.SCORERS.keys() can be used or a scorer created from one of
        sklearn.metrics (https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics).
        A completely custom scorer object can be created from a python function following the example at
        https://scikit-learn.org/stable/modules/model_evaluation.html
        The metric has to return a scalar value, and note that scikit-learn's scorer object always returns values such that
        higher score is better. Since SMAC solves a minimization problem, we pass (best_score - score) to SMAC,
        by default 'accuracy'.
    best_score : float, optional
        The best score for the specified scorer. This allows us to return a loss to SMAC that is
        greater than or equal to zero, where zero is the best loss. By default, zero.
    max_opt_time : float, optional
        Maximum amount of wall clock time in seconds for the optimization (the "wallclock_limit"
        of the SMAC scenario). By default, None, implying no runtime bound.
    lale_num_grids : optional
        Forwarded unchanged to get_smac_space when the search space is built. By default, None.

    Examples
    --------
    >>> from sklearn.metrics import make_scorer, f1_score, accuracy_score
    >>> lr = LogisticRegression()
    >>> clf = SMAC(estimator=lr, scoring='accuracy', cv=5)
    >>> from sklearn import datasets
    >>> diabetes = datasets.load_diabetes()
    >>> X = diabetes.data[:150]
    >>> y = diabetes.target[:150]
    >>> trained = clf.fit(X, y)
    >>> predictions = trained.predict(X)

    Other scoring metrics:

    >>> clf = SMAC(estimator=lr, scoring=make_scorer(f1_score, average='macro'), cv=3, max_evals=2)
    """
    self.max_evals = max_evals
    if estimator is None:
        self.estimator = LogisticRegression()
    else:
        self.estimator = estimator

    # fit() reads self.lale_num_grids when it rebuilds the search space, so
    # the constructor argument has to be kept on the instance.
    self.lale_num_grids = lale_num_grids
    self.search_space: ConfigurationSpace = get_smac_space(
        self.estimator, lale_num_grids=lale_num_grids
    )
    self.scoring = scoring
    self.best_score = best_score
    self.handle_cv_failure = handle_cv_failure
    self.cv = cv
    self.max_opt_time = max_opt_time

    # Scenario object
    scenario_options = {
        "run_obj": "quality",  # we optimize quality (alternatively runtime)
        "runcount-limit": self.max_evals,  # maximum function evaluations
        "cs": self.search_space,  # configuration space
        "deterministic": "true",
        "abort_on_first_run_crash": False,
    }
    # Only bound the wall clock time when the caller asked for it.
    if max_opt_time is not None:
        scenario_options["wallclock_limit"] = max_opt_time

    self.scenario = Scenario(scenario_options)
    # Populated by fit() with the SMAC run history.
    self.trials = None
def fit(self, X_train, y_train):
    """Search the estimator's configuration space with SMAC and train the best configuration.

    The search space and the SMAC scenario are rebuilt from the data
    schema of ``X_train``/``y_train``.  Each configuration proposed by
    SMAC is scored with cross validation and the loss reported to SMAC is
    ``best_score - score``.  The incumbent configuration is finally
    trained on the whole training set and kept in ``self._best_estimator``;
    the SMAC run history is kept in ``self.trials``.

    Parameters
    ----------
    X_train :
        Training features, passed to cross validation and to the final fit.
    y_train :
        Training targets, passed to cross validation and to the final fit.

    Returns
    -------
    self
        Always returned, even when the optimization fails; in that case
        ``self._best_estimator`` is None and a warning is logged.
    """
    data_schema = lale.helpers.fold_schema(
        X_train, y_train, self.cv, self.estimator.is_classifier()
    )
    self.search_space: ConfigurationSpace = get_smac_space(
        self.estimator,
        lale_num_grids=self.lale_num_grids,
        data_schema=data_schema,
    )

    # Scenario object
    scenario_options = {
        "run_obj": "quality",  # optimize quality (alternatively runtime)
        "runcount-limit": self.max_evals,  # maximum function evaluations
        "cs": self.search_space,  # configuration space
        "deterministic": "true",
        "abort_on_first_run_crash": False,
    }
    if self.max_opt_time is not None:
        scenario_options["wallclock_limit"] = self.max_opt_time
    self.scenario = Scenario(scenario_options)

    self.cv = check_cv(
        self.cv, y=y_train, classifier=self.estimator.is_classifier()
    )

    def smac_train_test(trainable, X_train, y_train):
        """Score one trainable; return (score, log loss, execution time)."""
        try:
            cv_score, logloss, execution_time = cross_val_score_track_trials(
                trainable, X_train, y_train, cv=self.cv, scoring=self.scoring
            )
            logger.debug("Successful trial of SMAC")
        except Exception as e:
            # Exception (not BaseException) so that KeyboardInterrupt and
            # SystemExit abort the search instead of starting a fallback fit.
            if not self.handle_cv_failure:
                logger.debug(
                    "Error {} with pipeline:{}".format(e, trainable.to_json())
                )
                raise
            # If there is any error in cross validation, use the score based
            # on a random train-test split as the evaluation criterion.
            (
                X_train_part,
                X_validation,
                y_train_part,
                y_validation,
            ) = train_test_split(X_train, y_train, test_size=0.20)
            start = time.time()
            trained = trainable.fit(X_train_part, y_train_part)
            scorer = check_scoring(trainable, scoring=self.scoring)
            cv_score = scorer(trained, X_validation, y_validation)
            execution_time = time.time() - start
            try:
                # predict_proba is inside the try so that estimators without
                # it still yield a score, with the log loss defaulting to 0.
                y_pred_proba = trained.predict_proba(X_validation)
                logloss = log_loss(y_true=y_validation, y_pred=y_pred_proba)
            except Exception:
                logloss = 0
                logger.debug("Warning, log loss cannot be computed")
        return cv_score, logloss, execution_time

    def f(trainable):
        """Objective handed to SMAC: loss of one trainable (lower is better)."""
        try:
            score, _logloss, _execution_time = smac_train_test(
                trainable, X_train=X_train, y_train=y_train
            )
        except Exception as e:
            logger.warning(
                f"Exception caught in SMACCV:{type(e)}, {traceback.format_exc()}, SMAC will set a cost_for_crash to MAXINT."
            )
            raise
        return self.best_score - score

    # Make sure the attribute exists (and is not stale from an earlier fit)
    # on every exit path, including an exhausted time budget.
    self._best_estimator = None
    try:
        smac = orig_SMAC(
            scenario=self.scenario,
            rng=np.random.RandomState(42),
            tae_runner=lale_op_smac_tae(self.estimator, f),
        )
        incumbent = smac.optimize()
        self.trials = smac.get_runhistory()
        trainable = lale_trainable_op_from_config(self.estimator, incumbent)
        # get the trainable corresponding to the best params and train it on
        # the entire training dataset.
        trained = trainable.fit(X_train, y_train)
        self._best_estimator = trained
    except BudgetExhaustedException:
        logger.warning(
            "Maximum alloted optimization time exceeded. Optimization exited prematurely"
        )
    except Exception as e:
        logger.warning("Error during optimization: {}".format(e))
        self._best_estimator = None

    return self
def dont_test_car_smac(self):
    """Run a one-evaluation SMAC search on the car dataset and report accuracy.

    The labels are integer-encoded, a pipeline choosing between
    ArulesCBAClassifier, KNeighborsClassifier and LogisticRegression is
    optimized with SMAC, the incumbent is trained on the training split,
    and its test accuracy and optimized cost are printed.

    NOTE: make_scorer, J48 and HyperoptClassifier are imported but not used
    below; the imports are kept so importing behaves exactly as before.
    """
    import numpy as np
    from lale.datasets.auto_weka import fetch_car
    from sklearn.metrics import accuracy_score, make_scorer
    from sklearn.preprocessing import LabelEncoder
    import pandas as pd
    from lale.lib.weka import J48
    from lalegpl.lib.r import ArulesCBAClassifier
    from lale.operators import make_pipeline
    from lale.lib.lale import HyperoptClassifier
    from lale.lib.sklearn import LogisticRegression, KNeighborsClassifier
    from smac.scenario.scenario import Scenario
    from smac.facade.smac_facade import SMAC
    from smac.configspace import ConfigurationSpace

    (X_train, y_train), (X_test, y_test) = fetch_car()

    # Encode the labels as integers while keeping the original series name.
    target_name = y_train.name
    label_encoder = LabelEncoder()
    encoded_train = label_encoder.fit_transform(y_train)
    encoded_test = label_encoder.transform(y_test)
    y_train = pd.Series(encoded_train, name=target_name)
    y_test = pd.Series(encoded_test, name=target_name)

    classifier_choice = (
        ArulesCBAClassifier() | KNeighborsClassifier() | LogisticRegression()
    )
    planned_pipeline = make_pipeline(classifier_choice)

    config_space: ConfigurationSpace = get_smac_space(planned_pipeline)
    print(config_space)

    # Scenario: optimize quality within a fixed number of evaluations and
    # keep going even if the first run crashes.
    run_count_limit = 1
    scenario_settings = {
        "run_obj": "quality",
        "runcount-limit": run_count_limit,
        "cs": config_space,
        "deterministic": "true",
        "abort_on_first_run_crash": False,
    }
    smac_scenario = Scenario(scenario_settings)

    def objective(op):
        # Cost of one operator on the training split.
        return test_f_min(op, X_train, y_train, num_folds=2)

    target_runner = lale_op_smac_tae(planned_pipeline, objective)

    print("Optimizing! Depending on your machine, this might take a few minutes.")
    optimizer = SMAC(
        scenario=smac_scenario,
        rng=np.random.RandomState(42),
        tae_runner=target_runner,
    )

    best_config = optimizer.optimize()

    # Train the incumbent configuration and score it on the held-out split.
    best_trainable = lale_trainable_op_from_config(planned_pipeline, best_config)
    best_trained = best_trainable.fit(X_train, y_train)
    predictions = best_trained.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f" % (test_accuracy))

    best_cost = target_runner(best_config)

    print("Optimized Value: %.2f" % (best_cost))
    print(f"Run count limit: {run_count_limit}")