def __init__(
    self,
    estimator=None,
    max_evals=50,
    algo='tpe',
    cv=5,
    handle_cv_failure=False,
    scoring='accuracy',
    best_score=0.0,
    max_opt_time=None,
    max_eval_time=None,
    pgo: Optional[PGO] = None,
    show_progressbar=True,
    args_to_scorer=None,
    verbose=False,
):
    """Record the optimizer settings and build the Hyperopt search space.

    Every argument except ``pgo`` is stored on the instance under the same
    name; ``pgo`` is only forwarded to ``hyperopt_search_space``.

    Parameters
    ----------
    estimator : optional
        Operator or pipeline to tune. When None, a new
        ``LogisticRegression()`` instance is used.
    max_evals : int, optional
        Stored as ``self.max_evals``, by default 50.
    algo : str, optional
        Stored as ``self.algo``, by default 'tpe'.
    cv : optional
        Stored as ``self.cv``, by default 5.
    handle_cv_failure : bool, optional
        Stored as ``self.handle_cv_failure``, by default False.
    scoring : optional
        Stored as ``self.scoring``, by default 'accuracy'.
    best_score : float, optional
        Stored as ``self.best_score``, by default 0.0.
    max_opt_time : optional
        Stored as ``self.max_opt_time``, by default None.
    max_eval_time : optional
        Stored as ``self.max_eval_time``, by default None.
    pgo : Optional[PGO], optional
        Passed to ``hyperopt_search_space`` when the search space is built,
        by default None.
    show_progressbar : bool, optional
        Stored as ``self.show_progressbar``, by default True.
    args_to_scorer : dict, optional
        Stored as ``self.args_to_scorer``; None is replaced by a new empty
        dict.
    verbose : bool, optional
        Stored as ``self.verbose``, by default False.
    """
    self.max_evals = max_evals
    # A None estimator means "use the default classifier instance".
    self.estimator = LogisticRegression() if estimator is None else estimator
    estimator_space = hyperopt_search_space(self.estimator, pgo=pgo)
    self.search_space = hyperopt.hp.choice('meta_model', [estimator_space])
    self.algo = algo
    self.scoring = scoring
    self.best_score = best_score
    self.handle_cv_failure = handle_cv_failure
    self.cv = cv
    self._trials = hyperopt.Trials()
    self.max_opt_time = max_opt_time
    self.max_eval_time = max_eval_time
    self.show_progressbar = show_progressbar
    # Each instance gets its own dict when the caller supplies none.
    self.args_to_scorer = {} if args_to_scorer is None else args_to_scorer
    self.verbose = verbose
def __init__(
    self,
    model=None,
    max_evals=50,
    handle_cv_failure=False,
    pgo: Optional[PGO] = None,
):
    """Record the optimizer settings and build the Hyperopt search space.

    Parameters
    ----------
    model : optional
        Operator or pipeline to tune. When None, ``RandomForestRegressor``
        is used as-is (it is not called here).
    max_evals : int, optional
        Stored as ``self.max_evals``, by default 50.
    handle_cv_failure : bool, optional
        Stored as ``self.handle_cv_failure``, by default False.
    pgo : Optional[PGO], optional
        Passed to ``hyperopt_search_space`` when the search space is built,
        by default None.
    """
    self.max_evals = max_evals
    self.model = RandomForestRegressor if model is None else model
    model_space = hyperopt_search_space(self.model, pgo=pgo)
    self.search_space = hp.choice('meta_model', [model_space])
    self.handle_cv_failure = handle_cv_failure
    # New, empty trial history for this optimizer instance.
    self.trials = Trials()
def __init__(
    self,
    model=None,
    max_evals=50,
    cv=5,
    handle_cv_failure=False,
    pgo: Optional[PGO] = None,
):
    """
    Instantiate the HyperoptClassifier that will use the given model and
    other parameters to select the best performing trainable instantiation
    of the model.

    This optimizer uses the negation of accuracy_score as the performance
    metric to be minimized by Hyperopt.

    Parameters
    ----------
    model : lale.operators.IndividualOp or lale.operators.Pipeline, optional
        A valid Lale individual operator or pipeline. When None,
        LogisticRegression is used.
    max_evals : int, optional
        Number of trials of the Hyperopt search, by default 50.
    cv : int or cross-validation splitter, optional
        An integer, or an object that has a split function acting as a
        generator that yields (train, test) splits as arrays of indices.
        An integer value is used as the number of folds in
        sklearn.model_selection.StratifiedKFold. Any of the iterators from
        https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators
        can be used here.
        The fit method performs cross validation on the input dataset for
        each trial and uses the mean cross validation performance for
        optimization. This behavior is also affected by the
        handle_cv_failure flag. By default 5.
    handle_cv_failure : bool, optional
        A boolean flag indicating how to deal with a cross validation
        failure for a trial.
        If True, the trial is continued by doing an 80-20 percent
        train-validation split of the dataset input to fit and reporting
        the accuracy on the validation part.
        If False, the trial is terminated by assigning accuracy to zero.
        By default False.
    pgo : Optional[PGO], optional
        Passed to hyperopt_search_space when the search space is built,
        by default None.
    """
    self.max_evals = max_evals
    self.model = LogisticRegression if model is None else model
    model_space = hyperopt_search_space(self.model, pgo=pgo)
    self.search_space = hp.choice('meta_model', [model_space])
    self.handle_cv_failure = handle_cv_failure
    self.cv = cv
    # New, empty trial history for this optimizer instance.
    self.trials = Trials()
def __init__(
    self,
    estimator=None,
    max_evals=50,
    cv=5,
    handle_cv_failure=False,
    scoring='r2',
    best_score=1.0,
    max_opt_time=None,
    pgo: Optional[PGO] = None,
):
    """Record the optimizer settings and build the Hyperopt search space.

    Parameters
    ----------
    estimator : optional
        Operator or pipeline to tune. When None, ``RandomForestRegressor``
        is used as-is (it is not called here).
    max_evals : int, optional
        Stored as ``self.max_evals``, by default 50.
    cv : optional
        Stored as ``self.cv``, by default 5.
    handle_cv_failure : bool, optional
        Stored as ``self.handle_cv_failure``, by default False.
    scoring : optional
        Stored as ``self.scoring``, by default 'r2'.
    best_score : float, optional
        Stored as ``self.best_score``, by default 1.0.
    max_opt_time : optional
        Stored as ``self.max_opt_time``, by default None.
    pgo : Optional[PGO], optional
        Passed to ``hyperopt_search_space`` when the search space is built,
        by default None.
    """
    self.max_evals = max_evals
    self.estimator = RandomForestRegressor if estimator is None else estimator
    estimator_space = hyperopt_search_space(self.estimator, pgo=pgo)
    self.search_space = hp.choice('meta_model', [estimator_space])
    self.scoring = scoring
    self.best_score = best_score
    self.handle_cv_failure = handle_cv_failure
    self.cv = cv
    # New, empty trial history for this optimizer instance.
    self.trials = Trials()
    self.max_opt_time = max_opt_time
def __init__(
    self,
    estimator=None,
    max_evals=50,
    cv=5,
    handle_cv_failure=False,
    scoring='accuracy',
    best_score=0.0,
    max_opt_time=None,
    pgo: Optional[PGO] = None,
):
    """
    Instantiate the HyperoptClassifier that will use the given estimator
    and other parameters to select the best performing trainable
    instantiation of the estimator.

    Parameters
    ----------
    estimator : lale.operators.IndividualOp or lale.operators.Pipeline, optional
        A valid Lale individual operator or pipeline. When None,
        LogisticRegression is used.
    max_evals : int, optional
        Number of trials of the Hyperopt search, by default 50.
    cv : int or cross-validation splitter, optional
        An integer, or an object that has a split function acting as a
        generator that yields (train, test) splits as arrays of indices.
        An integer value is used as the number of folds in
        sklearn.model_selection.StratifiedKFold. Any of the iterators from
        https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators
        can be used here.
        The fit method performs cross validation on the input dataset for
        each trial and uses the mean cross validation performance for
        optimization. This behavior is also affected by the
        handle_cv_failure flag. By default 5.
    handle_cv_failure : bool, optional
        A boolean flag indicating how to deal with a cross validation
        failure for a trial.
        If True, the trial is continued by doing an 80-20 percent
        train-validation split of the dataset input to fit and reporting
        the score on the validation part.
        If False, the trial is terminated by assigning accuracy to zero.
        By default False.
    scoring : str or scorer object, optional
        A string from sklearn.metrics.SCORERS.keys(), or a scorer created
        with
        https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html#sklearn.metrics.make_scorer
        from one of sklearn.metrics
        (https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics).
        A completely custom scorer object can be created from a python
        function following the example at
        https://scikit-learn.org/stable/modules/model_evaluation.html
        The metric has to return a scalar value. Note that scikit-learn's
        scorer objects always return values such that a higher score is
        better; since Hyperopt solves a minimization problem, the score
        value is negated before it is passed to Hyperopt.
        By default 'accuracy'.
    best_score : float, optional
        The best score for the specified scorer. This allows returning a
        loss to Hyperopt that is greater than or equal to zero, where zero
        is the best loss. By default, this is set to zero to follow
        current behavior.
    max_opt_time : float, optional
        Maximum amount of time in seconds for the optimization.
        By default None, implying no runtime bound.
    pgo : Optional[PGO], optional
        Passed to hyperopt_search_space when the search space is built,
        by default None.

    Examples
    --------
    >>> from sklearn.metrics import make_scorer, f1_score, accuracy_score
    >>> lr = LogisticRegression()
    >>> clf = HyperoptClassifier(estimator=lr, scoring='accuracy', cv=5, max_evals=2)
    >>> from sklearn import datasets
    >>> diabetes = datasets.load_diabetes()
    >>> X = diabetes.data[:150]
    >>> y = diabetes.target[:150]
    >>> trained = clf.fit(X, y)
    >>> predictions = trained.predict(X)

    Other scoring metrics:

    >>> clf = HyperoptClassifier(estimator=lr, scoring=make_scorer(f1_score, average='macro'), cv=3, max_evals=2)
    """
    self.max_evals = max_evals
    self.estimator = LogisticRegression if estimator is None else estimator
    estimator_space = hyperopt_search_space(self.estimator, pgo=pgo)
    self.search_space = hp.choice('meta_model', [estimator_space])
    self.scoring = scoring
    self.best_score = best_score
    self.handle_cv_failure = handle_cv_failure
    self.cv = cv
    # New, empty trial history for this optimizer instance.
    self.trials = Trials()
    self.max_opt_time = max_opt_time
def test_hyperparam_overriding_with_hyperopt(self):
    """A PCA with ``n_components`` set must not share a default PCA's search space."""
    # Build both operators first, then derive their search spaces in order.
    operators = (PCA(n_components=3), PCA())
    overridden_space, default_space = [
        hyperopt_search_space(op) for op in operators
    ]
    self.assertNotEqual(overridden_space, default_space)
def test_lr_parameters(self):
    """Smoke test: build a LogisticRegression search space using a loaded PGO file."""
    loaded_pgo = PGO.load_pgo_file(example_pgo_fp)
    classifier = LogisticRegression()
    # Only checks that construction succeeds; the result is not inspected.
    space: SearchSpace = hyperopt_search_space(classifier, pgo=loaded_pgo)
def fit(self, X_train, y_train):
    """Run the Hyperopt search and train the best configuration found.

    The search runs in up to two phases: ``self.evals_with_defaults`` trials
    over a search space built from ``self.estimator.freeze_trainable()``,
    then ``self.max_evals - self.evals_with_defaults`` trials over the
    search space built from ``self.estimator``. Both trial histories are
    merged into ``self._trials`` and the best parameters are used to fit a
    final estimator on all of ``X_train``/``y_train``.

    Parameters
    ----------
    X_train
        Training features, passed to cross validation and to the final fit.
    y_train
        Training targets, passed to cross validation and to the final fit.

    Returns
    -------
    self
        ``self._best_estimator`` holds the trained estimator, or None when
        the best parameters could not be extracted or the final fit failed.

    Raises
    ------
    ValueError
        When Hyperopt reports that all trials failed and ``self._trials``
        contains no trial with status ``hyperopt.STATUS_OK``.
    """
    opt_start_time = time.time()
    is_clf = self.estimator.is_classifier()
    self.cv = check_cv(self.cv, y=y_train, classifier=is_clf)
    data_schema = lale.helpers.fold_schema(X_train, y_train, self.cv, is_clf)
    self.search_space = hyperopt.hp.choice(
        'meta_model',
        [hyperopt_search_space(self.estimator, pgo=self.pgo, data_schema=data_schema)])
    # Create a search space with default hyperparameters for all trainable
    # parts of the pipeline. This search space is used for
    # `frac_evals_with_defaults` fraction of the total trials.
    try:
        self.search_space_with_defaults = hyperopt.hp.choice(
            'meta_model',
            [hyperopt_search_space(self.estimator.freeze_trainable(),
                                   pgo=self.pgo, data_schema=data_schema)])
    except Exception:
        # Narrowed from a bare `except:` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed here.
        logger.warning(f"Exception caught during generation of default search space, setting frac_evals_with_defaults to zero.")
        self.evals_with_defaults = 0

    def hyperopt_train_test(params, X_train, y_train):
        """Cross-validate one configuration; return (score, log loss, time)."""
        warnings.filterwarnings("ignore")

        trainable = create_instance_from_hyperopt_search_space(self.estimator, params)
        try:
            cv_score, logloss, execution_time = cross_val_score_track_trials(
                trainable, X_train, y_train, cv=self.cv,
                scoring=self.scoring, args_to_scorer=self.args_to_scorer)
            logger.debug("Successful trial of hyperopt with hyperparameters:{}".format(params))
        except BaseException as e:
            # If there is any error in cross validation, use the score based
            # on a random train-test split as the evaluation criterion.
            if self.handle_cv_failure:
                X_train_part, X_validation, y_train_part, y_validation = train_test_split(
                    X_train, y_train, test_size=0.20)
                start = time.time()
                trained = trainable.fit(X_train_part, y_train_part)
                scorer = check_scoring(trainable, scoring=self.scoring)
                cv_score = scorer(trained, X_validation, y_validation, **self.args_to_scorer)
                execution_time = time.time() - start
                try:
                    # predict_proba is inside the try so that an estimator
                    # without it falls back to logloss = 0 instead of
                    # failing the whole fallback evaluation.
                    y_pred_proba = trained.predict_proba(X_validation)
                    logloss = log_loss(y_true=y_validation, y_pred=y_pred_proba)
                except BaseException:
                    logloss = 0
                    logger.debug("Warning, log loss cannot be computed")
            else:
                logger.debug(e)
                logger.debug("Error {} with pipeline:{}".format(e, trainable.to_json()))
                raise
        return cv_score, logloss, execution_time

    def merge_trials(trials1, trials2):
        """Append every trial of ``trials2`` to ``trials1`` with fresh tids."""
        # default=-1 keeps the merge working when trials1 is empty: the
        # merged trials then keep their original tids.
        max_tid = max((trial['tid'] for trial in trials1.trials), default=-1)

        for trial in trials2:
            tid = trial['tid'] + max_tid + 1
            hyperopt_trial = hyperopt.Trials().new_trial_docs(
                tids=[None],
                specs=[None],
                results=[None],
                miscs=[None])
            hyperopt_trial[0] = trial
            hyperopt_trial[0]['tid'] = tid
            hyperopt_trial[0]['misc']['tid'] = tid
            for key in hyperopt_trial[0]['misc']['idxs'].keys():
                hyperopt_trial[0]['misc']['idxs'][key] = [tid]
            trials1.insert_trial_docs(hyperopt_trial)
            trials1.refresh()
        return trials1

    def proc_train_test(params, X_train, y_train, return_dict):
        """Evaluate ``params`` and write the Hyperopt result into ``return_dict``."""
        return_dict['params'] = copy.deepcopy(params)
        try:
            score, logloss, execution_time = hyperopt_train_test(
                params, X_train=X_train, y_train=y_train)
            # Hyperopt minimizes, so the loss is the distance to the best score.
            return_dict['loss'] = self.best_score - score
            return_dict['time'] = execution_time
            return_dict['log_loss'] = logloss
            return_dict['status'] = hyperopt.STATUS_OK
        except BaseException as e:
            logger.warning(f"Exception caught in Hyperopt:{type(e)}, {traceback.format_exc()} with hyperparams: {params}, setting status to FAIL")
            return_dict['status'] = hyperopt.STATUS_FAIL
            return_dict['error_msg'] = f"Exception caught in Hyperopt:{type(e)}, {traceback.format_exc()} with hyperparams: {params}"
            if self.verbose:
                print(return_dict['error_msg'])

    def get_final_trained_estimator(params, X_train, y_train):
        """Instantiate the estimator for ``params`` and fit it on all data."""
        warnings.filterwarnings("ignore")
        trainable = create_instance_from_hyperopt_search_space(self.estimator, params)
        trained = trainable.fit(X_train, y_train)
        return trained

    def f(params):
        """Objective passed to ``hyperopt.fmin``; returns the result dict."""
        current_time = time.time()
        if (self.max_opt_time is not None) and ((current_time - opt_start_time) > self.max_opt_time):
            # If max optimization time is set and we have crossed it, exit
            # the optimization completely. The SystemExit is caught around
            # the hyperopt.fmin calls below.
            sys.exit(0)
        if self.max_eval_time:
            # Run the evaluation in a subprocess that can be interrupted.
            manager = multiprocessing.Manager()
            proc_dict = manager.dict()
            p = multiprocessing.Process(
                target=proc_train_test,
                args=(params, X_train, y_train, proc_dict))
            p.start()
            p.join(self.max_eval_time)
            if p.is_alive():
                p.terminate()
                p.join()
                logger.warning(f"Maximum alloted evaluation time exceeded. with hyperparams: {params}, setting status to FAIL")
                proc_dict['status'] = hyperopt.STATUS_FAIL
            if 'status' not in proc_dict:
                logger.warning(f"Corrupted results, setting status to FAIL")
                proc_dict['status'] = hyperopt.STATUS_FAIL
        else:
            proc_dict = {}
            proc_train_test(params, X_train, y_train, proc_dict)
        return proc_dict

    algo = getattr(hyperopt, self.algo)
    # Search in the search space with defaults.
    if self.evals_with_defaults > 0:
        try:
            hyperopt.fmin(f, self.search_space_with_defaults, algo=algo.suggest,
                          max_evals=self.evals_with_defaults,
                          trials=self._default_trials,
                          rstate=np.random.RandomState(SEED),
                          show_progressbar=self.show_progressbar)
        except SystemExit:
            logger.warning('Maximum alloted optimization time exceeded. Optimization exited prematurely')
        except AllTrialsFailed:
            self._best_estimator = None
            # NOTE(review): this inspects self._trials, not
            # self._default_trials, although only the default trials have
            # run at this point -- confirm which history was intended.
            if hyperopt.STATUS_OK not in self._trials.statuses():
                raise ValueError('Error from hyperopt, none of the trials succeeded.')

    try:
        hyperopt.fmin(f, self.search_space, algo=algo.suggest,
                      max_evals=self.max_evals - self.evals_with_defaults,
                      trials=self._trials,
                      rstate=np.random.RandomState(SEED),
                      show_progressbar=self.show_progressbar)
    except SystemExit:
        logger.warning('Maximum alloted optimization time exceeded. Optimization exited prematurely')
    except AllTrialsFailed:
        self._best_estimator = None
        if hyperopt.STATUS_OK not in self._trials.statuses():
            raise ValueError('Error from hyperopt, none of the trials succeeded.')

    self._trials = merge_trials(self._trials, self._default_trials)
    try:
        best_trial = self._trials.best_trial
        val_loss = self._trials.best_trial['result']['loss']
        if len(self._default_trials) > 0:
            default_val_loss = self._default_trials.best_trial['result']['loss']
            if default_val_loss < val_loss:
                best_trial = self._default_trials.best_trial
        best_params = best_trial['result']['params']
        logger.info(
            'best score: {:.1%}\nbest hyperparams found using {} hyperopt trials: {}'.format(
                self.best_score - self._trials.average_best_error(), self.max_evals, best_params
            )
        )
        trained = get_final_trained_estimator(best_params, X_train, y_train)
        self._best_estimator = trained
    except BaseException as e:
        logger.warning('Unable to extract the best parameters from optimization, the error: {}'.format(e))
        self._best_estimator = None

    return self
def test_hyperparam_defaults(self):
    """Smoke test: a default-constructed J48 yields a search space without raising."""
    # The resulting search space is intentionally discarded.
    hyperopt_search_space(J48())
del params['name'] clf = get_classifier(t, params) clf_trained = clf.fit(X_train, y_train) predictions = clf_trained.predict(X_test) accuracy = accuracy_score(y_test, [round(pred) for pred in predictions]) return accuracy def get_classifier(t, param_dict): if 'LogisticRegression' in t: clf = LogisticRegression(**param_dict) else: return 0 return clf search_space = hp.choice('classifier', [hyperopt_search_space(LogisticRegression)]) count = 0 best = 0 def f(params): global best, count count += 1 acc = hyperopt_train_test(params.copy()) if acc > best: print('new best:', acc, 'using', params['name']) best = acc if count % 1 == 0: print('iters:', count, ', acc:', acc, 'using', params) return {'loss': -acc, 'status': STATUS_OK} trials = Trials()