class _NSGA2Impl:
    def __init__(
        self,
        estimator=None,
        scoring=None,
        best_score=0.0,
        cv=5,
        max_evals=50,
        max_opt_time=None,
        population_size=10,
        random_seed=42,
    ):
        if estimator is None:
            self.model = LogisticRegression()
        else:
            self.model = estimator
        assert isinstance(self.model, lale.operators.IndividualOp), (
            "Multi-objective optimization is currently supported only for "
            "individual operators, not for pipelines."
        )
        logger.info(f"Optimizing model {self.model} with type {type(self.model)}")
        logger.info(f"Lale param ranges -\n{self.model.get_param_ranges()}")
        self.model_helper = _ModelHelper(self.model)

        self.moo_solutions = []
        self.scoring = scoring
        assert self.scoring is not None, "scoring parameter not specified."
        assert len(self.scoring) >= 2, "Fewer than two scorers specified in scoring"

        # Pad best_score with zeros if fewer entries than scorers were given,
        # or broadcast a scalar best_score to all scorers.
        if isinstance(best_score, list):
            if len(best_score) < len(scoring):
                best_score.extend([0.0] * (len(scoring) - len(best_score)))
            self.best_score = best_score
        else:
            self.best_score = [best_score] * len(scoring)

        self.cv = cv
        self.max_evals = max_evals
        self.max_opt_time = max_opt_time
        self.population_size = population_size
        self.random_seed = random_seed

    @classmethod
    def validate_hyperparams(cls, scoring=None, best_score=0, **hyperparams):
        check_scoring_best_score_constraint(scoring, best_score)

    # Internal class to hold one pareto-optimal solution.
    class Soln(object):
        def __init__(self, variables, objectives):
            self.variables = variables
            self.objectives = objectives

    # Convert a parameter list to a dictionary, decoding categorical indices.
    def param_to_dict(self, parameter, param_choices, param_categories, param_type):
        temp = {}
        i = 0
        for key in param_choices:
            if key not in param_categories.keys():  # non-categorical parameter
                if param_type[key] == "boolean":
                    temp[key] = parameter[i][0]
                else:
                    temp[key] = parameter[i]
            else:
                temp[key] = param_categories[key][parameter[i]]
            i += 1
        return temp

    def fit(self, X, y):
        opt_start_time = time.time()
        kfold = None
        if isinstance(self.cv, int) and self.cv == 1:
            X_train, X_val, y_train, y_val = train_test_split(
                X, y, test_size=0.2, random_state=self.random_seed, stratify=y
            )
            logger.info(
                "Not using Cross-Validation. Performing single train/test split"
            )
        else:
            is_clf = self.model.is_classifier()
            kfold = check_cv(self.cv, y=y, classifier=is_clf)
            logger.info(f"Using Cross-Validation - {kfold}")

        self.ind = 0

        def train_test_model(parameter):
            # First check whether we exceeded the allocated time budget.
            current_time = time.time()
            elapsed_time = current_time - opt_start_time
            if (self.max_opt_time is not None) and (elapsed_time > self.max_opt_time):
                msg = (
                    f"Max optimization time exceeded. "
                    f"Max Opt time = {self.max_opt_time}, Elapsed Time = {elapsed_time}, "
                    f"NFE Completed - {self.ind}"
                )
                raise MaxBudgetExceededException(msg)

            self.ind = self.ind + 1
            logger.info(f"Training population {self.ind}")
            parameter = self.param_to_dict(
                parameter,
                self.model_helper.param_choices,
                self.model_helper.param_categories,
                self.model_helper.param_type,
            )
            scorers = [get_scorer(scorer) for scorer in self.scoring]
            nscorers = len(scorers)
            try:
                if kfold is None:
                    clf = self.model_helper.create_instance(parameter)
                    clf_trained = clf.fit(X_train, y_train)
                    obj_val = [
                        scorer(clf_trained, X_val, y_val) for scorer in scorers
                    ]
                else:
                    obj_scores = [[] for _ in range(nscorers)]
                    # Perform k-fold cross-validation.
                    for train_index, test_index in kfold.split(X, y):
                        if isinstance(X, pd.DataFrame):
                            X_train_split, X_val_split = (
                                X.iloc[train_index],
                                X.iloc[test_index],
                            )
                            y_train_split, y_val_split = (
                                y.iloc[train_index],
                                y.iloc[test_index],
                            )
                        else:
                            X_train_split, X_val_split = X[train_index], X[test_index]
                            y_train_split, y_val_split = y[train_index], y[test_index]

                        clf = self.model_helper.create_instance(parameter)
                        clf_trained = clf.fit(X_train_split, y_train_split)
                        obj_score = [
                            scorer(clf_trained, X_val_split, y_val_split)
                            for scorer in scorers
                        ]
                        for i in range(nscorers):
                            obj_scores[i].append(obj_score[i])

                    # Aggregate the CV scores per scorer.
                    obj_val = [np.mean(obj_scores[i]) for i in range(nscorers)]
                    logger.debug(f"Obj k-fold scores - {obj_scores}")

                # By default we are solving a minimization MOO problem.
                fitnessValue = [
                    self.best_score[i] - obj_val[i] for i in range(nscorers)
                ]
                logger.info(f"Train fitnessValue - {fitnessValue}")
            except jsonschema.ValidationError as e:
                logger.error(f"Caught JSON schema validation error.\n{e}")
                logger.error("Setting fitness (loss) values to infinity")
                fitnessValue = [np.inf for i in range(nscorers)]
                logger.info(f"Train fitnessValue - {fitnessValue}")
            return fitnessValue

        def time_check_callback(alg):
            current_time = time.time()
            elapsed_time = current_time - opt_start_time
            logger.info(f"NFE Complete - {alg.nfe}, Elapsed Time - {elapsed_time}")

        parameter_num = len(self.model_helper.param_choices)
        target_num = len(self.scoring)
        # Adjust max_evals if it is not a multiple of the population size. This
        # is required as Platypus performs evaluations in multiples of
        # population_size.
        adjusted_max_evals = (
            self.max_evals // self.population_size
        ) * self.population_size
        if adjusted_max_evals != self.max_evals:
            logger.info(
                f"Adjusting max_evals to {adjusted_max_evals} from specified {self.max_evals}"
            )

        problem = Problem(parameter_num, target_num)
        problem.types[:] = self.model_helper.types
        problem.function = train_test_model

        # Set the variator based on the types of the decision variables:
        # use a compound operator if the types are mixed.
        varg = {}
        first_type = problem.types[0].__class__
        all_type_same = all([isinstance(t, first_type) for t in problem.types])
        if not all_type_same:
            varg["variator"] = CompoundOperator(SBX(), HUX(), PM(), BitFlip())

        algorithm = NSGAII(
            problem,
            population_size=self.population_size,
            **varg,
        )
        try:
            algorithm.run(adjusted_max_evals, callback=time_check_callback)
        except MaxBudgetExceededException as e:
            logger.warning(
                f"Max optimization time budget exceeded. Optimization exited prematurely.\n{e}"
            )

        solutions = nondominated(algorithm.result)
        moo_solutions = []
        for solution in solutions:
            vars = []
            for pnum in range(parameter_num):
                vars.append(problem.types[pnum].decode(solution.variables[pnum]))
            vars_dict = self.param_to_dict(
                vars,
                self.model_helper.param_choices,
                self.model_helper.param_categories,
                self.model_helper.param_type,
            )
            moo_solutions.append(self.Soln(vars_dict, solution.objectives))
            logger.info(f"{vars}, {solution.objectives}")
        self.moo_solutions = moo_solutions

        # Re-train one estimator per pareto-optimal solution on the full data.
        pareto_models = []
        for solution in self.moo_solutions:
            est = self.model_helper.create_instance(solution.variables)
            est_trained = est.fit(X, y)
            pareto_models.append((solution.variables, est_trained))
        self.pareto_models = pareto_models
        return self

    def get_pareto_solutions(self):
        return self.moo_solutions

    def get_pareto_models(self):
        return self.pareto_models

    # Predict using the first pareto-optimal estimator, unless a
    # pipeline_name is passed in kwargs.
    def predict(self, X, **kwargs):
        if "pipeline_name" in kwargs:
            pname = kwargs["pipeline_name"]
            pipeline = self.get_pipeline(pipeline_name=pname)
            del kwargs["pipeline_name"]
        else:
            pipeline = self.get_pipeline()
        return pipeline.predict(X, **kwargs)

    def get_pipeline(self, pipeline_name=None, astype="lale"):
        """Retrieve one of the pareto-optimal pipelines.

        Parameters
        ----------
        pipeline_name : union type, default None

            - string
                Key (name) from the table returned by summary(),
                return a trained pipeline.

            - None
                When not specified, return the first (trained) pipeline
                in the table returned by summary().

        astype : 'lale' or 'sklearn', default 'lale'
            Type of resulting pipeline.

        Returns
        -------
        result : Trained operator."""
        id = 0
        if pipeline_name is not None:
            id = int(pipeline_name[1:])
        assert 0 < len(self.pareto_models), "No pipelines found"
        assert id < len(self.pareto_models), "Invalid pipeline name"
        vars, pareto_model = self.pareto_models[id]
        result = pareto_model
        if astype == "lale":
            return result
        assert astype == "sklearn", "Invalid astype " + astype
        if hasattr(result, "export_to_sklearn_pipeline"):
            result = result.export_to_sklearn_pipeline()
        else:
            logger.warning("Cannot return sklearn pipeline.")
        return result

    def summary(self):
        """Table displaying the pareto-optimal solutions (pipelines)
        obtained after multi-objective optimization
        (name, ID, loss for each specified scorer).

        Returns
        -------
        result : DataFrame"""
        nsolutions = len(self.moo_solutions)
        nscoring = len(self.scoring)
        records = []
        for isol in range(nsolutions):
            record_dict = {}
            record_dict["name"] = f"p{isol}"
            record_dict["id"] = isol
            for iobj in range(nscoring):
                solution = self.moo_solutions[isol]
                record_dict[f"loss{iobj+1}"] = solution.objectives[iobj]
            records.append(record_dict)
        result = pd.DataFrame.from_records(records, index="name")
        return result
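
# ---------------------------------------------------------------------------
# Usage sketch for the NSGA-II optimizer above (illustrative only, not part
# of the implementation). It assumes this impl is exposed as a public `NSGA2`
# operator in `lale.lib.lale`, following the usual Lale wrapping convention;
# the dataset and scorer names are placeholders.
#
#   from sklearn.datasets import load_iris
#   from lale.lib.lale import NSGA2
#   from lale.lib.sklearn import LogisticRegression
#
#   X, y = load_iris(return_X_y=True)
#   opt = NSGA2(
#       estimator=LogisticRegression,      # an individual operator, not a pipeline
#       scoring=["accuracy", "f1_macro"],  # at least two scorers are required
#       cv=3,
#       max_evals=20,        # rounded down to a multiple of population_size
#       population_size=10,
#   )
#   trained = opt.fit(X, y)
#   print(trained.summary())       # one row ("p0", "p1", ...) per pareto-optimal solution
#   best = trained.get_pipeline()  # the first pareto-optimal trained model ("p0")
#   y_pred = best.predict(X)
#
# Each objective is reported as a loss, best_score[i] - score[i], so with the
# default best_score of 0.0 a higher score appears as a more negative loss.
# ---------------------------------------------------------------------------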
class HyperoptImpl:
    def __init__(
        self,
        estimator=None,
        max_evals=50,
        frac_evals_with_defaults=0,
        algo='tpe',
        cv=5,
        handle_cv_failure=False,
        scoring='accuracy',
        best_score=0.0,
        max_opt_time=None,
        max_eval_time=None,
        pgo: Optional[PGO] = None,
        show_progressbar=True,
        args_to_scorer=None,
        verbose=False,
    ):
        self.max_evals = max_evals
        if estimator is None:
            self.estimator = LogisticRegression()
        else:
            self.estimator = estimator
        if frac_evals_with_defaults > 0:
            self.evals_with_defaults = int(frac_evals_with_defaults * max_evals)
        else:
            self.evals_with_defaults = 0
        self.algo = algo
        self.scoring = scoring
        self.best_score = best_score
        self.handle_cv_failure = handle_cv_failure
        self.cv = cv
        self._trials = hyperopt.Trials()
        self._default_trials = hyperopt.Trials()
        self.max_opt_time = max_opt_time
        self.max_eval_time = max_eval_time
        self.pgo = pgo
        self.show_progressbar = show_progressbar
        if args_to_scorer is not None:
            self.args_to_scorer = args_to_scorer
        else:
            self.args_to_scorer = {}
        self.verbose = verbose

    def fit(self, X_train, y_train):
        opt_start_time = time.time()
        is_clf = self.estimator.is_classifier()
        self.cv = check_cv(self.cv, y=y_train, classifier=is_clf)
        data_schema = lale.helpers.fold_schema(X_train, y_train, self.cv, is_clf)
        self.search_space = hyperopt.hp.choice(
            'meta_model',
            [hyperopt_search_space(self.estimator, pgo=self.pgo, data_schema=data_schema)],
        )
        # Create a search space with default hyperparameters for all trainable
        # parts of the pipeline. This search space is used for
        # `frac_evals_with_defaults` fraction of the total trials.
        try:
            self.search_space_with_defaults = hyperopt.hp.choice(
                'meta_model',
                [
                    hyperopt_search_space(
                        self.estimator.freeze_trainable(),
                        pgo=self.pgo,
                        data_schema=data_schema,
                    )
                ],
            )
        except Exception:
            logger.warning(
                "Exception caught during generation of default search space, "
                "setting frac_evals_with_defaults to zero."
            )
            self.evals_with_defaults = 0

        def hyperopt_train_test(params, X_train, y_train):
            warnings.filterwarnings("ignore")
            trainable = create_instance_from_hyperopt_search_space(self.estimator, params)
            try:
                cv_score, logloss, execution_time = cross_val_score_track_trials(
                    trainable,
                    X_train,
                    y_train,
                    cv=self.cv,
                    scoring=self.scoring,
                    args_to_scorer=self.args_to_scorer,
                )
                logger.debug(
                    "Successful trial of hyperopt with hyperparameters:{}".format(params)
                )
            except BaseException as e:
                # If there is any error in cross validation, use the score based
                # on a random train-test split as the evaluation criterion.
                if self.handle_cv_failure:
                    X_train_part, X_validation, y_train_part, y_validation = train_test_split(
                        X_train, y_train, test_size=0.20
                    )
                    start = time.time()
                    trained = trainable.fit(X_train_part, y_train_part)
                    scorer = check_scoring(trainable, scoring=self.scoring)
                    cv_score = scorer(trained, X_validation, y_validation, **self.args_to_scorer)
                    execution_time = time.time() - start
                    y_pred_proba = trained.predict_proba(X_validation)
                    try:
                        logloss = log_loss(y_true=y_validation, y_pred=y_pred_proba)
                    except BaseException:
                        logloss = 0
                        logger.debug("Warning, log loss cannot be computed")
                else:
                    logger.debug(e)
                    logger.debug("Error {} with pipeline:{}".format(e, trainable.to_json()))
                    raise e
            return cv_score, logloss, execution_time

        def merge_trials(trials1, trials2):
            # Re-index the tids of trials2 past the end of trials1 so the two
            # trial logs can be merged without collisions.
            max_tid = max([trial['tid'] for trial in trials1.trials])
            for trial in trials2:
                tid = trial['tid'] + max_tid + 1
                hyperopt_trial = hyperopt.Trials().new_trial_docs(
                    tids=[None], specs=[None], results=[None], miscs=[None]
                )
                hyperopt_trial[0] = trial
                hyperopt_trial[0]['tid'] = tid
                hyperopt_trial[0]['misc']['tid'] = tid
                for key in hyperopt_trial[0]['misc']['idxs'].keys():
                    hyperopt_trial[0]['misc']['idxs'][key] = [tid]
                trials1.insert_trial_docs(hyperopt_trial)
                trials1.refresh()
            return trials1

        def proc_train_test(params, X_train, y_train, return_dict):
            return_dict['params'] = copy.deepcopy(params)
            try:
                score, logloss, execution_time = hyperopt_train_test(
                    params, X_train=X_train, y_train=y_train
                )
                return_dict['loss'] = self.best_score - score
                return_dict['time'] = execution_time
                return_dict['log_loss'] = logloss
                return_dict['status'] = hyperopt.STATUS_OK
            except BaseException as e:
                logger.warning(
                    f"Exception caught in Hyperopt:{type(e)}, {traceback.format_exc()} "
                    f"with hyperparams: {params}, setting status to FAIL"
                )
                return_dict['status'] = hyperopt.STATUS_FAIL
                return_dict['error_msg'] = (
                    f"Exception caught in Hyperopt:{type(e)}, {traceback.format_exc()} "
                    f"with hyperparams: {params}"
                )
                if self.verbose:
                    print(return_dict['error_msg'])

        def get_final_trained_estimator(params, X_train, y_train):
            warnings.filterwarnings("ignore")
            trainable = create_instance_from_hyperopt_search_space(self.estimator, params)
            trained = trainable.fit(X_train, y_train)
            return trained

        def f(params):
            current_time = time.time()
            if (self.max_opt_time is not None) and (
                (current_time - opt_start_time) > self.max_opt_time
            ):
                # If a max optimization time is set and we have crossed it,
                # exit optimization completely.
                sys.exit(0)
            if self.max_eval_time:
                # Run hyperopt in a subprocess that can be interrupted
                manager = multiprocessing.Manager()
                proc_dict = manager.dict()
                p = multiprocessing.Process(
                    target=proc_train_test, args=(params, X_train, y_train, proc_dict)
                )
                p.start()
                p.join(self.max_eval_time)
                if p.is_alive():
                    p.terminate()
                    p.join()
                    logger.warning(
                        f"Maximum allotted evaluation time exceeded with hyperparams: {params}, "
                        "setting status to FAIL"
                    )
                    proc_dict['status'] = hyperopt.STATUS_FAIL
                if 'status' not in proc_dict:
                    logger.warning("Corrupted results, setting status to FAIL")
                    proc_dict['status'] = hyperopt.STATUS_FAIL
            else:
                proc_dict = {}
                proc_train_test(params, X_train, y_train, proc_dict)
            return proc_dict

        algo = getattr(hyperopt, self.algo)
        # Search in the search space with defaults
        if self.evals_with_defaults > 0:
            try:
                hyperopt.fmin(
                    f,
                    self.search_space_with_defaults,
                    algo=algo.suggest,
                    max_evals=self.evals_with_defaults,
                    trials=self._default_trials,
                    rstate=np.random.RandomState(SEED),
                    show_progressbar=self.show_progressbar,
                )
            except SystemExit:
                logger.warning(
                    'Maximum allotted optimization time exceeded. Optimization exited prematurely'
                )
            except AllTrialsFailed:
                self._best_estimator = None
                if hyperopt.STATUS_OK not in self._trials.statuses():
                    raise ValueError('Error from hyperopt, none of the trials succeeded.')

        try:
            hyperopt.fmin(
                f,
                self.search_space,
                algo=algo.suggest,
                max_evals=self.max_evals - self.evals_with_defaults,
                trials=self._trials,
                rstate=np.random.RandomState(SEED),
                show_progressbar=self.show_progressbar,
            )
        except SystemExit:
            logger.warning(
                'Maximum allotted optimization time exceeded. Optimization exited prematurely'
            )
        except AllTrialsFailed:
            self._best_estimator = None
            if hyperopt.STATUS_OK not in self._trials.statuses():
                raise ValueError('Error from hyperopt, none of the trials succeeded.')

        self._trials = merge_trials(self._trials, self._default_trials)
        try:
            best_trial = self._trials.best_trial
            val_loss = self._trials.best_trial['result']['loss']
            if len(self._default_trials) > 0:
                default_val_loss = self._default_trials.best_trial['result']['loss']
                if default_val_loss < val_loss:
                    best_trial = self._default_trials.best_trial
            best_params = best_trial['result']['params']
            logger.info(
                'best score: {:.1%}\nbest hyperparams found using {} hyperopt trials: {}'.format(
                    self.best_score - self._trials.average_best_error(),
                    self.max_evals,
                    best_params,
                )
            )
            trained = get_final_trained_estimator(best_params, X_train, y_train)
            self._best_estimator = trained
        except BaseException as e:
            logger.warning(
                'Unable to extract the best parameters from optimization, the error: {}'.format(e)
            )
            self._best_estimator = None

        return self

    def predict(self, X_eval):
        import warnings

        warnings.filterwarnings("ignore")
        if self._best_estimator is None:
            raise ValueError(
                "Cannot predict because the best estimator is None: either "
                "`predict` was called before `fit`, or all trials during `fit` failed."
            )
        trained = self._best_estimator
        try:
            predictions = trained.predict(X_eval)
        except ValueError as e:
            logger.warning(
                'ValueError in predicting using Hyperopt:{}, the error is:{}'.format(trained, e)
            )
            predictions = None
        return predictions

    def summary(self):
        """Table summarizing the trial results (ID, loss, time, log_loss, status).

        Returns
        -------
        result : DataFrame"""
        def make_record(trial_dict):
            return {
                'name': f'p{trial_dict["tid"]}',
                'tid': trial_dict['tid'],
                'loss': trial_dict['result'].get('loss', float('nan')),
                'time': trial_dict['result'].get('time', float('nan')),
                'log_loss': trial_dict['result'].get('log_loss', float('nan')),
                'status': trial_dict['result']['status'],
            }

        records = [make_record(td) for td in self._trials.trials]
        result = pd.DataFrame.from_records(records, index='name')
        return result

    def get_pipeline(self, pipeline_name=None, astype='lale'):
        """Retrieve one of the trials.

        Parameters
        ----------
        pipeline_name : union type, default None

            - string
                Key for table returned by summary(), return a trainable pipeline.

            - None
                When not specified, return the best trained pipeline found.

        astype : 'lale' or 'sklearn', default 'lale'
            Type of resulting pipeline.

        Returns
        -------
        result : Trained operator if best, trainable operator otherwise.
        """
        if pipeline_name is None:
            result = getattr(self, '_best_estimator', None)
        else:
            tid = int(pipeline_name[1:])
            params = self._trials.trials[tid]['result']['params']
            result = create_instance_from_hyperopt_search_space(self.estimator, params)
        if result is None or astype == 'lale':
            return result
        assert astype == 'sklearn', astype
        return result.export_to_sklearn_pipeline()
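
# ---------------------------------------------------------------------------
# Usage sketch for HyperoptImpl (illustrative only, not part of the
# implementation). It assumes the impl is exposed as a public `Hyperopt`
# operator in `lale.lib.lale`, following the usual Lale wrapping convention;
# the dataset and parameter values are placeholders.
#
#   from sklearn.datasets import load_iris
#   from lale.lib.lale import Hyperopt
#   from lale.lib.sklearn import PCA, LogisticRegression
#
#   X, y = load_iris(return_X_y=True)
#   planned = PCA >> LogisticRegression
#   opt = Hyperopt(
#       estimator=planned,
#       max_evals=25,
#       frac_evals_with_defaults=0.2,  # first 5 trials use default hyperparameters
#       max_eval_time=60,              # per-trial budget in seconds, enforced via subprocess
#       handle_cv_failure=True,        # fall back to a holdout split if CV fails
#   )
#   trained = opt.fit(X, y)
#   print(trained.summary())  # per-trial loss, time, log_loss, and status
# ---------------------------------------------------------------------------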
class _SMACImpl:
    def __init__(
        self,
        estimator=None,
        max_evals=50,
        cv=5,
        handle_cv_failure=False,
        scoring=None,
        best_score=0.0,
        max_opt_time=None,
        lale_num_grids=None,
    ):
        assert smac_installed, (
            "Your Python environment does not have smac installed. "
            "You can install it with `pip install 'smac<=0.10.0'` "
            "or with `pip install 'lale[full]'`."
        )
        self.max_evals = max_evals
        if estimator is None:
            self.estimator = LogisticRegression()
        else:
            self.estimator = estimator
        self.scoring = scoring
        if self.scoring is None:
            is_clf = self.estimator.is_classifier()
            if is_clf:
                self.scoring = "accuracy"
            else:
                self.scoring = "r2"
        self.best_score = best_score
        self.handle_cv_failure = handle_cv_failure
        self.cv = cv
        self.max_opt_time = max_opt_time
        self.lale_num_grids = lale_num_grids
        self.trials = None

    def fit(self, X_train, y_train):
        data_schema = lale.helpers.fold_schema(
            X_train, y_train, self.cv, self.estimator.is_classifier()
        )
        self.search_space: ConfigurationSpace = get_smac_space(
            self.estimator, lale_num_grids=self.lale_num_grids, data_schema=data_schema
        )

        # Scenario object
        scenario_options = {
            "run_obj": "quality",  # optimize quality (alternatively runtime)
            "runcount-limit": self.max_evals,  # maximum function evaluations
            "cs": self.search_space,  # configuration space
            "deterministic": "true",
            "abort_on_first_run_crash": False,
        }
        if self.max_opt_time is not None:
            scenario_options["wallclock_limit"] = self.max_opt_time
        self.scenario = Scenario(scenario_options)

        self.cv = check_cv(
            self.cv, y=y_train, classifier=self.estimator.is_classifier()
        )

        def smac_train_test(trainable, X_train, y_train):
            try:
                cv_score, logloss, execution_time = cross_val_score_track_trials(
                    trainable, X_train, y_train, cv=self.cv, scoring=self.scoring
                )
                logger.debug("Successful trial of SMAC")
            except BaseException as e:
                # If there is any error in cross validation, use the score based
                # on a random train-test split as the evaluation criterion.
                if self.handle_cv_failure:
                    (
                        X_train_part,
                        X_validation,
                        y_train_part,
                        y_validation,
                    ) = train_test_split(X_train, y_train, test_size=0.20)
                    start = time.time()
                    trained = trainable.fit(X_train_part, y_train_part)
                    scorer = check_scoring(trainable, scoring=self.scoring)
                    cv_score = scorer(trained, X_validation, y_validation)
                    execution_time = time.time() - start
                    y_pred_proba = trained.predict_proba(X_validation)
                    try:
                        logloss = log_loss(y_true=y_validation, y_pred=y_pred_proba)
                    except BaseException:
                        logloss = 0
                        logger.debug("Warning, log loss cannot be computed")
                else:
                    logger.debug(
                        "Error {} with pipeline:{}".format(e, trainable.to_json())
                    )
                    raise e
            return cv_score, logloss, execution_time

        def f(trainable):
            return_dict = {}
            try:
                score, logloss, execution_time = smac_train_test(
                    trainable, X_train=X_train, y_train=y_train
                )
                return_dict = {
                    "loss": self.best_score - score,
                    "time": execution_time,
                    "log_loss": logloss,
                }
            except BaseException as e:
                logger.warning(
                    f"Exception caught in SMACCV:{type(e)}, {traceback.format_exc()}, SMAC will set a cost_for_crash to MAXINT."
                )
                raise e
            return return_dict["loss"]

        try:
            smac = orig_SMAC(
                scenario=self.scenario,
                rng=np.random.RandomState(42),
                tae_runner=lale_op_smac_tae(self.estimator, f),
            )
            incumbent = smac.optimize()
            self.trials = smac.get_runhistory()
            # Get the trainable corresponding to the best params and train it
            # on the entire training dataset.
            trainable = lale_trainable_op_from_config(self.estimator, incumbent)
            trained = trainable.fit(X_train, y_train)
            self._best_estimator = trained
        except BudgetExhaustedException:
            logger.warning(
                "Maximum allotted optimization time exceeded. Optimization exited prematurely"
            )
            # No incumbent was trained, so leave the best estimator unset.
            self._best_estimator = None
        except BaseException as e:
            logger.warning("Error during optimization: {}".format(e))
            self._best_estimator = None
        return self

    def predict(self, X_eval):
        import warnings

        warnings.filterwarnings("ignore")
        trained = self._best_estimator
        if trained is None:
            logger.warning(
                "Could not get a trained best estimator to predict with SMACCV; "
                "`fit` either failed or was not called."
            )
            return None
        try:
            predictions = trained.predict(X_eval)
        except ValueError as e:
            logger.warning(
                "ValueError in predicting using SMACCV:{}, the error is:{}".format(
                    trained, e
                )
            )
            predictions = None
        return predictions

    def get_trials(self):
        """Returns the trials, i.e., the RunHistory object.

        Returns
        -------
        smac.runhistory.runhistory.RunHistory
            RunHistory of all the trials executed during the optimization,
            i.e., during the fit method of SMACCV.
        """
        return self.trials

    def get_pipeline(self, pipeline_name=None, astype="lale"):
        if pipeline_name is not None:
            raise NotImplementedError("Cannot get pipeline by name yet.")
        result = getattr(self, "_best_estimator", None)
        if result is None or astype == "lale":
            return result
        assert astype == "sklearn", astype
        # TODO: should this try and return an actual sklearn pipeline?
        return result
class _HyperoptImpl:
    def __init__(
        self,
        *,
        estimator=None,
        scoring=None,
        best_score=0.0,
        args_to_scorer=None,
        cv=5,
        handle_cv_failure=False,
        verbose=False,
        show_progressbar=True,
        algo="tpe",
        max_evals=50,
        frac_evals_with_defaults=0,
        max_opt_time=None,
        max_eval_time=None,
        pgo: Optional[PGO] = None,
    ):
        self.max_evals = max_evals
        if estimator is None:
            self.estimator = LogisticRegression()
        else:
            self.estimator = estimator
        if frac_evals_with_defaults > 0:
            self.evals_with_defaults = int(frac_evals_with_defaults * max_evals)
        else:
            self.evals_with_defaults = 0
        self.algo = algo
        self.scoring = scoring
        if self.scoring is None:
            is_clf = self.estimator.is_classifier()
            if is_clf:
                self.scoring = "accuracy"
            else:
                self.scoring = "r2"
        self.best_score = best_score
        self.handle_cv_failure = handle_cv_failure
        self.cv = cv
        self._trials = hyperopt.Trials()
        self._default_trials = hyperopt.Trials()
        self.max_opt_time = max_opt_time
        self.max_eval_time = max_eval_time
        self.pgo = pgo
        self.show_progressbar = show_progressbar
        if args_to_scorer is not None:
            self.args_to_scorer = args_to_scorer
        else:
            self.args_to_scorer = {}
        self.verbose = verbose

    def _summarize_statuses(self):
        status_list = self._trials.statuses()
        status_hist = {}
        for status in status_list:
            status_hist[status] = 1 + status_hist.get(status, 0)
        if hyperopt.STATUS_FAIL in status_hist:
            print(
                f"{status_hist[hyperopt.STATUS_FAIL]} out of {len(status_list)} trials failed, call summary() for details."
            )
            if not self.verbose:
                print("Run with verbose=True to see per-trial exceptions.")

    def fit(self, X_train, y_train, **fit_params):
        opt_start_time = time.time()
        is_clf = self.estimator.is_classifier()
        self.cv = check_cv(self.cv, y=y_train, classifier=is_clf)
        data_schema = lale.helpers.fold_schema(X_train, y_train, self.cv, is_clf)
        self.search_space = hyperopt.hp.choice(
            "meta_model",
            [
                hyperopt_search_space(
                    self.estimator, pgo=self.pgo, data_schema=data_schema
                )
            ],
        )
        # Create a search space with default hyperparameters for all trainable
        # parts of the pipeline. This search space is used for
        # `frac_evals_with_defaults` fraction of the total trials.
        try:
            self.search_space_with_defaults = hyperopt.hp.choice(
                "meta_model",
                [
                    hyperopt_search_space(
                        self.estimator.freeze_trainable(),
                        pgo=self.pgo,
                        data_schema=data_schema,
                    )
                ],
            )
        except Exception:
            logger.warning(
                "Exception caught during generation of default search space, "
                "setting frac_evals_with_defaults to zero."
            )
            self.evals_with_defaults = 0

        def hyperopt_train_test(params, X_train, y_train):
            warnings.filterwarnings("ignore")
            trainable = create_instance_from_hyperopt_search_space(
                self.estimator, params
            )
            try:
                cv_score, logloss, execution_time = cross_val_score_track_trials(
                    trainable,
                    X_train,
                    y_train,
                    cv=self.cv,
                    scoring=self.scoring,
                    args_to_scorer=self.args_to_scorer,
                )
                logger.debug(
                    "Successful trial of hyperopt with hyperparameters:{}".format(
                        params
                    )
                )
            except BaseException as e:
                # If there is any error in cross validation, use the score based
                # on a random train-test split as the evaluation criterion.
                if self.handle_cv_failure and trainable is not None:
                    (
                        X_train_part,
                        X_validation,
                        y_train_part,
                        y_validation,
                    ) = train_test_split(X_train, y_train, test_size=0.20)
                    start = time.time()
                    trained = trainable.fit(X_train_part, y_train_part, **fit_params)
                    scorer = check_scoring(trainable, scoring=self.scoring)
                    cv_score = scorer(
                        trained, X_validation, y_validation, **self.args_to_scorer
                    )
                    execution_time = time.time() - start
                    y_pred_proba = trained.predict_proba(X_validation)
                    try:
                        logloss = log_loss(y_true=y_validation, y_pred=y_pred_proba)
                    except BaseException:
                        logloss = 0
                        logger.debug("Warning, log loss cannot be computed")
                else:
                    logger.debug(e)
                    if trainable is None:
                        logger.debug(
                            "Error {} with uncreatable pipeline with parameters:{}".format(
                                e, lale.pretty_print.hyperparams_to_string(params)
                            )
                        )
                    else:
                        logger.debug(
                            "Error {} with pipeline:{}".format(e, trainable.to_json())
                        )
                    raise e
            return cv_score, logloss, execution_time

        def merge_trials(trials1, trials2):
            # Re-index the tids of trials2 past the end of trials1 so the two
            # trial logs can be merged without collisions.
            max_tid = max([trial["tid"] for trial in trials1.trials])
            for trial in trials2:
                tid = trial["tid"] + max_tid + 1
                hyperopt_trial = hyperopt.Trials().new_trial_docs(
                    tids=[None], specs=[None], results=[None], miscs=[None]
                )
                hyperopt_trial[0] = trial
                hyperopt_trial[0]["tid"] = tid
                hyperopt_trial[0]["misc"]["tid"] = tid
                for key in hyperopt_trial[0]["misc"]["idxs"].keys():
                    hyperopt_trial[0]["misc"]["idxs"][key] = [tid]
                trials1.insert_trial_docs(hyperopt_trial)
                trials1.refresh()
            return trials1

        def proc_train_test(params, X_train, y_train, return_dict):
            return_dict["params"] = copy.deepcopy(params)
            try:
                score, logloss, execution_time = hyperopt_train_test(
                    params, X_train=X_train, y_train=y_train
                )
                return_dict["loss"] = self.best_score - score
                return_dict["time"] = execution_time
                return_dict["log_loss"] = logloss
                return_dict["status"] = hyperopt.STATUS_OK
            except BaseException as e:
                exception_type = f"{type(e).__module__}.{type(e).__name__}"
                try:
                    trainable = create_instance_from_hyperopt_search_space(
                        self.estimator, params
                    )
                    if trainable is None:
                        trial_info = f"hyperparams: {params}"
                    else:
                        trial_info = (
                            f'pipeline: """{trainable.pretty_print(show_imports=False)}"""'
                        )
                except BaseException:
                    trial_info = f"hyperparams: {params}"
                error_msg = (
                    f"Exception caught in Hyperopt: {exception_type}, "
                    f"{traceback.format_exc()}with {trial_info}"
                )
                logger.warning(error_msg + ", setting status to FAIL")
                return_dict["status"] = hyperopt.STATUS_FAIL
                return_dict["error_msg"] = error_msg
                if self.verbose:
                    print(return_dict["error_msg"])

        def get_final_trained_estimator(params, X_train, y_train):
            warnings.filterwarnings("ignore")
            trainable = create_instance_from_hyperopt_search_space(
                self.estimator, params
            )
            if trainable is None:
                return None
            else:
                trained = trainable.fit(X_train, y_train, **fit_params)
                return trained

        def f(params):
            current_time = time.time()
            if (self.max_opt_time is not None) and (
                (current_time - opt_start_time) > self.max_opt_time
            ):
                # If a max optimization time is set and we have crossed it,
                # exit optimization completely.
                sys.exit(0)
            if self.max_eval_time:
                # Run hyperopt in a subprocess that can be interrupted
                manager = multiprocessing.Manager()
                proc_dict: Dict[str, Any] = manager.dict()
                p = multiprocessing.Process(
                    target=proc_train_test, args=(params, X_train, y_train, proc_dict)
                )
                p.start()
                p.join(self.max_eval_time)
                if p.is_alive():
                    p.terminate()
                    p.join()
                    logger.warning(
                        f"Maximum allotted evaluation time exceeded with hyperparams: {params}, "
                        "setting status to FAIL"
                    )
                    proc_dict["status"] = hyperopt.STATUS_FAIL
                if "status" not in proc_dict:
                    logger.warning("Corrupted results, setting status to FAIL")
                    proc_dict["status"] = hyperopt.STATUS_FAIL
            else:
                proc_dict = {}
                proc_train_test(params, X_train, y_train, proc_dict)
            return proc_dict

        algo = getattr(hyperopt, self.algo)
        # Search in the search space with defaults
        if self.evals_with_defaults > 0:
            try:
                hyperopt.fmin(
                    f,
                    self.search_space_with_defaults,
                    algo=algo.suggest,
                    max_evals=self.evals_with_defaults,
                    trials=self._default_trials,
                    rstate=np.random.RandomState(SEED),
                    show_progressbar=self.show_progressbar,
                )
            except SystemExit:
                logger.warning(
                    "Maximum allotted optimization time exceeded. Optimization exited prematurely"
                )
            except AllTrialsFailed:
                self._best_estimator = None
                if hyperopt.STATUS_OK not in self._trials.statuses():
                    raise ValueError(
                        "Error from hyperopt, none of the trials succeeded."
                    )

        try:
            hyperopt.fmin(
                f,
                self.search_space,
                algo=algo.suggest,
                max_evals=self.max_evals - self.evals_with_defaults,
                trials=self._trials,
                rstate=np.random.RandomState(SEED),
                show_progressbar=self.show_progressbar,
            )
        except SystemExit:
            logger.warning(
                "Maximum allotted optimization time exceeded. Optimization exited prematurely"
            )
        except AllTrialsFailed:
            self._best_estimator = None
            if hyperopt.STATUS_OK not in self._trials.statuses():
                self._summarize_statuses()
                raise ValueError("Error from hyperopt, none of the trials succeeded.")

        self._trials = merge_trials(self._trials, self._default_trials)
        if self.show_progressbar:
            self._summarize_statuses()
        try:
            best_trial = self._trials.best_trial
            val_loss = self._trials.best_trial["result"]["loss"]
            if len(self._default_trials) > 0:
                default_val_loss = self._default_trials.best_trial["result"]["loss"]
                if default_val_loss < val_loss:
                    best_trial = self._default_trials.best_trial
            best_params = best_trial["result"]["params"]
            logger.info(
                "best score: {:.1%}\nbest hyperparams found using {} hyperopt trials: {}".format(
                    self.best_score - self._trials.average_best_error(),
                    self.max_evals,
                    best_params,
                )
            )
            trained = get_final_trained_estimator(best_params, X_train, y_train)
            self._best_estimator = trained
        except BaseException as e:
            logger.warning(
                "Unable to extract the best parameters from optimization, the error: {}".format(
                    e
                )
            )
            self._best_estimator = None

        return self

    def predict(self, X_eval, **predict_params):
        import warnings

        warnings.filterwarnings("ignore")
        if self._best_estimator is None:
            raise ValueError(
                "Cannot predict because the best estimator is None: either "
                "`predict` was called before `fit`, or all trials during `fit` failed."
            )
        trained = self._best_estimator
        try:
            predictions = trained.predict(X_eval, **predict_params)
        except ValueError as e:
            logger.warning(
                "ValueError in predicting using Hyperopt:{}, the error is:{}".format(
                    trained, e
                )
            )
            predictions = None
        return predictions

    def summary(self):
        """Table summarizing the trial results (ID, loss, time, log_loss, status).

        Returns
        -------
        result : DataFrame"""

        def make_record(trial_dict):
            return {
                "name": f'p{trial_dict["tid"]}',
                "tid": trial_dict["tid"],
                "loss": trial_dict["result"].get("loss", float("nan")),
                "time": trial_dict["result"].get("time", float("nan")),
                "log_loss": trial_dict["result"].get("log_loss", float("nan")),
                "status": trial_dict["result"]["status"],
            }

        records = [make_record(td) for td in self._trials.trials]
        result = pd.DataFrame.from_records(records, index="name")
        return result

    def get_pipeline(self, pipeline_name=None, astype="lale"):
        """Retrieve one of the trials.

        Parameters
        ----------
        pipeline_name : union type, default None

            - string
                Key for table returned by summary(), return a trainable pipeline.

            - None
                When not specified, return the best trained pipeline found.

        astype : 'lale' or 'sklearn', default 'lale'
            Type of resulting pipeline.

        Returns
        -------
        result : Trained operator if best, trainable operator otherwise."""
        best_name = None
        if self._best_estimator is not None:
            best_name = f'p{self._trials.best_trial["tid"]}'
        if pipeline_name is None:
            pipeline_name = best_name
        if pipeline_name == best_name:
            result = getattr(self, "_best_estimator", None)
        else:
            assert pipeline_name is not None
            tid = int(pipeline_name[1:])
            params = self._trials.trials[tid]["result"]["params"]
            result = create_instance_from_hyperopt_search_space(self.estimator, params)
        if result is None or astype == "lale":
            return result
        assert astype == "sklearn", astype
        return result.export_to_sklearn_pipeline()
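
# ---------------------------------------------------------------------------
# Usage sketch for _HyperoptImpl (illustrative only, not part of the
# implementation), focusing on inspecting and retrieving individual trials.
# It assumes the impl is exposed as the public `Hyperopt` operator in
# `lale.lib.lale`; the dataset and trial names are placeholders.
#
#   from sklearn.datasets import load_iris
#   from lale.lib.lale import Hyperopt
#   from lale.lib.sklearn import LogisticRegression
#
#   X, y = load_iris(return_X_y=True)
#   trained = Hyperopt(
#       estimator=LogisticRegression, max_evals=10, verbose=True
#   ).fit(X, y)
#   df = trained.summary()                # indexed by name: p0, p1, ...
#   print(df.sort_values("loss").head())  # loss = best_score - score, lower is better
#   best = trained.get_pipeline()         # trained best pipeline
#   p3 = trained.get_pipeline("p3")       # trainable pipeline for the trial with tid 3
#   sk = trained.get_pipeline(astype="sklearn")  # exported sklearn pipeline
# ---------------------------------------------------------------------------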