Example #1
class _NSGA2Impl:
    def __init__(
        self,
        estimator=None,
        scoring=None,
        best_score=0.0,
        cv=5,
        max_evals=50,
        max_opt_time=None,
        population_size=10,
        random_seed=42,
    ):
        if estimator is None:
            self.model = LogisticRegression()
        else:
            self.model = estimator

        assert isinstance(self.model, lale.operators.IndividualOp), (
            "Multi-objective optimization is currently supported only for "
            "Individual Operators, not for Pipelines.")
        logger.info(
            f"Optimizing model {self.model} with type {type(self.model)}")
        logger.info("Lale param ranges - \n"
                    f"{self.model.get_param_ranges()}")
        self.model_helper = _ModelHelper(self.model)
        self.moo_solutions = []

        self.scoring = scoring
        assert self.scoring is not None, "scoring parameter not specified."
        assert len(
            self.scoring) >= 2, "Fewer than two scorers specified in scoring"

        if isinstance(best_score, list):
            if len(best_score) < len(scoring):
                best_score.extend([0.0] * (len(scoring) - len(best_score)))
            self.best_score = best_score
        else:
            self.best_score = [best_score] * len(scoring)

        self.cv = cv
        self.max_evals = max_evals
        self.max_opt_time = max_opt_time
        self.population_size = population_size
        self.random_seed = random_seed

    @classmethod
    def validate_hyperparams(cls, scoring=None, best_score=0, **hyperparams):
        check_scoring_best_score_constraint(scoring, best_score)

    # Internal class
    class Soln(object):
        def __init__(self, variables, objectives):
            self.variables = variables
            self.objectives = objectives

    # Convert a flat parameter list (as produced by the optimizer) to a
    # dictionary keyed by hyperparameter name.
    def param_to_dict(self, parameter, param_choices, param_categories,
                      param_type):
        temp = {}
        for i, key in enumerate(param_choices):
            if key not in param_categories:  # non-categorical parameter
                if param_type[key] == "boolean":
                    temp[key] = parameter[i][0]
                else:
                    temp[key] = parameter[i]
            else:
                temp[key] = param_categories[key][parameter[i]]
        return temp

    def fit(self, X, y):

        opt_start_time = time.time()
        kfold = None
        if isinstance(self.cv, int) and self.cv == 1:
            X_train, X_val, y_train, y_val = train_test_split(
                X, y, test_size=0.2, random_state=self.random_seed, stratify=y)
            logger.info("Not using Cross-Validation. "
                        "Performing single train/test split")
        else:
            is_clf = self.model.is_classifier()
            kfold = check_cv(self.cv, y=y, classifier=is_clf)
            # kfold = StratifiedKFold(
            #    n_splits=self.cv, random_state=self.random_seed, shuffle=True
            # )
            logger.info(f"Using Cross-Validation - {kfold}")

        self.ind = 0

        def train_test_model(parameter):
            # First check if we exceeded allocated time budget
            current_time = time.time()
            elapsed_time = current_time - opt_start_time
            if (self.max_opt_time
                    is not None) and (elapsed_time > self.max_opt_time):
                msg = (
                    f"Max optimization time exceeded. "
                    f"Max Opt time = {self.max_opt_time}, Elapsed Time = {elapsed_time}, "
                    f"NFE Completed - {self.ind}")
                raise MaxBudgetExceededException(msg)

            self.ind = self.ind + 1
            logger.info(f"Training population {self.ind}")

            parameter = self.param_to_dict(
                parameter,
                self.model_helper.param_choices,
                self.model_helper.param_categories,
                self.model_helper.param_type,
            )

            scorers = [get_scorer(scorer) for scorer in self.scoring]
            nscorers = len(scorers)

            try:
                if kfold is None:
                    clf = self.model_helper.create_instance(parameter)
                    clf_trained = clf.fit(X_train, y_train)

                    obj_val = [
                        scorer(clf_trained, X_val, y_val) for scorer in scorers
                    ]

                else:

                    obj_scores = [[] for _ in range(nscorers)]

                    # Perform k-fold cross-validation
                    for train_index, test_index in kfold.split(X, y):
                        if isinstance(X, pd.DataFrame):
                            X_train_split, X_val_split = (
                                X.iloc[train_index],
                                X.iloc[test_index],
                            )
                            y_train_split, y_val_split = (
                                y.iloc[train_index],
                                y.iloc[test_index],
                            )
                        else:
                            X_train_split, X_val_split = X[train_index], X[
                                test_index]
                            y_train_split, y_val_split = y[train_index], y[
                                test_index]

                        clf = self.model_helper.create_instance(parameter)
                        clf_trained = clf.fit(X_train_split, y_train_split)

                        obj_score = [
                            scorer(clf_trained, X_val_split, y_val_split)
                            for scorer in scorers
                        ]
                        for i in range(nscorers):
                            obj_scores[i].append(obj_score[i])

                    # Aggregate CV score
                    obj_val = [np.mean(obj_scores[i]) for i in range(nscorers)]
                    logger.debug(f"Obj k-fold scores - {obj_scores}")

                # By default we are solving a minimization MOO problem
                fitnessValue = [
                    self.best_score[i] - obj_val[i] for i in range(nscorers)
                ]
                logger.info(f"Train fitnessValue - {fitnessValue}")

            except jsonschema.ValidationError as e:
                logger.error(f"Caught JSON schema validation error.\n{e}")
                logger.error("Setting fitness (loss) values to infinity")
                fitnessValue = [np.inf for i in range(nscorers)]
                logger.info(f"Train fitnessValue - {fitnessValue}")

            return fitnessValue

        def time_check_callback(alg):
            current_time = time.time()
            elapsed_time = current_time - opt_start_time
            logger.info(
                f"NFE Complete - {alg.nfe}, Elapsed Time - {elapsed_time}")

        parameter_num = len(self.model_helper.param_choices)
        target_num = len(self.scoring)
        # Adjust max_evals if not a multiple of population size. This is
        # required as Platypus performs evaluations in multiples of
        # population_size.
        adjusted_max_evals = (self.max_evals //
                              self.population_size) * self.population_size
        if adjusted_max_evals != self.max_evals:
            logger.info(
                f"Adjusting max_evals to {adjusted_max_evals} from specified {self.max_evals}"
            )

        problem = Problem(parameter_num, target_num)
        problem.types[:] = self.model_helper.types
        problem.function = train_test_model

        # Set the variator based on types of decision variables
        varg = {}
        first_type = problem.types[0].__class__
        all_type_same = all([isinstance(t, first_type) for t in problem.types])
        # use compound operator for mixed types
        if not all_type_same:
            varg["variator"] = CompoundOperator(SBX(), HUX(), PM(), BitFlip())

        algorithm = NSGAII(
            problem,
            population_size=self.population_size,
            **varg,
        )

        try:
            algorithm.run(adjusted_max_evals, callback=time_check_callback)
        except MaxBudgetExceededException as e:
            logger.warning(
                f"Max optimization time budget exceeded. Optimization exited prematurely.\n{e}"
            )

        solutions = nondominated(algorithm.result)
        # solutions = [s for s in algorithm.result if s.feasible]
        # solutions = algorithm.result

        moo_solutions = []
        for solution in solutions:
            vars = []
            for pnum in range(parameter_num):
                vars.append(problem.types[pnum].decode(
                    solution.variables[pnum]))

            vars_dict = self.param_to_dict(
                vars,
                self.model_helper.param_choices,
                self.model_helper.param_categories,
                self.model_helper.param_type,
            )
            moo_solutions.append(self.Soln(vars_dict, solution.objectives))
            logger.info(f"{vars}, {solution.objectives}")

        self.moo_solutions = moo_solutions

        pareto_models = []
        for solution in self.moo_solutions:
            est = self.model_helper.create_instance(solution.variables)
            est_trained = est.fit(X, y)
            pareto_models.append((solution.variables, est_trained))

        self.pareto_models = pareto_models
        return self

    def get_pareto_solutions(self):
        return self.moo_solutions

    def get_pareto_models(self):
        return self.pareto_models

    # Predict using first pareto-optimal estimator
    def predict(self, X, **kwargs):
        if "pipeline_name" in kwargs:
            pname = kwargs["pipeline_name"]
            pipeline = self.get_pipeline(pipeline_name=pname)
            del kwargs["pipeline_name"]
        else:
            pipeline = self.get_pipeline()

        return pipeline.predict(X, **kwargs)

    # Return pareto-optimal estimator
    def get_pipeline(self, pipeline_name=None, astype="lale"):
        """Retrieve one of the pareto-optimal pipelines.

        Parameters
        ----------
        pipeline_name : union type, default None

            - string
                Key (name) from the table returned by summary(), return a trained pipeline.

            - None
                When not specified, return the first (trained) pipeline in the table
                returned by summary()

        astype : 'lale' or 'sklearn', default 'lale'
            Type of resulting pipeline.

        Returns
        -------
        result : Trained operator."""

        id = 0
        if pipeline_name is not None:
            id = int(pipeline_name[1:])

        assert 0 < len(self.pareto_models), "No pipelines found"
        assert id < len(self.pareto_models), "Invalid pipeline name"
        vars, pareto_model = self.pareto_models[id]
        result = pareto_model

        if astype == "lale":
            return result

        assert astype == "sklearn", "Invalid astype " + astype
        if hasattr(result, "export_to_sklearn_pipeline"):
            result = result.export_to_sklearn_pipeline()
        else:
            logger.warning("Cannot return sklearn pipeline.")

        return result

    def summary(self):
        """Table displaying the pareto-optimal solutions (pipelines)
           obtained after multi-objective optimization
           (name, ID, loss for each specified scorer).

        Returns
        -------
        result : DataFrame"""

        nsolutions = len(self.moo_solutions)
        nscoring = len(self.scoring)

        records = []

        for isol in range(nsolutions):
            record_dict = {}
            record_dict["name"] = f"p{isol}"
            record_dict["id"] = isol
            for iobj in range(nscoring):
                solution = self.moo_solutions[isol]
                record_dict[f"loss{iobj+1}"] = solution.objectives[iobj]

            records.append(record_dict)

        result = pd.DataFrame.from_records(records, index="name")
        return result
Example #2
class HyperoptImpl:

    def __init__(self, estimator=None, max_evals=50, frac_evals_with_defaults=0, algo='tpe',
                 cv=5, handle_cv_failure=False,
                 scoring='accuracy', best_score=0.0,
                 max_opt_time=None, max_eval_time=None, pgo: Optional[PGO] = None,
                 show_progressbar=True, args_to_scorer=None, verbose=False):
        self.max_evals = max_evals
        if estimator is None:
            self.estimator = LogisticRegression()
        else:
            self.estimator = estimator
        if frac_evals_with_defaults > 0:
            self.evals_with_defaults = int(frac_evals_with_defaults*max_evals)
        else:
            self.evals_with_defaults = 0
        self.algo = algo
        self.scoring = scoring
        self.best_score = best_score
        self.handle_cv_failure = handle_cv_failure
        self.cv = cv
        self._trials = hyperopt.Trials()
        self._default_trials = hyperopt.Trials() 
        self.max_opt_time = max_opt_time
        self.max_eval_time = max_eval_time
        self.pgo = pgo
        self.show_progressbar = show_progressbar
        if args_to_scorer is not None:
            self.args_to_scorer = args_to_scorer
        else:
            self.args_to_scorer = {}
        self.verbose = verbose

    def fit(self, X_train, y_train):
        opt_start_time = time.time()
        is_clf = self.estimator.is_classifier()
        self.cv = check_cv(self.cv, y=y_train, classifier=is_clf)
        data_schema = lale.helpers.fold_schema(
            X_train, y_train, self.cv, is_clf)
        self.search_space = hyperopt.hp.choice(
            'meta_model', [hyperopt_search_space(self.estimator, pgo=self.pgo,
                                                 data_schema=data_schema)])
        # Create a search space with default hyperparameters for all trainable parts of the pipeline.
        # This search space is used for the `frac_evals_with_defaults` fraction of the total trials.
        try:
            self.search_space_with_defaults = hyperopt.hp.choice('meta_model', 
                                                [hyperopt_search_space(self.estimator.freeze_trainable(), 
                                                pgo=self.pgo, data_schema=data_schema)])
        except Exception:
            logger.warning("Exception caught during generation of default search space, setting frac_evals_with_defaults to zero.")
            self.evals_with_defaults = 0

        def hyperopt_train_test(params, X_train, y_train):
            warnings.filterwarnings("ignore")

            trainable = create_instance_from_hyperopt_search_space(self.estimator, params)
            try:
                cv_score, logloss, execution_time = cross_val_score_track_trials(trainable, X_train, y_train, cv=self.cv, scoring=self.scoring, args_to_scorer=self.args_to_scorer)
                logger.debug("Successful trial of hyperopt with hyperparameters:{}".format(params))
            except BaseException as e:
                # If there is any error in cross validation, use the score based on a random train-test split as the evaluation criterion
                if self.handle_cv_failure:
                    X_train_part, X_validation, y_train_part, y_validation = train_test_split(X_train, y_train, test_size=0.20)
                    start = time.time()
                    trained = trainable.fit(X_train_part, y_train_part)
                    scorer = check_scoring(trainable, scoring=self.scoring)
                    cv_score  = scorer(trained, X_validation, y_validation, **self.args_to_scorer)
                    execution_time = time.time() - start
                    y_pred_proba = trained.predict_proba(X_validation)
                    try:
                        logloss = log_loss(y_true=y_validation, y_pred=y_pred_proba)
                    except BaseException:
                        logloss = 0
                        logger.debug("Warning, log loss cannot be computed")
                else:
                    logger.debug(e)
                    logger.debug("Error {} with pipeline:{}".format(e, trainable.to_json()))
                    raise e
            return cv_score, logloss, execution_time

        def merge_trials(trials1, trials2):
            max_tid = max([trial['tid'] for trial in trials1.trials])

            for trial in trials2:
                tid = trial['tid'] + max_tid + 1
                hyperopt_trial = hyperopt.Trials().new_trial_docs(
                        tids=[None],
                        specs=[None],
                        results=[None],
                        miscs=[None])
                hyperopt_trial[0] = trial
                hyperopt_trial[0]['tid'] = tid
                hyperopt_trial[0]['misc']['tid'] = tid
                for key in hyperopt_trial[0]['misc']['idxs'].keys():
                    hyperopt_trial[0]['misc']['idxs'][key] = [tid]
                trials1.insert_trial_docs(hyperopt_trial) 
                trials1.refresh()
            return trials1
            
        def proc_train_test(params, X_train, y_train, return_dict):
            return_dict['params'] = copy.deepcopy(params)
            try:
                score, logloss, execution_time = hyperopt_train_test(params, X_train=X_train, y_train=y_train)
                return_dict['loss'] = self.best_score - score
                return_dict['time'] = execution_time
                return_dict['log_loss'] = logloss
                return_dict['status'] = hyperopt.STATUS_OK
            except BaseException as e:
                logger.warning(f"Exception caught in Hyperopt:{type(e)}, {traceback.format_exc()} with hyperparams: {params}, setting status to FAIL")
                return_dict['status'] = hyperopt.STATUS_FAIL
                return_dict['error_msg'] = f"Exception caught in Hyperopt:{type(e)}, {traceback.format_exc()} with hyperparams: {params}"
                if self.verbose:
                    print(return_dict['error_msg'])

        def get_final_trained_estimator(params, X_train, y_train):
            warnings.filterwarnings("ignore")
            trainable = create_instance_from_hyperopt_search_space(self.estimator, params)
            trained = trainable.fit(X_train, y_train)
            return trained

        def f(params):
            current_time = time.time()
            if (self.max_opt_time is not None) and ((current_time - opt_start_time) > self.max_opt_time) :
                # if max optimization time set, and we have crossed it, exit optimization completely
                sys.exit(0)
            if self.max_eval_time:
                # Run the trial evaluation in a subprocess that can be interrupted
                manager = multiprocessing.Manager()
                proc_dict = manager.dict()
                p = multiprocessing.Process(
                    target=proc_train_test,
                    args=(params, X_train, y_train, proc_dict))
                p.start()
                p.join(self.max_eval_time)
                if p.is_alive():
                    p.terminate()
                    p.join()
                    logger.warning(f"Maximum alloted evaluation time exceeded. with hyperparams: {params}, setting status to FAIL")
                    proc_dict['status'] = hyperopt.STATUS_FAIL
                if 'status' not in proc_dict:
                    logger.warning(f"Corrupted results, setting status to FAIL")
                    proc_dict['status'] = hyperopt.STATUS_FAIL
            else:
                proc_dict = {}
                proc_train_test(params, X_train, y_train, proc_dict)
            return proc_dict

        algo = getattr(hyperopt, self.algo)
        #Search in the search space with defaults
        if self.evals_with_defaults > 0:
            try:
                hyperopt.fmin(f, self.search_space_with_defaults, algo=algo.suggest, max_evals=self.evals_with_defaults, trials=self._default_trials, rstate=np.random.RandomState(SEED),
                show_progressbar=self.show_progressbar)
            except SystemExit:
                logger.warning('Maximum allotted optimization time exceeded. Optimization exited prematurely')
            except AllTrialsFailed:
                self._best_estimator = None
                if hyperopt.STATUS_OK not in self._trials.statuses():
                    raise ValueError('Error from hyperopt, none of the trials succeeded.')

        try:
            hyperopt.fmin(f, self.search_space, algo=algo.suggest, max_evals=self.max_evals-self.evals_with_defaults, trials=self._trials, rstate=np.random.RandomState(SEED),
            show_progressbar=self.show_progressbar)
        except SystemExit:
            logger.warning('Maximum allotted optimization time exceeded. Optimization exited prematurely')
        except AllTrialsFailed:
            self._best_estimator = None
            if hyperopt.STATUS_OK not in self._trials.statuses():
                raise ValueError('Error from hyperopt, none of the trials succeeded.')

        self._trials = merge_trials(self._trials, self._default_trials)
        try :
            best_trial = self._trials.best_trial
            val_loss = self._trials.best_trial['result']['loss']
            if len(self._default_trials) > 0:
                default_val_loss = self._default_trials.best_trial['result']['loss']
                if default_val_loss < val_loss:
                    best_trial = self._default_trials.best_trial
            best_params = best_trial['result']['params']            
            logger.info(
                'best score: {:.1%}\nbest hyperparams found using {} hyperopt trials: {}'.format(
                    self.best_score - self._trials.average_best_error(), self.max_evals, best_params
                )
            )
            trained = get_final_trained_estimator(best_params, X_train, y_train)
            self._best_estimator = trained
        except BaseException as e :
            logger.warning('Unable to extract the best parameters from optimization, the error: {}'.format(e))
            self._best_estimator = None

        return self

    def predict(self, X_eval):
        import warnings
        warnings.filterwarnings("ignore")
        if self._best_estimator is None:
            raise ValueError("Can not predict as the best estimator is None. Either an attempt to call `predict` "
        "before calling `fit` or all the trials during `fit` failed.")
        trained = self._best_estimator
        try:
            predictions = trained.predict(X_eval)
        except ValueError as e:
            logger.warning("ValueError in predicting using Hyperopt:{}, the error is:{}".format(trained, e))
            predictions = None

        return predictions

    def summary(self):
        """Table summarizing the trial results (ID, loss, time, log_loss, status).

Returns
-------
result : DataFrame"""
        def make_record(trial_dict):
            return {
                'name': f'p{trial_dict["tid"]}',
                'tid': trial_dict['tid'],
                'loss': trial_dict['result'].get('loss', float('nan')),
                'time': trial_dict['result'].get('time', float('nan')),
                'log_loss': trial_dict['result'].get('log_loss', float('nan')),
                'status': trial_dict['result']['status']}
        records = [make_record(td) for td in self._trials.trials]
        result = pd.DataFrame.from_records(records, index='name')
        return result

    def get_pipeline(self, pipeline_name=None, astype='lale'):
        """Retrieve one of the trials.

Parameters
----------
pipeline_name : union type, default None

    - string
        Key for table returned by summary(), return a trainable pipeline.

    - None
        When not specified, return the best trained pipeline found.

astype : 'lale' or 'sklearn', default 'lale'
    Type of resulting pipeline.

Returns
-------
result : Trained operator if best, trainable operator otherwise.
"""
        if pipeline_name is None:
            result = getattr(self, '_best_estimator', None)
        else:
            tid = int(pipeline_name[1:])
            params = self._trials.trials[tid]['result']['params']
            result = create_instance_from_hyperopt_search_space(
                self.estimator, params)
        if result is None or astype == 'lale':
            return result
        assert astype == 'sklearn', astype
        return result.export_to_sklearn_pipeline()
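
A minimal usage sketch for the HyperoptImpl class above. This is hypothetical driver code, not part of the lale source; it assumes hyperopt, lale, and scikit-learn are installed and that LogisticRegression is the lale.lib.sklearn operator.

# Hypothetical example: single-objective tuning with HyperoptImpl.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from lale.lib.sklearn import LogisticRegression

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

opt = HyperoptImpl(estimator=LogisticRegression(), max_evals=10, cv=3,
                   scoring="accuracy", show_progressbar=False)
opt.fit(X_train, y_train)
print(opt.summary())                # one row per trial: tid, loss, time, log_loss, status
print(opt.predict(X_test)[:5])      # predictions from the best trained pipeline
trainable = opt.get_pipeline("p3")  # trainable pipeline rebuilt from the trial with tid 3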
Example #3
File: smac.py  Project: shinnar/lale
class _SMACImpl:
    def __init__(
        self,
        estimator=None,
        max_evals=50,
        cv=5,
        handle_cv_failure=False,
        scoring=None,
        best_score=0.0,
        max_opt_time=None,
        lale_num_grids=None,
    ):
        assert smac_installed, """Your Python environment does not have smac installed. You can install it with
    pip install 'smac<=0.10.0'
or with
    pip install 'lale[full]'"""
        self.max_evals = max_evals
        if estimator is None:
            self.estimator = LogisticRegression()
        else:
            self.estimator = estimator

        self.scoring = scoring
        if self.scoring is None:
            is_clf = self.estimator.is_classifier()
            if is_clf:
                self.scoring = "accuracy"
            else:
                self.scoring = "r2"

        self.best_score = best_score
        self.handle_cv_failure = handle_cv_failure
        self.cv = cv
        self.max_opt_time = max_opt_time
        self.lale_num_grids = lale_num_grids
        self.trials = None

    def fit(self, X_train, y_train):
        data_schema = lale.helpers.fold_schema(X_train, y_train, self.cv,
                                               self.estimator.is_classifier())
        self.search_space: ConfigurationSpace = get_smac_space(
            self.estimator,
            lale_num_grids=self.lale_num_grids,
            data_schema=data_schema)
        # Scenario object
        scenario_options = {
            "run_obj": "quality",  # optimize quality (alternatively runtime)
            "runcount-limit": self.max_evals,  # maximum function evaluations
            "cs": self.search_space,  # configuration space
            "deterministic": "true",
            "abort_on_first_run_crash": False,
        }
        if self.max_opt_time is not None:
            scenario_options["wallclock_limit"] = self.max_opt_time
        self.scenario = Scenario(scenario_options)

        self.cv = check_cv(self.cv,
                           y=y_train,
                           classifier=self.estimator.is_classifier())

        def smac_train_test(trainable, X_train, y_train):
            try:
                cv_score, logloss, execution_time = cross_val_score_track_trials(
                    trainable,
                    X_train,
                    y_train,
                    cv=self.cv,
                    scoring=self.scoring)
                logger.debug("Successful trial of SMAC")
            except BaseException as e:
                # If there is any error in cross validation, use the score based on a random train-test split as the evaluation criterion
                if self.handle_cv_failure:
                    (
                        X_train_part,
                        X_validation,
                        y_train_part,
                        y_validation,
                    ) = train_test_split(X_train, y_train, test_size=0.20)
                    start = time.time()
                    trained = trainable.fit(X_train_part, y_train_part)
                    scorer = check_scoring(trainable, scoring=self.scoring)
                    cv_score = scorer(trained, X_validation, y_validation)
                    execution_time = time.time() - start
                    y_pred_proba = trained.predict_proba(X_validation)
                    try:
                        logloss = log_loss(y_true=y_validation,
                                           y_pred=y_pred_proba)
                    except BaseException:
                        logloss = 0
                        logger.debug("Warning, log loss cannot be computed")
                else:
                    logger.debug("Error {} with pipeline:{}".format(
                        e, trainable.to_json()))
                    raise e
            return cv_score, logloss, execution_time

        def f(trainable):
            return_dict = {}
            try:
                score, logloss, execution_time = smac_train_test(
                    trainable, X_train=X_train, y_train=y_train)
                return_dict = {
                    "loss": self.best_score - score,
                    "time": execution_time,
                    "log_loss": logloss,
                }
            except BaseException as e:
                logger.warning(
                    f"Exception caught in SMACCV:{type(e)}, {traceback.format_exc()}, SMAC will set a cost_for_crash to MAXINT."
                )
                raise e
            return return_dict["loss"]

        try:
            smac = orig_SMAC(
                scenario=self.scenario,
                rng=np.random.RandomState(42),
                tae_runner=lale_op_smac_tae(self.estimator, f),
            )
            incumbent = smac.optimize()
            self.trials = smac.get_runhistory()
            trainable = lale_trainable_op_from_config(self.estimator,
                                                      incumbent)
            # get the trainable corresponding to the best params and train it on the entire training dataset.
            trained = trainable.fit(X_train, y_train)
            self._best_estimator = trained
        except BudgetExhaustedException:
            logger.warning(
                "Maximum allotted optimization time exceeded. Optimization exited prematurely"
            )
            # No estimator was trained within the time budget.
            self._best_estimator = None
        except BaseException as e:
            logger.warning("Error during optimization: {}".format(e))
            self._best_estimator = None

        return self

    def predict(self, X_eval):
        import warnings

        warnings.filterwarnings("ignore")
        trained = self._best_estimator
        if trained is None:
            logger.warning(
                "Could not get a trained best estimator when predicting using SMACCV; returning None."
            )
            return None

        try:
            predictions = trained.predict(X_eval)
        except ValueError as e:
            logger.warning(
                "ValueError in predicting using SMACCV:{}, the error is:{}".
                format(trained, e))
            predictions = None

        return predictions

    def get_trials(self):
        """Returns the trials i.e. RunHistory object.

        Returns
        -------
        smac.runhistory.runhistory.RunHistory
            RunHistory of all the trials executed during the optimization i.e. fit method of SMACCV.
        """
        return self.trials

    def get_pipeline(self, pipeline_name=None, astype="lale"):
        if pipeline_name is not None:
            raise NotImplementedError("Cannot get pipeline by name yet.")
        result = getattr(self, "_best_estimator", None)
        if result is None or astype == "lale":
            return result
        assert astype == "sklearn", astype
        # TODO: should this try and return an actual sklearn pipeline?
        return result
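
A minimal usage sketch for the _SMACImpl class above. This is hypothetical driver code, not part of the lale source; it assumes smac (<=0.10.0), lale, and scikit-learn are installed and that LogisticRegression is the lale.lib.sklearn operator.

# Hypothetical example: tune with SMAC and inspect the run history.
from sklearn.datasets import load_iris
from lale.lib.sklearn import LogisticRegression

X, y = load_iris(return_X_y=True)
opt = _SMACImpl(estimator=LogisticRegression(), max_evals=10, cv=3)
opt.fit(X, y)
run_history = opt.get_trials()  # smac RunHistory of all evaluated configurations
best = opt.get_pipeline()       # trained pipeline for the SMAC incumbent, or None
if best is not None:
    print(best.predict(X[:5]))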
Example #4
class _HyperoptImpl:
    def __init__(
        self,
        *,
        estimator=None,
        scoring=None,
        best_score=0.0,
        args_to_scorer=None,
        cv=5,
        handle_cv_failure=False,
        verbose=False,
        show_progressbar=True,
        algo="tpe",
        max_evals=50,
        frac_evals_with_defaults=0,
        max_opt_time=None,
        max_eval_time=None,
        pgo: Optional[PGO] = None,
    ):
        self.max_evals = max_evals
        if estimator is None:
            self.estimator = LogisticRegression()
        else:
            self.estimator = estimator
        if frac_evals_with_defaults > 0:
            self.evals_with_defaults = int(frac_evals_with_defaults * max_evals)
        else:
            self.evals_with_defaults = 0
        self.algo = algo
        self.scoring = scoring
        if self.scoring is None:
            is_clf = self.estimator.is_classifier()
            if is_clf:
                self.scoring = "accuracy"
            else:
                self.scoring = "r2"
        self.best_score = best_score
        self.handle_cv_failure = handle_cv_failure
        self.cv = cv
        self._trials = hyperopt.Trials()
        self._default_trials = hyperopt.Trials()
        self.max_opt_time = max_opt_time
        self.max_eval_time = max_eval_time
        self.pgo = pgo
        self.show_progressbar = show_progressbar
        if args_to_scorer is not None:
            self.args_to_scorer = args_to_scorer
        else:
            self.args_to_scorer = {}
        self.verbose = verbose

    def _summarize_statuses(self):
        status_list = self._trials.statuses()
        status_hist = {}
        for status in status_list:
            status_hist[status] = 1 + status_hist.get(status, 0)
        if hyperopt.STATUS_FAIL in status_hist:
            print(
                f"{status_hist[hyperopt.STATUS_FAIL]} out of {len(status_list)} trials failed, call summary() for details."
            )
            if not self.verbose:
                print("Run with verbose=True to see per-trial exceptions.")

    def fit(self, X_train, y_train, **fit_params):
        opt_start_time = time.time()
        is_clf = self.estimator.is_classifier()
        self.cv = check_cv(self.cv, y=y_train, classifier=is_clf)
        data_schema = lale.helpers.fold_schema(X_train, y_train, self.cv, is_clf)
        self.search_space = hyperopt.hp.choice(
            "meta_model",
            [
                hyperopt_search_space(
                    self.estimator, pgo=self.pgo, data_schema=data_schema
                )
            ],
        )
        # Create a search space with default hyperparameters for all trainable parts of the pipeline.
        # This search space is used for `frac_evals_with_defaults` fraction of the total trials.
        try:
            self.search_space_with_defaults = hyperopt.hp.choice(
                "meta_model",
                [
                    hyperopt_search_space(
                        self.estimator.freeze_trainable(),
                        pgo=self.pgo,
                        data_schema=data_schema,
                    )
                ],
            )
        except Exception:
            logger.warning(
                "Exception caught during generation of default search space, setting frac_evals_with_defaults to zero."
            )
            self.evals_with_defaults = 0

        def hyperopt_train_test(params, X_train, y_train):
            warnings.filterwarnings("ignore")

            trainable = create_instance_from_hyperopt_search_space(
                self.estimator, params
            )
            try:
                cv_score, logloss, execution_time = cross_val_score_track_trials(
                    trainable,
                    X_train,
                    y_train,
                    cv=self.cv,
                    scoring=self.scoring,
                    args_to_scorer=self.args_to_scorer,
                )
                logger.debug(
                    "Successful trial of hyperopt with hyperparameters:{}".format(
                        params
                    )
                )
            except BaseException as e:
                # If there is any error in cross validation, use the score based on a random train-test split as the evaluation criterion
                if self.handle_cv_failure and trainable is not None:
                    (
                        X_train_part,
                        X_validation,
                        y_train_part,
                        y_validation,
                    ) = train_test_split(X_train, y_train, test_size=0.20)
                    start = time.time()
                    trained = trainable.fit(X_train_part, y_train_part, **fit_params)
                    scorer = check_scoring(trainable, scoring=self.scoring)
                    cv_score = scorer(
                        trained, X_validation, y_validation, **self.args_to_scorer
                    )
                    execution_time = time.time() - start
                    y_pred_proba = trained.predict_proba(X_validation)
                    try:
                        logloss = log_loss(y_true=y_validation, y_pred=y_pred_proba)
                    except BaseException:
                        logloss = 0
                        logger.debug("Warning, log loss cannot be computed")
                else:
                    logger.debug(e)
                    if trainable is None:
                        logger.debug(
                            "Error {} with uncreatable pipeline with parameters:{}".format(
                                e, lale.pretty_print.hyperparams_to_string(params)
                            )
                        )
                    else:
                        logger.debug(
                            "Error {} with pipeline:{}".format(e, trainable.to_json())
                        )
                    raise e
            return cv_score, logloss, execution_time

        def merge_trials(trials1, trials2):
            max_tid = max([trial["tid"] for trial in trials1.trials])

            for trial in trials2:
                tid = trial["tid"] + max_tid + 1
                hyperopt_trial = hyperopt.Trials().new_trial_docs(
                    tids=[None], specs=[None], results=[None], miscs=[None]
                )
                hyperopt_trial[0] = trial
                hyperopt_trial[0]["tid"] = tid
                hyperopt_trial[0]["misc"]["tid"] = tid
                for key in hyperopt_trial[0]["misc"]["idxs"].keys():
                    hyperopt_trial[0]["misc"]["idxs"][key] = [tid]
                trials1.insert_trial_docs(hyperopt_trial)
                trials1.refresh()
            return trials1

        def proc_train_test(params, X_train, y_train, return_dict):
            return_dict["params"] = copy.deepcopy(params)
            try:
                score, logloss, execution_time = hyperopt_train_test(
                    params, X_train=X_train, y_train=y_train
                )
                return_dict["loss"] = self.best_score - score
                return_dict["time"] = execution_time
                return_dict["log_loss"] = logloss
                return_dict["status"] = hyperopt.STATUS_OK
            except BaseException as e:
                exception_type = f"{type(e).__module__}.{type(e).__name__}"
                try:
                    trainable = create_instance_from_hyperopt_search_space(
                        self.estimator, params
                    )
                    if trainable is None:
                        trial_info = f"hyperparams: {params}"
                    else:
                        trial_info = f'pipeline: """{trainable.pretty_print(show_imports=False)}"""'

                except BaseException:
                    trial_info = f"hyperparams: {params}"
                error_msg = f"Exception caught in Hyperopt: {exception_type}, {traceback.format_exc()}with {trial_info}"
                logger.warning(error_msg + ", setting status to FAIL")
                return_dict["status"] = hyperopt.STATUS_FAIL
                return_dict["error_msg"] = error_msg
                if self.verbose:
                    print(return_dict["error_msg"])

        def get_final_trained_estimator(params, X_train, y_train):
            warnings.filterwarnings("ignore")
            trainable = create_instance_from_hyperopt_search_space(
                self.estimator, params
            )
            if trainable is None:
                return None
            else:
                trained = trainable.fit(X_train, y_train, **fit_params)
                return trained

        def f(params):
            current_time = time.time()
            if (self.max_opt_time is not None) and (
                (current_time - opt_start_time) > self.max_opt_time
            ):
                # if max optimization time set, and we have crossed it, exit optimization completely
                sys.exit(0)
            if self.max_eval_time:
                # Run the trial evaluation in a subprocess that can be interrupted
                manager = multiprocessing.Manager()
                proc_dict: Dict[str, Any] = manager.dict()
                p = multiprocessing.Process(
                    target=proc_train_test, args=(params, X_train, y_train, proc_dict)
                )
                p.start()
                p.join(self.max_eval_time)
                if p.is_alive():
                    p.terminate()
                    p.join()
                    logger.warning(
                        f"Maximum allotted evaluation time exceeded with hyperparams: {params}, setting status to FAIL"
                    )
                    proc_dict["status"] = hyperopt.STATUS_FAIL
                if "status" not in proc_dict:
                    logger.warning("Corrupted results, setting status to FAIL")
                    proc_dict["status"] = hyperopt.STATUS_FAIL
            else:
                proc_dict = {}
                proc_train_test(params, X_train, y_train, proc_dict)
            return proc_dict

        algo = getattr(hyperopt, self.algo)
        # Search in the search space with defaults
        if self.evals_with_defaults > 0:
            try:
                hyperopt.fmin(
                    f,
                    self.search_space_with_defaults,
                    algo=algo.suggest,
                    max_evals=self.evals_with_defaults,
                    trials=self._default_trials,
                    rstate=np.random.RandomState(SEED),
                    show_progressbar=self.show_progressbar,
                )
            except SystemExit:
                logger.warning(
                    "Maximum alloted optimization time exceeded. Optimization exited prematurely"
                )
            except AllTrialsFailed:
                self._best_estimator = None
                if hyperopt.STATUS_OK not in self._trials.statuses():
                    raise ValueError(
                        "Error from hyperopt, none of the trials succeeded."
                    )

        try:
            hyperopt.fmin(
                f,
                self.search_space,
                algo=algo.suggest,
                max_evals=self.max_evals - self.evals_with_defaults,
                trials=self._trials,
                rstate=np.random.RandomState(SEED),
                show_progressbar=self.show_progressbar,
            )
        except SystemExit:
            logger.warning(
                "Maximum alloted optimization time exceeded. Optimization exited prematurely"
            )
        except AllTrialsFailed:
            self._best_estimator = None
            if hyperopt.STATUS_OK not in self._trials.statuses():
                self._summarize_statuses()
                raise ValueError("Error from hyperopt, none of the trials succeeded.")
        self._trials = merge_trials(self._trials, self._default_trials)
        if self.show_progressbar:
            self._summarize_statuses()
        try:
            best_trial = self._trials.best_trial
            val_loss = self._trials.best_trial["result"]["loss"]
            if len(self._default_trials) > 0:
                default_val_loss = self._default_trials.best_trial["result"]["loss"]
                if default_val_loss < val_loss:
                    best_trial = self._default_trials.best_trial
            best_params = best_trial["result"]["params"]
            logger.info(
                "best score: {:.1%}\nbest hyperparams found using {} hyperopt trials: {}".format(
                    self.best_score - self._trials.average_best_error(),
                    self.max_evals,
                    best_params,
                )
            )
            trained = get_final_trained_estimator(best_params, X_train, y_train)
            self._best_estimator = trained
        except BaseException as e:
            logger.warning(
                "Unable to extract the best parameters from optimization, the error: {}".format(
                    e
                )
            )
            self._best_estimator = None

        return self

    def predict(self, X_eval, **predict_params):
        import warnings

        warnings.filterwarnings("ignore")
        if self._best_estimator is None:
            raise ValueError(
                "Can not predict as the best estimator is None. Either an attempt to call `predict` "
                "before calling `fit` or all the trials during `fit` failed."
            )
        trained = self._best_estimator
        try:
            predictions = trained.predict(X_eval, **predict_params)
        except ValueError as e:
            logger.warning(
                "ValueError in predicting using Hyperopt:{}, the error is:{}".format(
                    trained, e
                )
            )
            predictions = None

        return predictions

    def summary(self):
        """Table summarizing the trial results (ID, loss, time, log_loss, status).

        Returns
        -------
        result : DataFrame"""

        def make_record(trial_dict):
            return {
                "name": f'p{trial_dict["tid"]}',
                "tid": trial_dict["tid"],
                "loss": trial_dict["result"].get("loss", float("nan")),
                "time": trial_dict["result"].get("time", float("nan")),
                "log_loss": trial_dict["result"].get("log_loss", float("nan")),
                "status": trial_dict["result"]["status"],
            }

        records = [make_record(td) for td in self._trials.trials]
        result = pd.DataFrame.from_records(records, index="name")
        return result

    def get_pipeline(self, pipeline_name=None, astype="lale"):
        """Retrieve one of the trials.

        Parameters
        ----------
        pipeline_name : union type, default None

            - string
                Key for table returned by summary(), return a trainable pipeline.

            - None
                When not specified, return the best trained pipeline found.

        astype : 'lale' or 'sklearn', default 'lale'
            Type of resulting pipeline.

        Returns
        -------
        result : Trained operator if best, trainable operator otherwise."""
        best_name = None
        if self._best_estimator is not None:
            best_name = f'p{self._trials.best_trial["tid"]}'
        if pipeline_name is None:
            pipeline_name = best_name
        if pipeline_name == best_name:
            result = getattr(self, "_best_estimator", None)
        else:
            assert pipeline_name is not None
            tid = int(pipeline_name[1:])
            params = self._trials.trials[tid]["result"]["params"]
            result = create_instance_from_hyperopt_search_space(self.estimator, params)
        if result is None or astype == "lale":
            return result
        assert astype == "sklearn", astype
        return result.export_to_sklearn_pipeline()
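
The impl classes above are normally reached through the corresponding lale operator wrappers rather than instantiated directly. A minimal sketch of that entry point, assuming the public lale.lib.lale.Hyperopt operator wraps _HyperoptImpl (the usual lale convention for *Impl classes):

# Hypothetical example: tune a planned lale pipeline end to end via the public wrapper.
from sklearn.datasets import load_iris
from lale.lib.lale import Hyperopt
from lale.lib.sklearn import PCA, LogisticRegression

X, y = load_iris(return_X_y=True)
planned = PCA >> LogisticRegression     # planned pipeline; hyperparameters left open
opt = Hyperopt(estimator=planned, max_evals=10, cv=3, scoring="accuracy",
               show_progressbar=False)
trained = opt.fit(X, y)
print(trained.summary())                # per-trial loss/time/log_loss/status table
best_pipeline = trained.get_pipeline()  # best trained pipeline found during search
print(best_pipeline.predict(X[:5]))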