Example no. 1
    def from_json(self, json_desc):
        self.library_version = json_desc.get("library_version", self.library_version)
        self.algorithm_name = json_desc.get("algorithm_name", self.algorithm_name)
        self.algorithm_short_name = json_desc.get(
            "algorithm_short_name", self.algorithm_short_name
        )
        self.uid = json_desc.get("uid", self.uid)
        self.selected_models = []
        models_json = json_desc.get("models", [])
        for selected in models_json:
            model = selected["model"]
            repeat = selected["repeat"]

            # restore the iterative learner (ModelFramework) from its JSON payload
            il = ModelFramework(model.get("params"))
            il.from_json(model)
            self.selected_models += [
                # {"model": LearnerFactory.load(model), "repeat": repeat}
                {"model": il, "repeat": repeat}
            ]
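
A minimal sketch of how this `from_json` might be driven. Here `ensemble` stands for any object exposing the method above, and the nested "model" payload is hypothetical; it would normally be produced by a matching `to_json`:

# hypothetical round-trip driver for the from_json method shown above
json_desc = {
    "library_version": "0.7.0",
    "algorithm_name": "Greedy Ensemble",
    "algorithm_short_name": "Ensemble",
    "uid": "abc123",
    "models": [
        # each entry is rebuilt as ModelFramework(model["params"]) restored via from_json
        {"model": {"params": {}}, "repeat": 2},
    ],
}
ensemble.from_json(json_desc)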
Example no. 2
class BaseAutoML(BaseEstimator, ABC):
    """
    Automated Machine Learning for supervised tasks (binary classification, multiclass classification, regression).
    Warning: This class should not be used directly. Use derived classes instead.
    """
    def __init__(self):
        logger.debug("BaseAutoML.__init__")
        self._results_path = None
        self._models = [
        ]  # instances of iterative learner framework or ensemble
        self._best_model = None
        self._verbose = True
        self._threshold = None  # used only in classification
        self._metrics_details = None
        self._max_metrics = None
        self._confusion_matrix = None
        self._X_path, self._y_path = None, None
        self._data_info = None
        self._model_paths = []
        self._stacked_models = None
        self._fit_level = None
        self._start_time = time.time()
        self._time_ctrl = None
        self._all_params = {}
        # https://scikit-learn.org/stable/developers/develop.html#universal-attributes
        self.n_features_in_ = None  # for scikit-learn api

    def _get_tuner_params(self, start_random_models, hill_climbing_steps,
                          top_models_to_improve):
        return {
            "start_random_models": start_random_models,
            "hill_climbing_steps": hill_climbing_steps,
            "top_models_to_improve": top_models_to_improve,
        }

    def _check_can_load(self):
        """ Checks if AutoML can be loaded from a folder"""
        if self.results_path is not None:
            # Dir exists and can be loaded
            if os.path.exists(self.results_path) and os.path.exists(
                    os.path.join(self.results_path, "params.json")):
                self.load(self.results_path)
                self._results_path = self.results_path

    def load(self, path):
        logger.info("Loading AutoML models ...")
        try:
            with open(os.path.join(path, "params.json")) as fin:
                params = json.load(fin)

            self._model_paths = params["saved"]
            self._ml_task = params["ml_task"]
            self._eval_metric = params["eval_metric"]
            stacked_models = params.get("stacked")

            models_map = {}
            for model_path in self._model_paths:
                if model_path.endswith("Ensemble") or model_path.endswith(
                        "Ensemble_Stacked"):
                    ens = Ensemble.load(model_path, models_map)
                    self._models += [ens]
                    models_map[ens.get_name()] = ens
                else:
                    m = ModelFramework.load(model_path)
                    self._models += [m]
                    models_map[m.get_name()] = m

            if stacked_models is not None:
                self._stacked_models = []
                for stacked_model_name in stacked_models:
                    self._stacked_models += [models_map[stacked_model_name]]

            best_model_name = None
            with open(os.path.join(path, "best_model.txt"), "r") as fin:
                best_model_name = fin.read()

            self._best_model = models_map[best_model_name]

            data_info_path = os.path.join(path, "data_info.json")
            with open(data_info_path) as fin:
                self._data_info = json.load(fin)
            self.n_features_in_ = self._data_info["n_features"]

            if "n_classes" in self._data_info:
                self.n_classes = self._data_info["n_classes"]

            self._fit_level = "finished"
        except Exception as e:
            raise AutoMLException(f"Cannot load AutoML directory. {str(e)}")

    def get_leaderboard(self):
        ldb = {
            "name": [],
            "model_type": [],
            "metric_type": [],
            "metric_value": [],
            "train_time": [],
        }
        for m in self._models:
            ldb["name"] += [m.get_name()]
            ldb["model_type"] += [m.get_type()]
            ldb["metric_type"] += [self._eval_metric]
            ldb["metric_value"] += [m.get_final_loss()]
            ldb["train_time"] += [np.round(m.get_train_time(), 2)]
        return pd.DataFrame(ldb)
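    # Illustration (hypothetical values): get_leaderboard() returns one row per
    # trained model, e.g.
    #     name        model_type  metric_type  metric_value  train_time
    #     1_Baseline  Baseline    logloss      0.6931        0.5
    #     2_Linear    Linear      logloss      0.4213        1.2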

    def keep_model(self, model, model_path):
        if model is None:
            return
        self._models += [model]
        self._model_paths += [model_path]
        self.select_and_save_best()

        self.verbose_print("{} {} {} trained in {} seconds".format(
            model.get_name(),
            self._eval_metric,
            np.round(model.get_final_loss(), 6),
            np.round(model.get_train_time(), 2),
        ))
        self._time_ctrl.log_time(model.get_name(), model.get_type(),
                                 self._fit_level, model.get_train_time())

    def create_dir(self, model_path):
        if not os.path.exists(model_path):
            try:
                os.mkdir(model_path)
            except Exception as e:
                raise AutoMLException(
                    f"Cannot create directory {model_path}. {str(e)}")

    def train_model(self, params):

        # do we have enough time to train?
        # if not, skip
        if not self._time_ctrl.enough_time(params["learner"]["model_type"],
                                           self._fit_level):
            logger.info(
                f"Cannot train {params['name']} because of the time constraint"
            )
            return False

        # let's create directory to log all training artifacts
        model_path = os.path.join(self._results_path, params["name"])
        self.create_dir(model_path)

        # prepare callbacks
        early_stop = EarlyStopping({
            "metric": {
                "name": self._eval_metric
            },
            "log_to_dir": model_path
        })

        learner_time_constraint = LearnerTimeConstraint({
            "learner_time_limit":
            self._time_ctrl.learner_time_limit(
                params["learner"]["model_type"],
                self._fit_level,
                self._validation_strategy.get("k_folds", 1.0),
            ),
            "min_steps":
            params["additional"].get("min_steps"),
        })

        total_time_constraint = TotalTimeConstraint({
            "total_time_limit":
            self._total_time_limit if self._model_time_limit is None else None,
            "total_time_start":
            self._start_time,
        })

        # create model framework
        mf = ModelFramework(
            params,
            callbacks=[
                early_stop, learner_time_constraint, total_time_constraint
            ],
        )

        # start training
        logger.info(
            f"Train model #{len(self._models)+1} / Model name: {params['name']}"
        )
        mf.train(model_path)

        # save the model
        mf.save(model_path)

        # and keep info about the model
        self.keep_model(mf, model_path)
        return True

    def verbose_print(self, msg):
        if self._verbose > 0:
            # self._progress_bar.write(msg)
            print(msg)

    def ensemble_step(self, is_stacked=False):
        if self._train_ensemble and len(self._models) > 1:

            ensemble_path = os.path.join(
                self._results_path,
                "Ensemble_Stacked" if is_stacked else "Ensemble")
            self.create_dir(ensemble_path)

            self.ensemble = Ensemble(self._eval_metric,
                                     self._ml_task,
                                     is_stacked=is_stacked)
            oofs, target = self.ensemble.get_oof_matrix(self._models)
            self.ensemble.fit(oofs, target)
            self.ensemble.save(ensemble_path)
            self.keep_model(self.ensemble, ensemble_path)
            return True
        return False

    def can_we_stack_them(self, y):
        # placeholder: could return False for multiclass tasks with too many
        # classes; currently stacking is always allowed
        return True

    def get_stacked_data(self, X, mode="training"):
        # mode can be `training` or `predict`
        if self._stacked_models is None:
            return X
        all_oofs = []
        for m in self._stacked_models:
            oof = None
            if mode == "training":
                oof = m.get_out_of_folds()
            else:
                oof = m.predict(X)
                if self._ml_task == BINARY_CLASSIFICATION:
                    cols = [f for f in oof.columns if "prediction" in f]
                    if len(cols) == 2:
                        oof = pd.DataFrame({"prediction": oof[cols[1]]})

            cols = [f for f in oof.columns if "prediction" in f]
            oof = oof[cols]
            oof.columns = [f"{m.get_name()}_{c}" for c in cols]
            all_oofs += [oof]

        org_index = X.index.copy()
        X.reset_index(drop=True, inplace=True)
        X_stacked = pd.concat(all_oofs + [X], axis=1)

        X_stacked.index = org_index.copy()
        X.index = org_index.copy()
        return X_stacked
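    # Illustration (hypothetical model name): for a stacked model called "1_Xgboost",
    # its out-of-fold column "prediction" is renamed to "1_Xgboost_prediction" and
    # concatenated column-wise with the original features in X.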

    def _perform_model_stacking(self):

        if self._stacked_models is not None:
            return

        ldb = self.get_leaderboard()
        ldb = ldb.sort_values(by="metric_value", ascending=True)

        models_map = {
            m.get_name(): m
            for m in self._models if not m._is_stacked
        }
        self._stacked_models = []
        models_limit = 10

        for model_type in np.unique(ldb.model_type):
            if model_type in ["Baseline"]:
                continue
            ds = ldb[ldb.model_type == model_type].copy()
            ds.sort_values(by="metric_value", inplace=True)

            for n in list(ds.name.iloc[:models_limit].values):
                self._stacked_models += [models_map[n]]

        scores = [m.get_final_loss() for m in self._stacked_models]
        self._stacked_models = [
            self._stacked_models[i] for i in np.argsort(scores).tolist()
        ]

    def prepare_for_stacking(self):
        # print("Stacked models ....")
        # do we have enough models?
        if len(self._models) < 5:
            return
        # do we have time?
        if self._total_time_limit is not None:
            time_left = self._total_time_limit - (time.time() -
                                                  self._start_time)
            # we need at least 60 seconds to do anything
            if time_left < 60:
                return

        self._perform_model_stacking()

        X_stacked_path = os.path.join(self._results_path, "X_stacked.parquet")
        if os.path.exists(X_stacked_path):
            return

        X = pd.read_parquet(self._X_path)
        org_columns = X.columns.tolist()
        X_stacked = self.get_stacked_data(X)
        new_columns = X_stacked.columns.tolist()
        added_columns = [c for c in new_columns if c not in org_columns]

        # save stacked train data
        X_stacked.to_parquet(X_stacked_path, index=False)
        """
        # reuse old params
        for m in self._stacked_models:
            # print(m.get_type())
            # use only Xgboost, LightGBM and CatBoost as stacked models
            if m.get_type() not in ["Xgboost", "LightGBM", "CatBoost"]:
                continue

            params = copy.deepcopy(m.params)
            params["validation"]["X_train_path"] = X_train_stacked_path

            params["name"] = params["name"] + "_Stacked"
            params["is_stacked"] = True
            # print(params)

            if "model_architecture_json" in params["learner"]:
                # the new model will be created with wider input size
                del params["learner"]["model_architecture_json"]

            if self._ml_task == REGRESSION:
                # scale added predictions in regression if the target was scaled (in the case of NN)
                target_preprocessing = params["preprocessing"]["target_preprocessing"]
                scale = None
                if "scale_log_and_normal" in target_preprocessing:
                    scale = "scale_log_and_normal"
                elif "scale_normal" in target_preprocessing:
                    scale = "scale_normal"
                if scale is not None:
                    for col in added_columns:
                        params["preprocessing"]["columns_preprocessing"][col] = [
                            scale]

            self.train_model(params)
        """

    def _save_data(self, X, y):

        self._X_path = os.path.join(self._results_path, "X.parquet")
        self._y_path = os.path.join(self._results_path, "y.parquet")

        X.to_parquet(self._X_path, index=False)

        # let's check before any conversions
        target_is_numeric = pd.api.types.is_numeric_dtype(y)
        if self._ml_task == MULTICLASS_CLASSIFICATION:
            y = y.astype(str)

        pd.DataFrame({"target": y}).to_parquet(self._y_path, index=False)

        self._validation_strategy["X_path"] = self._X_path
        self._validation_strategy["y_path"] = self._y_path
        self._validation_strategy["results_path"] = self._results_path

        columns_and_target_info = DataInfo.compute(X, y, self._ml_task)

        self.n_features_in_ = X.shape[1]
        self.n_classes = len(np.unique(y[~pd.isnull(y)]))

        self._data_info = {
            "columns": X.columns.tolist(),
            "rows": y.shape[0],
            "cols": X.shape[1],
            "target_is_numeric": target_is_numeric,
            "columns_info": columns_and_target_info["columns_info"],
            "target_info": columns_and_target_info["target_info"],
            "n_features": self.n_features_in_,
        }
        # Add n_classes if not regression
        if self._ml_task != REGRESSION:
            self._data_info["n_classes"] = self.n_classes

        if columns_and_target_info.get("num_class") is not None:
            self._data_info["num_class"] = columns_and_target_info["num_class"]
        data_info_path = os.path.join(self._results_path, "data_info.json")
        with open(data_info_path, "w") as fout:
            fout.write(json.dumps(self._data_info, indent=4))

        self._drop_data_variables(X)

    def _drop_data_variables(self, X):

        X.drop(X.columns, axis=1, inplace=True)

    def _load_data_variables(self, X_train):
        if X_train.shape[1] == 0:
            X = pd.read_parquet(self._X_path)
            for c in X.columns:
                X_train.insert(loc=X_train.shape[1], column=c, value=X[c])

        os.remove(self._X_path)
        os.remove(self._y_path)

    def save_progress(self, step=None, generated_params=None):

        if step is not None and generated_params is not None:
            self._all_params[step] = generated_params

        state = {}

        state["fit_level"] = self._fit_level
        state["time_controller"] = self._time_ctrl.to_json()
        state["all_params"] = self._all_params

        fname = os.path.join(self._results_path, "progress.json")
        with open(fname, "w") as fout:
            fout.write(json.dumps(state, indent=4))

    def load_progress(self):
        state = {}
        fname = os.path.join(self._results_path, "progress.json")
        if not os.path.exists(fname):
            return
        state = json.load(open(fname, "r"))
        self._fit_level = state.get("fit_level", self._fit_level)
        self._all_params = state.get("all_params", self._all_params)
        self._time_ctrl = TimeController.from_json(
            state.get("time_controller"))

    def _validate_X_predict(self, X):
        """Validate X whenever one tries to predict, apply, predict_proba"""
        # X = check_array(X, ensure_2d=False)
        X = np.atleast_2d(X)
        n_features = X.shape[1]
        if self.n_features_in_ != n_features:
            raise ValueError(
                f"Number of features of the model must match the input. Model n_features_in_ is {self.n_features_in_} and input n_features is {n_features}. Reshape your data."
            )

    # Builds a pandas.DataFrame from the input. The input can be a numpy.ndarray, a matrix, or a pandas.DataFrame.
    # Used to build dataframes in `fit()` and `predict()`; that is why `y` may be None (the `predict()` case).
    def _build_dataframe(self, X, y=None):
        # If the input is not a pandas DataFrame, use scikit-learn validation for the X array
        if not isinstance(X, pd.DataFrame):
            # Validate X as array
            X = check_array(X, ensure_2d=False)
            # Force X to be 2D
            X = np.atleast_2d(X)
            # Create a pandas DataFrame from the numpy array; columns are named with the schema feature_{index}
            X = pd.DataFrame(
                X,
                columns=["feature_" + str(i) for i in range(1,
                                                            len(X[0]) + 1)])

        # Enforce column names to be strings
        X.columns = X.columns.astype(str)

        X.reset_index(drop=True, inplace=True)

        if y is None:
            return X

        # Check if y is np.ndarray, transform to pd.Series
        if isinstance(y, np.ndarray):
            y = check_array(y, ensure_2d=False)
            y = pd.Series(np.array(y), name="target")
        # if pd.DataFrame, slice first column
        elif isinstance(y, pd.DataFrame):
            y = np.array(y.iloc[:, 0])
            y = check_array(y, ensure_2d=False)
            y = pd.Series(np.array(y), name="target")

        X, y = ExcludeRowsMissingTarget.transform(X, y, warn=True)

        X.reset_index(drop=True, inplace=True)
        y.reset_index(drop=True, inplace=True)

        return X, y
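    # Illustration (a hypothetical call): plain numpy input gets auto-named columns,
    # e.g.
    #     X, y = automl._build_dataframe(np.array([[1, 2], [3, 4]]), np.array([0, 1]))
    #     list(X.columns)  # -> ["feature_1", "feature_2"]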

    def _fit(self, X, y):
        """Fits the AutoML model with data"""
        if self._fit_level == "finished":
            print(
                "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'."
            )
            return
        # Validate input and build dataframes
        X, y = self._build_dataframe(X, y)

        self.n_features_in_ = X.shape[1]
        self.n_classes = len(np.unique(y[~pd.isnull(y)]))

        # Get attributes (__init__ params)
        self._mode = self._get_mode()
        self._ml_task = self._get_ml_task()
        self._results_path = self._get_results_path()
        self._total_time_limit = self._get_total_time_limit()
        self._model_time_limit = self._get_model_time_limit()
        self._algorithms = self._get_algorithms()
        self._train_ensemble = self._get_train_ensemble()
        self._stack_models = self._get_stack_models()
        self._eval_metric = self._get_eval_metric()
        self._validation_strategy = self._get_validation_strategy()
        self._verbose = self._get_verbose()
        self._explain_level = self._get_explain_level()
        self._golden_features = self._get_golden_features()
        self._feature_selection = self._get_feature_selection()
        self._start_random_models = self._get_start_random_models()
        self._hill_climbing_steps = self._get_hill_climbing_steps()
        self._top_models_to_improve = self._get_top_models_to_improve()
        self._random_state = self._get_random_state()

        try:

            self.load_progress()
            if self._fit_level == "finished":
                print(
                    "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'."
                )
                return
            self._check_can_load()

            self.verbose_print(f"AutoML directory: {self._results_path}")
            self.verbose_print(
                f"The task is {self._ml_task} with evaluation metric {self._eval_metric}"
            )
            self.verbose_print(
                f"AutoML will use algorithms: {self._algorithms}")
            if self._stack_models:
                self.verbose_print("AutoML will stack models")
            if self._train_ensemble:
                self.verbose_print("AutoML will ensemble availabe models")

            self._start_time = time.time()
            if self._time_ctrl is not None:
                self._start_time -= self._time_ctrl.already_spend()

            # Automatic Exploratory Data Analysis
            if self._explain_level == 2:
                EDA.compute(X, y, os.path.join(self._results_path, "EDA"))

            # Save data
            self._save_data(X.copy(deep=False), y)

            tuner = MljarTuner(
                self._get_tuner_params(
                    self._start_random_models,
                    self._hill_climbing_steps,
                    self._top_models_to_improve,
                ),
                self._algorithms,
                self._ml_task,
                self._validation_strategy,
                self._explain_level,
                self._data_info,
                self._golden_features,
                self._feature_selection,
                self._train_ensemble,
                self._stack_models,
                self._random_state,
            )
            self.tuner = tuner

            steps = tuner.steps()
            self.verbose_print(f"AutoML steps: {steps}")
            if self._time_ctrl is None:
                self._time_ctrl = TimeController(
                    self._start_time,
                    self._total_time_limit,
                    self._model_time_limit,
                    steps,
                    self._algorithms,
                )

            self._time_ctrl.log_time(
                "prepare_data",
                "prepare_data",
                "prepare_data",
                time.time() - self._start_time,
            )

            for step in steps:
                self._fit_level = step
                start = time.time()
                # self._time_start[step] = start

                if step == "stack":
                    self.prepare_for_stacking()

                generated_params = []
                if step in self._all_params:
                    generated_params = self._all_params[step]
                else:
                    generated_params = tuner.generate_params(
                        step, self._models, self._results_path,
                        self._stacked_models)

                if not generated_params:
                    self.verbose_print(
                        f"Skip {step} because no parameters were generated.")
                    continue
                if "learner" in generated_params[0] and not self._time_ctrl.enough_time(
                        generated_params[0]["learner"]["model_type"],
                        self._fit_level):
                    self.verbose_print(
                        f"Skip {step} because of the time limit.")
                else:
                    model_str = "models" if len(generated_params) > 1 else "model"
                    self.verbose_print(
                        f"* Step {step} will try to check up to {len(generated_params)} {model_str}"
                    )

                for params in generated_params:
                    if params.get("status",
                                  "") in ["trained", "skipped", "error"]:
                        self.verbose_print(
                            f"{params['name']}: {params['status']}.")
                        continue

                    try:
                        trained = False
                        if "ensemble" in step:
                            trained = self.ensemble_step(
                                is_stacked=params["is_stacked"])
                        else:
                            trained = self.train_model(params)
                        params["status"] = "trained" if trained else "skipped"
                        params["final_loss"] = self._models[-1].get_final_loss(
                        )
                        params["train_time"] = self._models[-1].get_train_time(
                        )
                    except Exception as e:
                        self._update_errors_report(params.get("name"), str(e))
                        params["status"] = "error"

                    self.save_progress(step, generated_params)

            self._fit_level = "finished"
            self.save_progress()

            self.verbose_print(
                f"AutoML fit time: {np.round(time.time() - self._start_time,2)} seconds"
            )

        finally:
            if self._X_path is not None:
                self._load_data_variables(X)

        return self

    def _update_errors_report(self, model_name, error_msg):
        """Append error message to errors.md file. """
        errors_filename = os.path.join(self._get_results_path(), "errors.md")
        with open(errors_filename, "a") as fout:
            self.verbose_print(
                f"There was an error during {model_name} training.")
            self.verbose_print(f"Please check {errors_filename} for details.")
            fout.write(f"## Error for {model_name}\n\n")
            fout.write(error_msg)
            link = "https://github.com/mljar/mljar-supervised/issues/new"
            fout.write(
                f"\n\nPlease open a GitHub issue with the above error message at: {link}"
            )
            fout.write("\n\n")

    def select_and_save_best(self):
        # Select best model (lowest loss)
        self._best_model = min(self._models, key=lambda x: x.get_final_loss())

        with open(os.path.join(self._results_path, "best_model.txt"),
                  "w") as fout:
            fout.write(f"{self._best_model.get_name()}")

        with open(os.path.join(self._results_path, "params.json"),
                  "w") as fout:
            params = {
                "ml_task": self._ml_task,
                "eval_metric": self._eval_metric,
                "saved": self._model_paths,
            }
            if self._stacked_models is not None:
                params["stacked"] = [
                    m.get_name() for m in self._stacked_models
                ]
            fout.write(json.dumps(params, indent=4))

        ldb = self.get_leaderboard()
        ldb.to_csv(os.path.join(self._results_path, "leaderboard.csv"),
                   index=False)

        # save report
        ldb["Link"] = [
            f"[Results link]({m}/README.md)" for m in ldb["name"].values
        ]
        ldb.insert(loc=0, column="Best model", value="")
        ldb.loc[ldb.name == self._best_model.get_name(),
                "Best model"] = "**the best**"

        with open(os.path.join(self._results_path, "README.md"), "w") as fout:
            fout.write(f"# AutoML Leaderboard\n\n")
            fout.write(tabulate(ldb.values, ldb.columns, tablefmt="pipe"))
            LeaderboardPlots.compute(ldb, self._results_path, fout)

    def _check_is_fitted(self):
        # First check if model can be loaded
        self._check_can_load()
        # Check if fitted
        if self._fit_level != "finished":
            raise AutoMLException(
                "This model has not been fitted yet. Please call `fit()` first."
            )

    def _base_predict(self, X):
        self._check_is_fitted()

        X = self._build_dataframe(X)
        if not isinstance(X.columns[0], str):
            X.columns = [str(c) for c in X.columns]

        input_columns = X.columns.tolist()
        for column in self._data_info["columns"]:
            if column not in input_columns:
                raise AutoMLException(
                    f"Missing column: {column} in input data. Cannot predict")

        X = X[self._data_info["columns"]]
        self._validate_X_predict(X)

        # is the best model stacked?
        if self._best_model._is_stacked:
            self._perform_model_stacking()
            X_stacked = self.get_stacked_data(X, mode="predict")

            if self._best_model.get_type() == "Ensemble":
                # Ensemble is using both original and stacked data
                predictions = self._best_model.predict(X, X_stacked)
            else:
                predictions = self._best_model.predict(X_stacked)
        else:
            predictions = self._best_model.predict(X)

        if self._ml_task == BINARY_CLASSIFICATION:
            # need to predict the label based on predictions and threshold;
            # prediction columns are named "prediction_{label}", so strip the
            # "prediction_" prefix (11 characters) to recover the class labels
            neg_label, pos_label = (
                predictions.columns[0][11:],
                predictions.columns[1][11:],
            )

            if neg_label == "0" and pos_label == "1":
                neg_label, pos_label = 0, 1
            target_is_numeric = self._data_info.get("target_is_numeric", False)
            if target_is_numeric:
                neg_label = int(neg_label)
                pos_label = int(pos_label)
            # assume that it is binary classification
            predictions["label"] = predictions.iloc[:, 1] > self._best_model._threshold
            predictions["label"] = predictions["label"].map({
                True: pos_label,
                False: neg_label
            })
            return predictions
        elif self._ml_task == MULTICLASS_CLASSIFICATION:
            target_is_numeric = self._data_info.get("target_is_numeric", False)
            if target_is_numeric:
                predictions["label"] = predictions["label"].astype(np.int32)
            return predictions
        # Regression
        else:
            return predictions

    def _predict(self, X):

        predictions = self._base_predict(X)
        # Return predictions
        # If classification task the result is in column 'label'
        # If regression task the result is in column 'prediction'
        return (predictions["label"].to_numpy() if self._ml_task != REGRESSION
                else predictions["prediction"].to_numpy())

    def _predict_proba(self, X):
        # Check if the task type is correct
        if self._ml_task == REGRESSION:
            raise AutoMLException(
                f"Method `predict_proba()` can only be used for classification tasks. Current task: '{self._ml_task}'."
            )

        # Make and return predictions.
        # Class probabilities are in the prediction columns;
        # the `label` column must be dropped.
        return self._base_predict(X).drop(["label"], axis=1).to_numpy()

    def _predict_all(self, X):
        # Check if the task type is correct
        if self._ml_task == REGRESSION:
            raise AutoMLException(
                f"Method `predict_all()` can only be used for classification tasks. Current task: '{self._ml_task}'."
            )

        # Make and return predictions
        return self._base_predict(X)

    def _score(self, X, y=None):
        # y default must be None for scikit-learn compatibility

        # Check if y is None
        if y is None:
            raise AutoMLException("y must be specified.")

        predictions = self._predict(X)
        return (r2_score(y, predictions) if self._ml_task == REGRESSION else
                accuracy_score(y, predictions))

    def _get_mode(self):
        """ Gets the current mode"""
        self._validate_mode()
        return deepcopy(self.mode)

    def _get_ml_task(self):
        """ Gets the current ml_task. If "auto" it is determined"""
        self._validate_ml_task()
        if self.ml_task == "auto":
            classes_number = self.n_classes
            if classes_number == 2:
                self._estimator_type = "classifier"  # for sk-learn api
                return BINARY_CLASSIFICATION
            elif classes_number <= 20:
                self._estimator_type = "classifier"  # for sk-learn api
                return MULTICLASS_CLASSIFICATION
            else:
                self._estimator_type = "regressor"  # for sk-learn api
                return REGRESSION
        else:
            return deepcopy(self.ml_task)

    def _get_results_path(self):
        """ Gets the current results_path"""
        # if the results path is already set, return it
        if self._results_path is not None:
            return self._results_path

        self._validate_results_path()

        path = self.results_path

        if path is None:
            for i in range(1, 10001):
                name = f"AutoML_{i}"
                if not os.path.exists(name):
                    self.create_dir(name)
                    self._results_path = name
                    return name
            # If we got here, the directory could not be created; raise an exception
            raise AutoMLException("Cannot create directory for AutoML results")
        elif os.path.exists(self.results_path) and os.path.exists(
                os.path.join(
                    self.results_path,
                    "params.json")):  # AutoML already loaded, return path
            self._results_path = path
            return path
        # Dir does not exist, create it
        elif not os.path.exists(path):
            self.create_dir(path)
            self._results_path = path
            return path
        # Dir exists and is empty, use it
        elif os.path.exists(path) and not len(os.listdir(path)):
            self._results_path = path
            return path
        elif os.path.exists(path) and len(os.listdir(path)):
            raise AutoMLException(
                f"Cannot set directory for AutoML. Directory '{path}' is not empty."
            )

        raise AutoMLException("Cannot set directory for AutoML results")

    def _get_total_time_limit(self):
        """ Gets the current total_time_limit"""
        self._validate_total_time_limit()
        return deepcopy(self.total_time_limit)

    def _get_model_time_limit(self):
        """ Gets the current model_time_limit"""
        self._validate_model_time_limit()
        return deepcopy(self.model_time_limit)

    def _get_algorithms(self):
        """ Gets the current algorithms. If "auto" it is determined"""
        self._validate_algorithms()
        if self.algorithms == "auto":
            if self._get_mode() == "Explain":
                return [
                    "Baseline",
                    "Linear",
                    "Decision Tree",
                    "Random Forest",
                    "Xgboost",
                    "Neural Network",
                ]
            if self._get_mode() == "Perform":
                return [
                    "Linear",
                    "Random Forest",
                    "LightGBM",
                    "Xgboost",
                    "CatBoost",
                    "Neural Network",
                ]
            if self._get_mode() == "Compete":
                return [
                    "Linear",
                    "Decision Tree",
                    "Random Forest",
                    "Extra Trees",
                    "LightGBM",
                    "Xgboost",
                    "CatBoost",
                    "Neural Network",
                    "Nearest Neighbors",
                ]
        else:
            return deepcopy(self.algorithms)

    def _get_train_ensemble(self):
        """ Gets the current train_ensemble"""
        self._validate_train_ensemble()
        return deepcopy(self.train_ensemble)

    def _get_stack_models(self):
        """ Gets the current stack_models"""
        self._validate_stack_models()
        if self.stack_models == "auto":
            return self.mode == "Compete"
        else:
            return deepcopy(self.stack_models)

    def _get_eval_metric(self):
        """ Gets the current eval_metric"""
        self._validate_eval_metric()
        if self.eval_metric == "auto":
            if self._get_ml_task() == BINARY_CLASSIFICATION:
                return "logloss"
            elif self._get_ml_task() == MULTICLASS_CLASSIFICATION:
                return "logloss"
            elif self._get_ml_task() == REGRESSION:
                return "rmse"
        else:
            return deepcopy(self.eval_metric)

    def _get_validation_strategy(self):
        """ Gets the current validation_strategy"""
        strat = {}
        self._validate_validation_strategy()
        if self.validation_strategy == "auto":
            if self._get_mode() == "Explain":
                strat = {
                    "validation_type": "split",
                    "train_ratio": 0.75,
                    "shuffle": True,
                    "stratify": True,
                }
            elif self._get_mode() == "Perform":
                strat = {
                    "validation_type": "kfold",
                    "k_folds": 5,
                    "shuffle": True,
                    "stratify": True,
                }
            elif self._get_mode() == "Compete":
                strat = {
                    "validation_type": "kfold",
                    "k_folds": 10,
                    "shuffle": True,
                    "stratify": True,
                }
            if self._get_ml_task() == REGRESSION:
                if "stratify" in strat:
                    # it's better to always check
                    # before delete (trust me)
                    del strat["stratify"]
            return strat
        else:
            strat = deepcopy(self.validation_strategy)
            if "stratify" in strat:
                del strat["stratify"]
            return strat

    def _get_verbose(self):
        """Gets the current verbose"""
        self._validate_verbose()
        return deepcopy(self.verbose)

    def _get_explain_level(self):
        """ Gets the current explain_level"""
        self._validate_explain_level()
        if self.explain_level == "auto":
            if self._get_mode() == "Explain":
                return 2
            if self._get_mode() == "Perform":
                return 1
            if self._get_mode() == "Compete":
                return 0
        else:
            return deepcopy(self.explain_level)

    def _get_golden_features(self):
        self._validate_golden_features()
        if self.golden_features == "auto":
            if self._get_mode() == "Explain":
                return False
            if self._get_mode() == "Perform":
                return True
            if self._get_mode() == "Compete":
                return True
        else:
            return deepcopy(self.golden_features)

    def _get_feature_selection(self):
        """ Gets the current feature_selection"""
        self._validate_feature_selection()
        if self.feature_selection == "auto":
            if self._get_mode() == "Explain":
                return False
            if self._get_mode() == "Perform":
                return True
            if self._get_mode() == "Compete":
                return True
        else:
            return deepcopy(self.feature_selection)

    def _get_start_random_models(self):
        """ Gets the current start_random_models"""
        self._validate_start_random_models()
        if self.start_random_models == "auto":
            if self._get_mode() == "Explain":
                return 1
            if self._get_mode() == "Perform":
                return 5
            if self._get_mode() == "Compete":
                return 10
        else:
            return deepcopy(self.start_random_models)

    def _get_hill_climbing_steps(self):
        """ Gets the current hill_climbing_steps"""
        self._validate_hill_climbing_steps()
        if self.hill_climbing_steps == "auto":
            if self._get_mode() == "Explain":
                return 0
            if self._get_mode() == "Perform":
                return 2
            if self._get_mode() == "Compete":
                return 2
        else:
            return deepcopy(self.hill_climbing_steps)

    def _get_top_models_to_improve(self):
        """ Gets the current top_models_to_improve"""
        self._validate_top_models_to_improve()
        if self.top_models_to_improve == "auto":
            if self._get_mode() == "Explain":
                return 0
            if self._get_mode() == "Perform":
                return 2
            if self._get_mode() == "Compete":
                return 3
        else:
            return deepcopy(self.top_models_to_improve)

    def _get_random_state(self):
        """ Gets the current random_state"""
        self._validate_random_state()
        return deepcopy(self.random_state)

    def _validate_mode(self):
        """ Validates mode parameter"""
        valid_modes = ["Explain", "Perform", "Compete"]
        if self.mode not in valid_modes:
            raise ValueError(
                f"Expected 'mode' to be {' or '.join(valid_modes)}, got '{self.mode}'"
            )

    def _validate_ml_task(self):
        """ Validates ml_task parameter"""
        if isinstance(self.ml_task, str) and self.ml_task == "auto":
            return

        if self.ml_task not in AlgorithmsRegistry.get_supported_ml_tasks():
            raise ValueError(
                f"Expected 'ml_task' to be {' or '.join(AlgorithmsRegistry.get_supported_ml_tasks())}, got '{self.ml_task}'"
            )

    def _validate_results_path(self):
        """ Validates path parameter"""
        if self.results_path is None or isinstance(self.results_path, str):
            return

        raise ValueError(
            f"Expected 'results_path' to be of type string, got '{type(self.results_path)}'"
        )

    def _validate_total_time_limit(self):
        """ Validates total_time_limit parameter"""
        check_greater_than_zero_integer(self.total_time_limit,
                                        "total_time_limit")

    def _validate_model_time_limit(self):
        """ Validates model_time_limit parameter"""
        if self.model_time_limit is not None:
            check_greater_than_zero_integer(self.model_time_limit,
                                            "model_time_limit")

    def _validate_algorithms(self):
        """ Validates algorithms parameter"""
        if isinstance(self.algorithms, str) and self.algorithms == "auto":
            return

        for algo in self.algorithms:
            if algo not in list(
                    AlgorithmsRegistry.registry[self._ml_task].keys()):
                raise ValueError(
                    f"The algorithm {algo} is not allowed to use for ML task: {self._ml_task}. Allowed algorithms: {list(AlgorithmsRegistry.registry[self._ml_task].keys())}"
                )

    def _validate_train_ensemble(self):
        """ Validates train_ensemble parameter"""
        # `train_ensemble` defaults to True; check that it is a valid bool
        check_bool(self.train_ensemble, "train_ensemble")

    def _validate_stack_models(self):
        """ Validates stack_models parameter"""
        # `stack_models` defaults to "auto". If "auto", return; otherwise check that it is a valid bool
        if isinstance(self.stack_models, str) and self.stack_models == "auto":
            return

        check_bool(self.stack_models, "stack_models")

    def _validate_eval_metric(self):
        """ Validates eval_metric parameter"""
        # `eval_metric` defaults to "auto". If not "auto", check that it is a supported metric
        if isinstance(self.eval_metric, str) and self.eval_metric == "auto":
            return

        if (self._get_ml_task() == BINARY_CLASSIFICATION
                or self._get_ml_task() == MULTICLASS_CLASSIFICATION
            ) and self.eval_metric != "logloss":
            raise ValueError(
                f"Metric {self.eval_metric} is not allowed in ML task: {self._get_ml_task()}. \
                    Use 'logloss'")

        elif self._get_ml_task() == REGRESSION and self.eval_metric != "rmse":
            raise ValueError(
                f"Metric {self.eval_metric} is not allowed in ML task: {self._get_ml_task()}. \
                Use 'rmse'")

    def _validate_validation_strategy(self):
        """ Validates validation parameter"""
        if (isinstance(self.validation_strategy, str)
                and self.validation_strategy == "auto"):
            return

        # only validation_type is mandatory;
        # other validation parameters have defaults
        # set in their constructors
        required_keys = ["validation_type"]
        if type(self.validation_strategy) is not dict:
            raise ValueError(
                f"Expected 'validation_strategy' to be a dict, got '{type(self.validation_strategy)}'"
            )
        if not all(key in self.validation_strategy for key in required_keys):
            raise ValueError(
                f"Expected dict with keys: {' , '.join(required_keys)}")

    def _validate_verbose(self):
        """ Validates verbose parameter"""
        check_positive_integer(self.verbose, "verbose")

    def _validate_explain_level(self):
        """ Validates explain_level parameter"""
        if isinstance(self.explain_level,
                      str) and self.explain_level == "auto":
            return
        valid_explain_levels = [0, 1, 2]
        # Check that explain_level is one of the allowed integer values
        if not (isinstance(self.explain_level, int)
                and self.explain_level in valid_explain_levels):
            raise ValueError(
                f"Expected 'explain_level' to be {' or '.join([str(x) for x in valid_explain_levels])}, got '{self.explain_level}'"
            )

    def _validate_golden_features(self):
        """ Validates golden_features parameter"""
        if isinstance(self.golden_features,
                      str) and self.golden_features == "auto":
            return
        check_bool(self.golden_features, "golden_features")

    def _validate_feature_selection(self):
        """ Validates feature_selection parameter"""
        if isinstance(self.feature_selection,
                      str) and self.feature_selection == "auto":
            return
        check_bool(self.feature_selection, "feature_selection")

    def _validate_start_random_models(self):
        """ Validates start_random_models parameter"""
        if (isinstance(self.start_random_models, str)
                and self.start_random_models == "auto"):
            return
        check_greater_than_zero_integer(self.start_random_models,
                                        "start_random_models")

    def _validate_hill_climbing_steps(self):
        """ Validates hill_climbing_steps parameter"""
        if (isinstance(self.hill_climbing_steps, str)
                and self.hill_climbing_steps == "auto"):
            return
        check_positive_integer(self.hill_climbing_steps, "hill_climbing_steps")

    def _validate_top_models_to_improve(self):
        """ Validates top_models_to_improve parameter"""
        if (isinstance(self.top_models_to_improve, str)
                and self.top_models_to_improve == "auto"):
            return
        check_positive_integer(self.top_models_to_improve,
                               "top_models_to_improve")

    def _validate_random_state(self):
        """ Validates random_state parameter"""
        check_positive_integer(self.random_state, "random_state")

    def to_json(self):
        if self._best_model is None:
            return None

        return {
            "best_model": self._best_model.to_json(),
            "threshold": self._threshold,
            "ml_task": self._ml_task,
        }

    def from_json(self, json_data):

        if json_data["best_model"]["algorithm_short_name"] == "Ensemble":
            self._best_model = Ensemble()
            self._best_model.from_json(json_data["best_model"])
        else:
            self._best_model = ModelFramework(
                json_data["best_model"].get("params"))
            self._best_model.from_json(json_data["best_model"])
        self._threshold = json_data.get("threshold")

        self._ml_task = json_data.get("ml_task")
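
A minimal end-to-end sketch for a class derived from BaseAutoML, assuming the public AutoML estimator from the mljar-supervised package (BaseAutoML itself, as its docstring warns, is not used directly):

from sklearn.datasets import load_iris
from supervised import AutoML  # the derived, user-facing class

X, y = load_iris(return_X_y=True)
automl = AutoML(mode="Explain", total_time_limit=60)  # 60-second training budget
automl.fit(X, y)                    # drives the _fit pipeline shown above
print(automl.get_leaderboard())     # name, model_type, metric_type, metric_value, train_time
predictions = automl.predict(X)     # labels for classification, values for regression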
Example no. 3
class AutoML:
    """
    Automated Machine Learning for supervised tasks (binary classification, multiclass classification, regression).
    """
    def __init__(
        self,
        results_path=None,
        total_time_limit=60 * 60,
        model_time_limit=None,
        algorithms=[
            "Baseline",
            "Linear",
            "Decision Tree",
            "Random Forest",
            "Extra Trees",
            "LightGBM",
            "Xgboost",
            "CatBoost",
            "Neural Network",
            "Nearest Neighbors",
        ],
        tuning_mode="Normal",
        train_ensemble=True,
        stack=True,
        optimize_metric=None,
        validation={
            "validation_type": "kfold",
            "k_folds": 10,
            "shuffle": True,
            "stratify": True,
        },
        verbose=True,
        ml_task=None,
        explain_level=2,
        seed=1,
    ):
        """
        Create the AutoML object. Initialize directory for results.

        :param results_path: The path where all results will be saved.
        If left `None`, the directory name will be generated with the schema AutoML_{number},
        where number runs from 1 to 10000, depending on which directory name is available.

        If `results_path` points to a directory with existing AutoML results, then all models will be loaded.

        :param total_time_limit: The time limit in seconds for AutoML training. It is not used when `model_time_limit` is not `None`.

        :param model_time_limit: The time limit in seconds for training a single model.
        If `model_time_limit` is set, the `total_time_limit` is not respected.
        A single model can contain several learners; for example, with 10-fold cross-validation one model will have 10 learners.
        The time limit for a single learner is computed from `model_time_limit`.

        :param algorithms: The list of algorithms that will be used in the training.

        :param tuning_mode: The mode for tuning. It can be: `Normal`, `Sport`, `Insane`, `Perfect`. The names are kept the same as in the https://mljar.com application.

        Each mode describes how many models will be checked:

        - `Normal` - about 5-10 models of each algorithm will be trained,
        - `Sport` - about 10-15 models of each algorithm will be trained,
        - `Insane` - about 15-20 models of each algorithm will be trained,
        - `Perfect` - about 25-35 models of each algorithm will be trained.

        You can also set how many models will be trained with the `set_advanced` method.

        :param train_ensemble: If true, an ensemble will be created at the end of model training. (Default is `True`)

        :param stack: If true, stacked models will be created. The stack level is 1. (Default is `True`)

        :param optimize_metric: The metric to be optimized. (Not implemented yet, please leave it as `None`.)

        :param validation: The JSON with the validation type. Right now only cross-validation is supported.
        Example JSON parameters for validation:
        ```
        {"validation_type": "kfold", "k_folds": 5, "shuffle": True, "stratify": True, "random_seed": 123}
        ```
        :param verbose: Not implemented yet.
        :param ml_task: The machine learning task that will be solved. Can be: `"binary_classification", "multiclass_classification", "regression"`.
        If left `None`, AutoML will try to guess the task based on the target values.
        If there are only 2 unique values in the target, the task is set to `"binary_classification"`.
        If the number of unique values in the target is between 2 and 20 (inclusive), the task is set to `"multiclass_classification"`.
        In all other cases, the task is set to `"regression"`.

        :param explain_level: The level of explanations included with each model.
        `explain_level = 0` means no explanations;
        `explain_level = 1` means produce an importance plot (with the permutation method), tree plots for decision trees, and saved coefficients for linear models;
        `explain_level = 2` means the same as `1` plus SHAP explanations.
        :param seed: The seed for the random generator.
        
        """
        logger.debug("AutoML.__init__")

        # total_time_limit is the time budget for computing all models
        # model_time_limit is the time for computing a single model
        # if model_time_limit is None then its value is computed from total_time_limit
        # if both total_time_limit and model_time_limit are set, the total_time_limit constraint is omitted
        self._total_time_limit = total_time_limit
        self._model_time_limit = model_time_limit
        # time limit in seconds for a single learner (a model consists of learners)
        # the value is computed before fit; initialize with any number
        self._time_limit = 1

        self._train_ensemble = train_ensemble
        self._stack = stack
        self._models = [
        ]  # instances of iterative learner framework or ensemble

        # it is an instance of a model framework or an ensemble
        self._best_model = None
        self._validation = validation
        self.set_tuning_mode(tuning_mode)

        self._algorithms = algorithms
        self._verbose = verbose

        self._fit_time = None
        self._models_train_time = {}
        self._threshold, self._metrics_details, self._max_metrics, self._confusion_matrix = (
            None,
            None,
            None,
            None,
        )
        self._seed = seed
        self._user_set_optimize_metric = optimize_metric
        self._ml_task = ml_task

        self._X_train_path, self._y_train_path = None, None
        self._X_validation_path, self._y_validation_path = None, None

        self._data_info = None
        self._model_paths = []
        self._stacked_models = None
        self._explain_level = explain_level
        self._results_path = results_path
        self._fit_level = None
        self._time_spend = {}
        self._start_time = time.time()  # it will be updated in `fit` method

        if self._validation["validation_type"] != "kfold":
            # stacking is only available for k-fold validation
            self._stack = False

        # this should be last in the constructor;
        # if the results directory already exists, it may load models
        self._set_results_dir()

    def set_tuning_mode(self, mode="Normal"):
        if mode == "Sport":
            self._start_random_models = 10
            self._hill_climbing_steps = 2
            self._top_models_to_improve = 3
        elif mode == "Insane":
            self._start_random_models = 15
            self._hill_climbing_steps = 3
            self._top_models_to_improve = 4
        elif mode == "Perfect":
            self._start_random_models = 25
            self._hill_climbing_steps = 5
            self._top_models_to_improve = 5
        else:  # Normal
            self._start_random_models = 5
            self._hill_climbing_steps = 1
            self._top_models_to_improve = 2
        self._tuner_params = {
            "start_random_models": self._start_random_models,
            "hill_climbing_steps": self._hill_climbing_steps,
            "top_models_to_improve": self._top_models_to_improve,
        }
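    # Illustration: automl.set_tuning_mode("Insane") sets 15 initial random models
    # per algorithm, 3 hill-climbing steps, and considers the top 4 models of each
    # algorithm for improvement, as encoded above.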

    def set_advanced(self,
                     start_random_models=1,
                     hill_climbing_steps=0,
                     top_models_to_improve=0):
        """
        Advanced set of tuning parameters. 

        :param start_random_models: Number of not-so-random models to check for each algorithm.
        :param hill_climbing_steps: Number of hill climbing steps during tuning.
        :param top_models_to_improve: Number of top models (of each algorithm) which will be considered for improving in hill climbing steps.
        """
        self._start_random_models = start_random_models
        self._hill_climbing_steps = hill_climbing_steps
        self._top_models_to_improve = top_models_to_improve
        self._tuner_params = {
            "start_random_models": self._start_random_models,
            "hill_climbing_steps": self._hill_climbing_steps,
            "top_models_to_improve": self._top_models_to_improve,
        }
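        # Illustrative call (hypothetical values, not from the source) for a
        # quick, low-budget search:
        #     automl.set_advanced(start_random_models=3,
        #                         hill_climbing_steps=1,
        #                         top_models_to_improve=1)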

    def _set_results_dir(self):
        if self._results_path is None:
            found = False
            for i in range(1, 10001):
                self._results_path = f"AutoML_{i}"
                if not os.path.exists(self._results_path):
                    found = True
                    break
            if not found:
                raise AutoMLException(
                    "Cannot create directory for AutoML results")

        if os.path.exists(self._results_path) and os.path.exists(
                os.path.join(self._results_path, "params.json")):
            print(f"Directory {self._results_path} already exists")
            self.load()
        elif self._results_path is not None:

            if not os.path.exists(self._results_path):
                print(f"Create directory {self._results_path}")
                try:
                    os.mkdir(self._results_path)
                except Exception as e:
                    raise AutoMLException(
                        f"Cannot create directory {self._results_path}")
            elif os.path.exists(self._results_path) and len(
                    os.listdir(self._results_path)):
                raise AutoMLException(
                    f"Cannot set directory for AutoML. Directory {self._results_path} is not empty."
                )
        else:
            raise AutoMLException("Cannot set directory for AutoML results")

    def load(self):
        logger.info("Loading AutoML models ...")
        try:
            params = json.load(
                open(os.path.join(self._results_path, "params.json")))

            self._model_paths = params["saved"]
            self._ml_task = params["ml_task"]
            self._optimize_metric = params["optimize_metric"]
            stacked_models = params.get("stacked")

            models_map = {}
            for model_path in self._model_paths:
                if model_path.endswith("Ensemble") or model_path.endswith(
                        "Ensemble_Stacked"):
                    ens = Ensemble.load(model_path, models_map)
                    self._models += [ens]
                    models_map[ens.get_name()] = ens
                else:
                    m = ModelFramework.load(model_path)
                    self._models += [m]
                    models_map[m.get_name()] = m

            if stacked_models is not None:
                self._stacked_models = []
                for stacked_model_name in stacked_models:
                    self._stacked_models += [models_map[stacked_model_name]]

            best_model_name = None
            with open(os.path.join(self._results_path, "best_model.txt"),
                      "r") as fin:
                best_model_name = fin.read()

            self._best_model = models_map[best_model_name]

            data_info_path = os.path.join(self._results_path, "data_info.json")
            self._data_info = json.load(open(data_info_path))
        except Exception as e:
            raise AutoMLException(f"Cannot load AutoML directory. {str(e)}")

    def get_leaderboard(self):
        ldb = {
            "name": [],
            "model_type": [],
            "metric_type": [],
            "metric_value": [],
            "train_time": [],
        }
        for m in self._models:
            ldb["name"] += [m.get_name()]
            ldb["model_type"] += [m.get_type()]
            ldb["metric_type"] += [self._optimize_metric]
            ldb["metric_value"] += [m.get_final_loss()]
            ldb["train_time"] += [np.round(m.get_train_time(), 2)]
        return pd.DataFrame(ldb)
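        # The returned frame has one row per trained model, e.g. (illustrative
        # names and values):
        #     name        model_type  metric_type  metric_value  train_time
        #     1_Baseline  Baseline    logloss      0.693         0.1
        #     2_Xgboost   Xgboost     logloss      0.241         12.3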

    def keep_model(self, model):
        if model is None:
            return
        self._models += [model]
        self.verbose_print("{} final {} {} time {} seconds".format(
            model.get_name(),
            self._optimize_metric,
            model.get_final_loss(),
            np.round(model.get_train_time(), 2),
        ))
        self.log_train_time(model.get_type(), model.get_train_time())

    def _get_learner_time_limit(self, model_type):

        logger.debug(
            f"Fit level: {self._fit_level}, model type: {model_type}. " +
            f"Time spend: {json.dumps(self._time_spend, indent=4)}")

        if self._model_time_limit is not None:
            k = self._validation.get("k_folds", 1.0)
            return self._model_time_limit / k

        if self._fit_level == "simple_algorithms":
            return None
        if self._fit_level == "default_algorithms":
            return None

        tune_algorithms = [
            a for a in self._algorithms if a not in
            ["Baseline", "Linear", "Decision Tree", "Nearest Neighbors"]
        ]
        tune_algs_cnt = len(tune_algorithms)
        if tune_algs_cnt == 0:
            return None

        time_elapsed = time.time() - self._start_time
        time_left = self._total_time_limit - time_elapsed

        k_folds = self._validation.get("k_folds", 1.0)

        if self._fit_level == "not_so_random":
            tt = (self._total_time_limit -
                  self._time_spend["simple_algorithms"] -
                  self._time_spend["default_algorithms"])
            if self._stack:
                tt *= 0.6  # reserve approx. 40% of the remaining time for stacking
            tt /= 2.0  # leave some time for hill-climbing
            tt /= tune_algs_cnt  # give time equally for each algorithm
            tt /= k_folds  # time is per learner (per fold)
            return tt

        if self._fit_level == "hill_climbing":
            tt = (self._total_time_limit -
                  self._time_spend["simple_algorithms"] -
                  self._time_spend["default_algorithms"] -
                  self._time_spend["not_so_random"])
            if self._stack:
                tt *= 0.4  # reserve approx. 60% of the remaining time for stacking
            tt /= tune_algs_cnt  # give time equally for each algorithm
            tt /= k_folds  # time is per learner (per fold)
            return tt

        if self._stack and self._fit_level == "stack":
            tt = time_left
            tt /= tune_algs_cnt  # give time equally for each algorithm
            tt /= k_folds  # time is per learner (per fold)
            return tt
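        # Worked example with illustrative numbers: total_time_limit=3600s,
        # simple_algorithms took 60s, default_algorithms took 300s, stacking
        # enabled, 2 tunable algorithms, 5 folds. At the "not_so_random" level:
        #     tt = 3600 - 60 - 300 = 3240
        #     tt *= 0.6  -> 1944   (reserve ~40% for stacking)
        #     tt /= 2.0  -> 972    (reserve half for hill climbing)
        #     tt /= 2    -> 486    (per algorithm)
        #     tt /= 5    -> ~97s   (per learner, i.e. per fold)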

    def train_model(self, params):

        model_path = os.path.join(self._results_path, params["name"])
        early_stop = EarlyStopping({
            "metric": {
                "name": self._optimize_metric
            },
            "log_to_dir": model_path
        })

        learner_time_constraint = LearnerTimeConstraint({
            "learner_time_limit":
            self._get_learner_time_limit(
                params["learner"]["model_type"]),  # self._time_limit,
            "min_steps":
            params["additional"].get("min_steps"),
        })

        total_time_constraint = TotalTimeConstraint({
            "total_time_limit":
            self._total_time_limit if self._model_time_limit is None else None,
            "total_time_start":
            self._start_time,
        })

        mf = ModelFramework(
            params,
            callbacks=[
                early_stop, learner_time_constraint, total_time_constraint
            ],
        )

        if self._enough_time_to_train(mf.get_type()):

            # self.verbose_print(params["name"] + " training start ...")
            logger.info(
                f"Train model #{len(self._models)+1} / Model name: {params['name']}"
            )

            try:
                os.mkdir(model_path)
            except Exception as e:
                raise AutoMLException(f"Cannot create directory {model_path}")

            mf.train(model_path)

            mf.save(model_path)
            self._model_paths += [model_path]

            self.keep_model(mf)

            # save the best one in the case the training will be interrupted
            self.select_and_save_best()
        else:
            logger.info(
                f"Cannot train {mf.get_type()} because of time constraint")
        # self._progress_bar.update(1)

    def verbose_print(self, msg):
        if self._verbose:
            # self._progress_bar.write(msg)
            print(msg)

    def log_train_time(self, model_type, train_time):
        if model_type in self._models_train_time:
            self._models_train_time[model_type] += [train_time]
        else:
            self._models_train_time[model_type] = [train_time]

    def _enough_time_to_train(self, model_type):
        # if model_time_limit is set, train every model
        # do not apply total_time_limit
        if self._model_time_limit is not None:
            return True
        # no total time limit, just train, don't ask
        if self._total_time_limit is None:
            return True

        total_time_spend = time.time() - self._start_time
        # no time left, do not train more models, sorry ...
        time_left = self._total_time_limit - total_time_spend
        if time_left < 0:
            return False

        # there is still time and model_type was not tested yet
        # we should try it
        if time_left > 0 and model_type not in self._models_train_time:
            return True

        # check the fit level type
        # we don't want to spend too much time on one level

        if self._fit_level == "not_so_random":

            time_should_use = (self._total_time_limit -
                               self._time_spend["simple_algorithms"] -
                               self._time_spend["default_algorithms"])
            if self._stack:
                time_should_use *= 0.6  # leave time for stacking
            if self._hill_climbing_steps > 0:
                time_should_use /= 2.0  # leave time for hill-climbing

            if (total_time_spend >
                    time_should_use + self._time_spend["simple_algorithms"] +
                    self._time_spend["default_algorithms"]):
                return False

        ##################
        # hill climbing check

        if self._fit_level == "hill_climbing":

            time_should_use = (self._total_time_limit -
                               self._time_spend["simple_algorithms"] -
                               self._time_spend["default_algorithms"] -
                               self._time_spend["not_so_random"])
            if self._stack:
                time_should_use *= 0.4  # leave time for stacking

            if (total_time_spend >
                    time_should_use + self._time_spend["simple_algorithms"] +
                    self._time_spend["default_algorithms"] +
                    self._time_spend["not_so_random"]):
                return False

        model_total_time_spend = (0 if model_type
                                  not in self._models_train_time else np.sum(
                                      self._models_train_time[model_type]))
        model_mean_time_spend = (0 if model_type not in self._models_train_time
                                 else np.mean(
                                     self._models_train_time[model_type]))

        algo_cnt = float(len(self._algorithms))
        for a in ["Baseline", "Decision Tree", "Linear", "Nearest Neighbors"]:
            if a in self._algorithms:
                algo_cnt -= 1.0
        if algo_cnt < 1.0:
            algo_cnt = 1.0

        model_time_left = time_left / algo_cnt
        if model_mean_time_spend <= model_time_left:
            return True

        return False
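        # In short: after the per-level budget checks above, a model type keeps
        # receiving new candidates only while its mean training time fits into
        # an equal share (time_left / number of tunable algorithms) of the
        # remaining time budget.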

    def ensemble_step(self, is_stacked=False):
        if self._train_ensemble and len(self._models) > 1:
            self.ensemble = Ensemble(self._optimize_metric,
                                     self._ml_task,
                                     is_stacked=is_stacked)
            oofs, target = self.ensemble.get_oof_matrix(self._models)
            self.ensemble.fit(oofs, target)
            self.keep_model(self.ensemble)

            ensemble_path = os.path.join(
                self._results_path,
                "Ensemble_Stacked" if is_stacked else "Ensemble")
            try:
                os.mkdir(ensemble_path)
            except Exception as e:
                raise AutoMLException(
                    f"Cannot create directory {ensemble_path}")
            self.ensemble.save(ensemble_path)
            self._model_paths += [ensemble_path]
            # save the best one in the case the training will be interrupted
            self.select_and_save_best()

    def can_we_stack_them(self, y):
        # TODO: if multiclass with too many classes, this should return False;
        # for now stacking is always allowed
        return True

    def get_stacked_data(self, X, mode="training"):
        # mode can be `training` or `predict`
        if self._stacked_models is None:
            return X
        all_oofs = []
        for m in self._stacked_models:
            oof = None
            if mode == "training":
                oof = m.get_out_of_folds()
            else:
                oof = m.predict(X)
                if self._ml_task == BINARY_CLASSIFICATION:
                    cols = [f for f in oof.columns if "prediction" in f]
                    if len(cols) == 2:
                        oof = pd.DataFrame({"prediction": oof[cols[1]]})

            cols = [f for f in oof.columns if "prediction" in f]
            oof = oof[cols]
            oof.columns = [f"{m.get_name()}_{c}" for c in cols]
            all_oofs += [oof]

        org_index = X.index.copy()
        X.reset_index(drop=True, inplace=True)
        X_stacked = pd.concat(all_oofs + [X], axis=1)

        X_stacked.index = org_index.copy()
        X.index = org_index.copy()
        return X_stacked
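        # Resulting layout (illustrative): for stacked models M1 and M2 the
        # frame is [M1_prediction..., M2_prediction..., <original X columns>],
        # i.e. one out-of-fold (or predicted) column per prediction output,
        # prefixed with the model name and concatenated in front of X.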

    def stack_models(self):

        if self._stacked_models is not None:
            return

        ldb = self.get_leaderboard()
        ldb = ldb.sort_values(by="metric_value", ascending=True)

        models_map = {
            m.get_name(): m
            for m in self._models if not m._is_stacked
        }
        self._stacked_models = []
        models_limit = 10

        for model_type in np.unique(ldb.model_type):
            if model_type in ["Baseline"]:
                continue
            ds = ldb[ldb.model_type == model_type].copy()
            ds.sort_values(by="metric_value", inplace=True)

            for n in list(ds.name.iloc[:models_limit].values):
                self._stacked_models += [models_map[n]]

        scores = [m.get_final_loss() for m in self._stacked_models]
        self._stacked_models = [
            self._stacked_models[i] for i in np.argsort(scores).tolist()
        ]

    def stacked_ensemble_step(self):
        # print("Stacked models ....")
        # do we have enough models?
        if len(self._models) < 5:
            return
        # do we have time?
        if self._total_time_limit is not None:
            time_left = self._total_time_limit - (time.time() -
                                                  self._start_time)
            # we need at least 60 seconds to do anything
            if time_left < 60:
                return

        # read X directly from parquet
        X = pd.read_parquet(self._X_train_path)

        self.stack_models()

        org_columns = X.columns.tolist()
        X_stacked = self.get_stacked_data(X)
        new_columns = X_stacked.columns.tolist()
        added_columns = [c for c in new_columns if c not in org_columns]

        # save stacked data
        X_train_stacked_path = os.path.join(self._results_path,
                                            "X_train_stacked.parquet")
        X_stacked.to_parquet(X_train_stacked_path, index=False)

        # reuse old params
        for m in self._stacked_models:
            # print(m.get_type())
            # use only Xgboost, LightGBM and CatBoost as stacked models
            if m.get_type() not in ["Xgboost", "LightGBM", "CatBoost"]:
                continue

            params = copy.deepcopy(m.params)
            params["validation"]["X_train_path"] = X_train_stacked_path

            params["name"] = params["name"] + "_Stacked"
            params["is_stacked"] = True
            # print(params)

            if "model_architecture_json" in params["learner"]:
                # the new model will be created with a wider input size
                del params["learner"]["model_architecture_json"]

            if self._ml_task == REGRESSION:
                # scale added predictions in regression if the target was scaled (in the case of NN)
                target_preprocessing = params["preprocessing"][
                    "target_preprocessing"]
                scale = None
                if "scale_log_and_normal" in target_preprocessing:
                    scale = "scale_log_and_normal"
                elif "scale_normal" in target_preprocessing:
                    scale = "scale_normal"
                if scale is not None:
                    for col in added_columns:
                        params["preprocessing"]["columns_preprocessing"][
                            col] = [scale]

            self.train_model(params)

    def _set_ml_task(self, y):
        """ Set and validate the ML task.
        
        If the ML task is not set, it tries to guess the task based on the count of unique values in the target.
        Then it performs validation.
        """
        # if not set, guess
        if self._ml_task is None:
            target_unique_cnt = len(np.unique(y[~pd.isnull(y)]))
            if target_unique_cnt == 2:
                self._ml_task = BINARY_CLASSIFICATION
            elif target_unique_cnt <= 20:
                self._ml_task = MULTICLASS_CLASSIFICATION
            else:
                self._ml_task = REGRESSION
        # validation
        if self._ml_task not in AlgorithmsRegistry.get_supported_ml_tasks():
            raise Exception("Unknow Machine Learning task {}."
                            " Supported tasks are: {}".format(
                                self._ml_task,
                                AlgorithmsRegistry.get_supported_ml_tasks()))
        if self._ml_task == REGRESSION:
            if "stratify" in self._validation:
                del self._validation["stratify"]
        logger.info("AutoML task to be solved: {}".format(self._ml_task))
        print(f"AutoML task to be solved: { self._ml_task}")

    def _set_algorithms(self):
        """ Set and validate available algorithms.

        If algorithms are not set, all algorithms from registry are used.
        Then it performs validation of the algorithms.
        """
        if len(self._algorithms) == 0:
            self._algorithms = list(
                AlgorithmsRegistry.registry[self._ml_task].keys())

        for a in self._algorithms:
            if a not in list(
                    AlgorithmsRegistry.registry[self._ml_task].keys()):
                raise AutoMLException(
                    "The algorithm {} is not allowed to use for ML task: {}. Allowed algorithms: {}"
                    .format(
                        a,
                        self._ml_task,
                        list(
                            AlgorithmsRegistry.registry[self._ml_task].keys()),
                    ))
        logger.info("AutoML will use algorithms: {}".format(self._algorithms))
        print(f"AutoML will use algorithms: {self._algorithms}")

    def _set_metric(self):
        """ Set and validate the metric to be optimized. """
        if self._ml_task == BINARY_CLASSIFICATION:
            if self._user_set_optimize_metric is None:
                self._optimize_metric = "logloss"
            elif self._user_set_optimize_metric not in ["logloss", "auc"]:
                raise AutoMLException(
                    "Metric {} is not allowed in ML task: {}".format(
                        self._user_set_optimize_metric, self._ml_task))
            else:
                self._optimize_metric = self._user_set_optimize_metric
        elif self._ml_task == MULTICLASS_CLASSIFICATION:
            if self._user_set_optimize_metric is None:
                self._optimize_metric = "logloss"
            elif self._user_set_optimize_metric not in ["logloss"]:
                raise AutoMLException(
                    "Metric {} is not allowed in ML task: {}".format(
                        self._user_set_optimize_metric, self._ml_task))
            else:
                self._optimize_metric = self._user_set_optimize_metric
        elif self._ml_task == REGRESSION:
            if self._user_set_optimize_metric is None:
                self._optimize_metric = "rmse"
            elif self._user_set_optimize_metric not in ["rmse"]:
                raise AutoMLException(
                    "Metric {} is not allowed in ML task: {}".format(
                        self._user_set_optimize_metric, self._ml_task))
            else:
                self._optimize_metric = self._user_set_optimize_metric
        logger.info("AutoML will optimize for metric: {0}".format(
            self._optimize_metric))
        print(f"AutoML will optimize for metric: {self._optimize_metric}")

    def _check_imbalanced(self, y):
        v = y.value_counts()
        # at least 10 samples of each class
        ii = v < 10
        if np.sum(ii):
            raise AutoMLException(
                f"There need to be at least 10 samples of each class, for class {list(v[ii].index)} there is {v[ii].values} samples"
            )
        # at least 1% of all samples for each class
        v = y.value_counts(normalize=True) * 100.0
        ii = v < 1.0
        if np.sum(ii):
            raise AutoMLException(
                f"There need to be at least 1% of samples of each class, for class {list(v[ii].index)} there is {v[ii].values} % of samples"
            )
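        # Illustrative behaviour of the checks above: a target with class
        # counts {0: 950, 1: 8} fails the first check (class 1 has fewer than
        # 10 samples); counts {0: 9900, 1: 100} pass both checks, since class 1
        # is exactly 1% and the percentage test is strictly `< 1.0`.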

    def _initial_prep(self,
                      X_train,
                      y_train,
                      X_validation=None,
                      y_validation=None):

        if not isinstance(X_train, pd.DataFrame):
            X_train = pd.DataFrame(X_train)

        if not isinstance(X_train.columns[0], str):
            X_train.columns = [str(c) for c in X_train.columns]

        X_train.reset_index(drop=True, inplace=True)

        if isinstance(y_train, pd.DataFrame):
            if "target" not in y_train.columns:
                raise AutoMLException(
                    "y_train should be Numpy array, Pandas Series or DataFrame with column 'target' "
                )
            else:
                y_train = y_train["target"]
        y_train = pd.Series(np.array(y_train), name="target")

        X_train, y_train = ExcludeRowsMissingTarget.transform(X_train,
                                                              y_train,
                                                              warn=True)

        return X_train, y_train, X_validation, y_validation

    def _save_data(self,
                   X_train,
                   y_train,
                   X_validation=None,
                   y_validation=None):

        self._X_train_path = os.path.join(self._results_path,
                                          "X_train.parquet")
        self._y_train_path = os.path.join(self._results_path,
                                          "y_train.parquet")

        X_train.to_parquet(self._X_train_path, index=False)

        pd.DataFrame({
            "target": y_train
        }).to_parquet(self._y_train_path, index=False)

        self._validation["X_train_path"] = self._X_train_path
        self._validation["y_train_path"] = self._y_train_path
        self._validation["results_path"] = self._results_path

        columns_and_target_info = DataInfo.compute(X_train, y_train,
                                                   self._ml_task)

        self._data_info = {
            "columns": X_train.columns.tolist(),
            "rows": X_train.shape[0],
            "cols": X_train.shape[1],
            "target_is_numeric": pd.api.types.is_numeric_dtype(y_train),
            "columns_info": columns_and_target_info["columns_info"],
            "target_info": columns_and_target_info["target_info"],
        }
        if columns_and_target_info.get("num_class") is not None:
            self._data_info["num_class"] = columns_and_target_info["num_class"]
        data_info_path = os.path.join(self._results_path, "data_info.json")
        with open(data_info_path, "w") as fout:
            fout.write(json.dumps(self._data_info, indent=4))

        self._drop_data_variables(X_train)

    def _drop_data_variables(self, X_train):

        X_train.drop(X_train.columns, axis=1, inplace=True)

    def _load_data_variables(self, X_train):
        if X_train.shape[1] == 0:
            X = pd.read_parquet(self._X_train_path)
            for c in X.columns:
                X_train.insert(loc=X_train.shape[1], column=c, value=X[c])

        os.remove(self._X_train_path)
        os.remove(self._y_train_path)

    def fit(self, X_train, y_train, X_validation=None, y_validation=None):
        """
        Fit AutoML
        
        :param X_train: Pandas DataFrame with training data.
        :param y_train: Numpy Array with target training data.
        
        :param X_validation: Pandas DataFrame with validation data. (Not implemented yet)
        :param y_validation: Numpy Array with target of validation data. (Not implemented yet)
        
        """
        try:

            if self._best_model is not None:
                print(
                    "Best model is already set, no need to run fit. Skipping ..."
                )
                return

            self._start_time = time.time()

            if not isinstance(X_train, pd.DataFrame):
                raise AutoMLException(
                    "AutoML needs X_train matrix to be a Pandas DataFrame")

            self._set_ml_task(y_train)

            if X_train is not None:
                X_train = X_train.copy(deep=False)

            X_train, y_train, X_validation, y_validation = self._initial_prep(
                X_train, y_train, X_validation, y_validation)
            self._save_data(X_train, y_train, X_validation, y_validation)

            self._set_algorithms()
            self._set_metric()
            # self._estimate_training_times()

            if self._ml_task in [
                    BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION
            ]:
                self._check_imbalanced(y_train)

            tuner = MljarTuner(
                self._tuner_params,
                self._algorithms,
                self._ml_task,
                self._validation,
                self._explain_level,
                self._data_info,
                self._seed,
            )
            self.tuner = tuner
            self._time_spend = {}
            self._time_start = {}

            # 1. Check simple algorithms
            self._fit_level = "simple_algorithms"
            start = time.time()
            self._time_start[self._fit_level] = start
            for params in tuner.simple_algorithms_params():
                self.train_model(params)
            self._time_spend["simple_algorithms"] = np.round(
                time.time() - start, 2)

            # 2. Default parameters
            self._fit_level = "default_algorithms"
            start = time.time()
            self._time_start[self._fit_level] = start
            for params in tuner.default_params(len(self._models)):
                self.train_model(params)
            self._time_spend["default_algorithms"] = np.round(
                time.time() - start, 2)

            # 3. The not-so-random step
            self._fit_level = "not_so_random"
            start = time.time()
            self._time_start[self._fit_level] = start
            generated_params = tuner.get_not_so_random_params(len(
                self._models))
            for params in generated_params:
                self.train_model(params)
            self._time_spend["not_so_random"] = np.round(
                time.time() - start, 2)

            # 4. The hill-climbing step
            self._fit_level = "hill_climbing"
            start = time.time()
            self._time_start[self._fit_level] = start
            for params in tuner.get_hill_climbing_params(self._models):
                self.train_model(params)
            self._time_spend["hill_climbing"] = np.round(
                time.time() - start, 2)

            # 5. Ensemble unstacked models
            self._fit_level = "ensemble_unstacked"
            start = time.time()
            self._time_start[self._fit_level] = start
            self.ensemble_step()
            self._time_spend["ensemble_unstacked"] = np.round(
                time.time() - start, 2)

            if self._stack:
                # 6. Stack best models
                self._fit_level = "stack"
                start = time.time()
                self._time_start[self._fit_level] = start
                self.stacked_ensemble_step()
                self._time_spend["stack"] = np.round(time.time() - start, 2)

                # 7. Ensemble all models (original and stacked)
                any_stacked = False
                for m in self._models:
                    if m._is_stacked:
                        any_stacked = True
                        break
                if any_stacked:
                    self._fit_level = "ensemble_all"
                    start = time.time()
                    self.ensemble_step(is_stacked=True)
                    self._time_spend["ensemble_all"] = np.round(
                        time.time() - start, 2)

            self._fit_time = time.time() - self._start_time

            logger.info(f"AutoML fit time: {self._fit_time}")

        except Exception as e:
            raise e
        finally:
            if self._X_train_path is not None:
                self._load_data_variables(X_train)

    def select_and_save_best(self):
        min_loss = 10e14
        for m in self._models:
            if m.get_final_loss() < min_loss:
                self._best_model = m
                min_loss = m.get_final_loss()

        with open(os.path.join(self._results_path, "best_model.txt"),
                  "w") as fout:
            fout.write(f"{self._best_model.get_name()}")

        with open(os.path.join(self._results_path, "params.json"),
                  "w") as fout:
            params = {
                "ml_task": self._ml_task,
                "optimize_metric": self._optimize_metric,
                "saved": self._model_paths,
            }
            if self._stacked_models is not None:
                params["stacked"] = [
                    m.get_name() for m in self._stacked_models
                ]
            fout.write(json.dumps(params, indent=4))

        ldb = self.get_leaderboard()
        ldb.to_csv(os.path.join(self._results_path, "leaderboard.csv"),
                   index=False)

        # save report
        ldb["Link"] = [
            f"[Results link]({m}/README.md)" for m in ldb["name"].values
        ]
        ldb.insert(loc=0, column="Best model", value="")
        ldb.loc[ldb.name == self._best_model.get_name(),
                "Best model"] = "**the best**"

        with open(os.path.join(self._results_path, "README.md"), "w") as fout:
            fout.write(f"# AutoML Leaderboard\n\n")
            fout.write(tabulate(ldb.values, ldb.columns, tablefmt="pipe"))
            LeaderboardPlots.compute(ldb, self._results_path, fout)

    def predict(self, X):
        """
        Computes predictions from AutoML best model.

        :param X: The Pandas DataFrame with input data. The input data should have the same columns as data used for training, otherwise the `AutoMLException` will be raised.
        """
        if self._best_model is None:
            return None

        if not isinstance(X.columns[0], str):
            X.columns = [str(c) for c in X.columns]

        input_columns = X.columns.tolist()
        for column in self._data_info["columns"]:
            if column not in input_columns:
                raise AutoMLException(
                    f"Missing column: {column} in input data. Cannot predict")
        X = X[self._data_info["columns"]]

        # is stacked model
        if self._best_model._is_stacked:
            self.stack_models()
            X_stacked = self.get_stacked_data(X, mode="predict")

            if self._best_model.get_type() == "Ensemble":
                # Ensemble is using both original and stacked data
                predictions = self._best_model.predict(X, X_stacked)
            else:
                predictions = self._best_model.predict(X_stacked)
        else:
            predictions = self._best_model.predict(X)

        if self._ml_task == BINARY_CLASSIFICATION:
            # need to predict the label based on predictions and threshold
            neg_label, pos_label = (
                predictions.columns[0][11:],
                predictions.columns[1][11:],
            )
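            # column names have the form "prediction_<label>"; slicing at
            # [11:] strips the 11-character "prediction_" prefix to recover
            # the raw class labels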

            if neg_label == "0" and pos_label == "1":
                neg_label, pos_label = 0, 1
            target_is_numeric = self._data_info.get("target_is_numeric", False)
            if target_is_numeric:
                neg_label = int(neg_label)
                pos_label = int(pos_label)
            # assume that it is binary classification
            predictions[
                "label"] = predictions.iloc[:, 1] > self._best_model._threshold
            predictions["label"] = predictions["label"].map({
                True: pos_label,
                False: neg_label
            })
            return predictions
        elif self._ml_task == MULTICLASS_CLASSIFICATION:
            target_is_numeric = self._data_info.get("target_is_numeric", False)
            if target_is_numeric:
                predictions["label"] = predictions["label"].astype(int)
            return predictions
        else:
            return predictions

    def to_json(self):
        if self._best_model is None:
            return None

        return {
            "best_model": self._best_model.to_json(),
            "threshold": self._threshold,
            "ml_task": self._ml_task,
        }

    def from_json(self, json_data):

        if json_data["best_model"]["algorithm_short_name"] == "Ensemble":
            self._best_model = Ensemble()
            self._best_model.from_json(json_data["best_model"])
        else:
            self._best_model = ModelFramework(
                json_data["best_model"].get("params"))
            self._best_model.from_json(json_data["best_model"])
        self._threshold = json_data.get("threshold")

        self._ml_task = json_data.get("ml_task")
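# Usage sketch (illustrative; not part of the original source). Assumes the
# class above is exposed as `AutoML` (as in the example below) and that
# scikit-learn is installed to supply demo data. Binary classification with
# the default 5-fold validation:
if __name__ == "__main__":
    import pandas as pd
    from sklearn.datasets import load_breast_cancer

    data = load_breast_cancer()
    X = pd.DataFrame(data.data, columns=[str(c) for c in data.feature_names])

    automl = AutoML(total_time_limit=300)
    automl.fit(X, data.target)  # X must be a pandas DataFrame
    predictions = automl.predict(X)
    print(automl.get_leaderboard())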
Example No. 4
class AutoML:
    """
    Automated Machine Learning for supervised tasks (binary classification, multiclass classification, regression).
    """

    def __init__(
        self,
        results_path=None,
        total_time_limit=60 * 60,
        model_time_limit=None,
        algorithms=["Random Forest", "Xgboost"],
        tuning_mode="Sport",
        train_ensemble=True,
        optimize_metric=None,
        validation={"validation_type": "kfold", "k_folds": 5, "shuffle": True},
        verbose=True,
        ml_task=None,
        seed=1,
    ):
        """
        Create the AutoML object. Initialize directory for results.

        :param results_path: The path where all results will be saved.
        If left `None`, the directory name will be generated with the schema AutoML_{number},
        where the number ranges from 1 to 100, depending on which directory name is available.

        If `results_path` points to a directory with existing AutoML results, all models will be loaded.
        
        :param total_time_limit: The time limit in seconds for AutoML training. It is not used when `model_time_limit` is not `None`.
        
        :param model_time_limit: The time limit in seconds for training a single model.
        If `model_time_limit` is set, the `total_time_limit` is not respected.
        A single model can contain several learners; for example, with 10-fold cross-validation one model will have 10 learners.
        Based on `model_time_limit`, the time limit for a single learner is computed.
        
        :param algorithms: The list of algorithms that will be used in the training.
        
        :param tuning_mode: The mode for tuning. It can be: `Normal`, `Sport`, `Insane`, `Perfect`. The names are kept the same as in https://mljar.com application.
        
        Each mode determines how many models will be checked:
        
        - `Normal` - about 5-10 models of each algorithm will be trained,
        - `Sport` - about 10-15 models of each algorithm will be trained,
        - `Insane` - about 15-20 models of each algorithm will be trained,
        - `Perfect` - about 25-35 models of each algorithm will be trained.
        
        You can also set how many models will be trained with `set_advanced` method.
        
        :param train_ensemble: If true, an ensemble will be created at the end of model training.
        
        :param optimize_metric: The metric to be optimized. (Not implemented yet, please leave it as `None`.)
        
        :param validation: The JSON with validation type. Right now only Cross-Validation is supported. 
        The example JSON parameters for validation:
        ```
        {"validation_type": "kfold", "k_folds": 5, "shuffle": True, "stratify": True, "random_seed": 123}
        ```
        :param verbose: Not implemented yet.
        :param ml_task: The machine learning task that will be solved. Can be: `"binary_classification"`, `"multiclass_classification"`, `"regression"`.
        If left `None`, AutoML will try to guess the task based on target values.
        If there are only 2 unique values in the target, the task is set to `"binary_classification"`.
        If the number of unique values in the target is between 2 and 20 (inclusive), the task is set to `"multiclass_classification"`.
        In all other cases, the task is set to `"regression"`.
        
        :param seed: The seed for random generator.
        
        """
        logger.debug("AutoML.__init__")

        # total_time_limit is the time for computing for all models
        # model_time_limit is the time for computing a single model
        # if model_time_limit is None then its value is computed from total_time_limit
        # if total_time_limit is set and model_time_limit is set, then total_time_limit constraint will be omitted
        self._total_time_limit = total_time_limit
        self._model_time_limit = model_time_limit
        # time limit in seconds for a single learner (a model consists of learners)
        # the value is computed before fit; initialize with any number
        self._time_limit = 1

        self._train_ensemble = train_ensemble
        self._models = []  # instances of iterative learner framework or ensemble

        # an instance of a model framework or an ensemble
        self._best_model = None
        self._validation = validation
        self.set_tuning_mode("Sport")

        self._algorithms = algorithms
        self._verbose = verbose

        self._fit_time = None
        self._models_train_time = {}
        self._threshold, self._metrics_details, self._max_metrics, self._confusion_matrix = (
            None,
            None,
            None,
            None,
        )
        self._seed = seed
        self._user_set_optimize_metric = optimize_metric
        self._ml_task = ml_task
        self._tuner_params = {
            "start_random_models": self._start_random_models,
            "hill_climbing_steps": self._hill_climbing_steps,
            "top_models_to_improve": self._top_models_to_improve,
        }

        self._X_train_path, self._y_train_path = None, None
        self._X_validation_path, self._y_validation_path = None, None

        self._data_info = None
        self._model_paths = []

        self._results_path = results_path
        self._set_results_dir()

    def set_tuning_mode(self, mode="Normal"):
        if mode == "Sport":
            self._start_random_models = 10
            self._hill_climbing_steps = 2
            self._top_models_to_improve = 3
        if mode == "Insane":
            self._start_random_models = 15
            self._hill_climbing_steps = 3
            self._top_models_to_improve = 4
        if mode == "Perfect":
            self._start_random_models = 25
            self._hill_climbing_steps = 5
            self._top_models_to_improve = 5
        else:  # Normal
            self._start_random_models = 5
            self._hill_climbing_steps = 1
            self._top_models_to_improve = 2

    def set_advanced(
        self, start_random_models=1, hill_climbing_steps=0, top_models_to_improve=0
    ):
        """
        Advanced set of tuning parameters. 

        :param start_random_models: Number of not-so-random models to check for each algorithm.
        :param hill_climbing_steps: Number of hill climbing steps during tuning.
        :param top_models_to_improve: Number of top models (of each algorithm) which will be considered for improving in hill climbing steps.
        """
        self._start_random_models = start_random_models
        self._hill_climbing_steps = hill_climbing_steps
        self._top_models_to_improve = top_models_to_improve

    def _set_results_dir(self):
        if self._results_path is None:
            found = False
            for i in range(1, 101):
                self._results_path = f"AutoML_{i}"
                if not os.path.exists(self._results_path):
                    found = True
                    break
            if not found:
                raise AutoMLException("Cannot create directory for AutoML results")

        if os.path.exists(self._results_path) and os.path.exists(
            os.path.join(self._results_path, "params.json")
        ):
            print(f"Directory {self._results_path} already exists")
            self.load()
        elif self._results_path is not None:

            if not os.path.exists(self._results_path):
                print(f"Create directory {self._results_path}")
                try:
                    os.mkdir(self._results_path)
                except Exception as e:
                    raise AutoMLException(
                        f"Cannot create directory {self._results_path}"
                    )
            elif os.path.exists(self._results_path) and len(
                os.listdir(self._results_path)
            ):
                raise AutoMLException(
                    f"Cannot set directory for AutoML. Directory {self._results_path} is not empty."
                )
        else:
            raise AutoMLException("Cannot set directory for AutoML results")

    def load(self):
        logger.info("Loading AutoML models ...")
        try:
            params = json.load(open(os.path.join(self._results_path, "params.json")))

            self._model_paths = params["saved"]
            self._ml_task = params["ml_task"]
            self._optimize_metric = params["optimize_metric"]

            models_map = {}
            for model_path in self._model_paths:
                if model_path.endswith("ensemble"):
                    ens = Ensemble.load(model_path, models_map)
                    models_map[ens.get_name()] = ens
                else:
                    m = ModelFramework.load(model_path)
                    self._models += [m]
                    models_map[m.get_name()] = m

            best_model_name = None
            with open(os.path.join(self._results_path, "best_model.txt"), "r") as fin:
                best_model_name = fin.read()

            self._best_model = models_map[best_model_name]

            data_info_path = os.path.join(self._results_path, "data_info.json")
            self._data_info = json.load(open(data_info_path))
        except Exception as e:
            raise AutoMLException(f"Cannot load AutoML directory. {str(e)}")

    def _estimate_training_times(self):
        # single models including models in the folds
        self._estimated_models_to_check = (
            len(self._algorithms) * self._start_random_models
            + self._top_models_to_improve * self._hill_climbing_steps * 2
        )
        if self._model_time_limit is not None:
            k = self._validation.get("k_folds", 1.0)
            self._time_limit = self._model_time_limit / k
        elif self._total_time_limit is not None:
            # set time limit for single model training
            # the 0.85 is a safety scale factor, so the time limit is not exceeded
            # scaling is added because the number of models to be trained is an estimate
            k = self._validation.get("k_folds", 1.0)
            self._time_limit = (
                self._total_time_limit * 0.85 / self._estimated_models_to_check / k
            )
        print(
            f"AutoML will try to check about {int(self._estimated_models_to_check)} models"
        )
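        # Worked example (illustrative): 2 algorithms in "Sport" mode
        # (start_random_models=10, hill_climbing_steps=2, top_models_to_improve=3)
        # gives 2*10 + 3*2*2 = 32 estimated models; with total_time_limit=3600
        # and 5 folds the per-learner limit is 3600 * 0.85 / 32 / 5 ~= 19s.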

    def get_leaderboard(self):
        ldb = {
            "name": [],
            "model_type": [],
            "metric_type": [],
            "metric_value": [],
            "train_time": [],
        }
        for m in self._models:
            ldb["name"] += [m.get_name()]
            ldb["model_type"] += [m.get_type()]
            ldb["metric_type"] += [self._optimize_metric]
            ldb["metric_value"] += [m.get_final_loss()]
            ldb["train_time"] += [np.round(m.get_train_time(), 2)]
        return pd.DataFrame(ldb)

    def get_additional_metrics(self):

        additional_metrics = self._best_model.get_additional_metrics()
        # AdditionalMetrics.compute(
        #    oof_predictions[target_cols],
        #    oof_predictions[prediction_cols],
        #    self._ml_task,
        # )
        if self._ml_task == BINARY_CLASSIFICATION:

            self._metrics_details = additional_metrics["metric_details"]
            self._max_metrics = additional_metrics["max_metrics"]
            self._confusion_matrix = additional_metrics["confusion_matrix"]
            self._threshold = additional_metrics["threshold"]
            logger.info(
                "Metric details:\n{}\n\nConfusion matrix:\n{}".format(
                    self._max_metrics.transpose(), self._confusion_matrix
                )
            )
            with open(
                os.path.join(self._results_path, "best_model_metrics.txt"), "w"
            ) as fout:
                fout.write(
                    "Metric details:\n{}\n\nConfusion matrix:\n{}".format(
                        self._max_metrics.transpose(), self._confusion_matrix
                    )
                )

        elif self._ml_task == MULTICLASS_CLASSIFICATION:

            max_metrics = additional_metrics["max_metrics"]
            confusion_matrix = additional_metrics["confusion_matrix"]

            logger.info(
                "Metric details:\n{}\nConfusion matrix:\n{}".format(
                    max_metrics, confusion_matrix
                )
            )
            with open(
                os.path.join(self._results_path, "best_model_metrics.txt"), "w"
            ) as fout:
                fout.write("Metric details:\n{}\n\n".format(max_metrics.transpose()))
                fout.write("Confusion matrix:\n{}".format(confusion_matrix))

    def keep_model(self, model):
        if model is None:
            return
        self._models += [model]
        self.verbose_print(
            "{} final {} {} time {} seconds".format(
                model.get_type(),
                self._optimize_metric,
                model.get_final_loss(),
                np.round(model.get_train_time(), 2),
            )
        )
        self.log_train_time(model.get_type(), model.get_train_time())

    def train_model(self, params):

        model_path = os.path.join(self._results_path, params["name"])

        early_stop = EarlyStopping(
            {"metric": {"name": self._optimize_metric}, "log_to_dir": model_path}
        )
        time_constraint = TimeConstraint({"train_seconds_time_limit": self._time_limit})
        mf = ModelFramework(params, callbacks=[early_stop, time_constraint])

        if self._enough_time_to_train(mf.get_type()):

            logger.info(
                f"Train model #{len(self._models)+1} / Model name: {params['name']}"
            )

            try:
                os.mkdir(model_path)
            except Exception as e:
                raise AutoMLException(f"Cannot create directory {model_path}")

            mf.train()

            mf.save(model_path)
            self._model_paths += [model_path]

            self.keep_model(mf)

        else:
            logger.info(
                f"Cannot check more models of {mf.get_type()} because of time constraint"
            )
        # self._progress_bar.update(1)

    def verbose_print(self, msg):
        if self._verbose:
            # self._progress_bar.write(msg)
            print(msg)

    def log_train_time(self, model_type, train_time):
        if model_type in self._models_train_time:
            self._models_train_time[model_type] += [train_time]
        else:
            self._models_train_time[model_type] = [train_time]

    def _enough_time_to_train(self, model_type):
        # if model_time_limit is set, train every model
        # do not apply total_time_limit
        if self._model_time_limit is not None:
            return True
        # no total time limit, just train, don't ask
        if self._total_time_limit is None:
            return True

        total_time_already_spend = (
            0
            if model_type not in self._models_train_time
            else np.sum(self._models_train_time[model_type])
        )
        mean_time_already_spend = (
            0
            if model_type not in self._models_train_time
            else np.mean(self._models_train_time[model_type])
        )

        if (
            total_time_already_spend + mean_time_already_spend
            < 0.85 * self._total_time_limit / float(len(self._algorithms))
        ):
            return True
        return False
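        # e.g. (illustrative) with total_time_limit=3600 and 2 algorithms, each
        # model type gets a budget of 0.85 * 3600 / 2 = 1530s; a new model of
        # that type is trained while (total time already spent on the type +
        # its mean training time) stays below that budget.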

    def ensemble_step(self):
        if self._train_ensemble:
            self.ensemble = Ensemble(self._optimize_metric, self._ml_task)
            oofs, target = self.ensemble.get_oof_matrix(self._models)
            self.ensemble.fit(oofs, target)
            self.keep_model(self.ensemble)

            ensemble_path = os.path.join(self._results_path, "ensemble")
            try:
                os.mkdir(ensemble_path)
            except Exception as e:
                raise AutoMLException(f"Cannot create directory {ensemble_path}")
            self.ensemble.save(ensemble_path)
            self._model_paths += [ensemble_path]

    def _set_ml_task(self, y):
        """ Set and validate the ML task.
        
        If the ML task is not set, it tries to guess the task based on the count of unique values in the target.
        Then it performs validation.
        """
        # if not set, guess
        if self._ml_task is None:
            target_unique_cnt = len(np.unique(y[~pd.isnull(y)]))
            if target_unique_cnt == 2:
                self._ml_task = BINARY_CLASSIFICATION
            elif target_unique_cnt <= 20:
                self._ml_task = MULTICLASS_CLASSIFICATION
            else:
                self._ml_task = REGRESSION
        # validation
        if self._ml_task not in AlgorithmsRegistry.get_supported_ml_tasks():
            raise Exception(
                "Unknow Machine Learning task {}."
                " Supported tasks are: {}".format(
                    self._ml_task, AlgorithmsRegistry.get_supported_ml_tasks()
                )
            )
        logger.info("AutoML task to be solved: {}".format(self._ml_task))
        print(f"AutoML task to be solved: { self._ml_task}")

    def _set_algorithms(self):
        """ Set and validate available algorithms.

        If algorithms are not set, all algorithms from registry are used.
        Then it performs validation of the algorithms.
        """
        if len(self._algorithms) == 0:
            self._algorithms = list(AlgorithmsRegistry.registry[self._ml_task].keys())

        for a in self._algorithms:
            if a not in list(AlgorithmsRegistry.registry[self._ml_task].keys()):
                raise AutoMLException(
                    "The algorithm {} is not allowed to use for ML task: {}. Allowed algorithms: {}".format(
                        a,
                        self._ml_task,
                        list(AlgorithmsRegistry.registry[self._ml_task].keys()),
                    )
                )
        logger.info("AutoML will use algorithms: {}".format(self._algorithms))
        print(f"AutoML will use algorithms: {self._algorithms}")

    def _set_metric(self):
        """ Set and validate the metric to be optimized. """
        if self._ml_task == BINARY_CLASSIFICATION:
            if self._user_set_optimize_metric is None:
                self._optimize_metric = "logloss"
            elif self._user_set_optimize_metric not in ["logloss", "auc"]:
                raise AutoMLException(
                    "Metric {} is not allowed in ML task: {}".format(
                        self._user_set_optimize_metric, self._ml_task
                    )
                )
            else:
                self._optimize_metric = self._user_set_optimize_metric
        elif self._ml_task == MULTICLASS_CLASSIFICATION:
            if self._user_set_optimize_metric is None:
                self._optimize_metric = "logloss"
            elif self._user_set_optimize_metric not in ["logloss"]:
                raise AutoMLException(
                    "Metric {} is not allowed in ML task: {}".format(
                        self._user_set_optimize_metric, self._ml_task
                    )
                )
            else:
                self._optimize_metric = self._user_set_optimize_metric
        elif self._ml_task == REGRESSION:
            if self._user_set_optimize_metric is None:
                self._optimize_metric = "mse"
            elif self._user_set_optimize_metric not in ["mse"]:
                raise AutoMLException(
                    "Metric {} is not allowed in ML task: {}".format(
                        self._user_set_optimize_metric, self._ml_task
                    )
                )
            else:
                self._optimize_metric = self._user_set_optimize_metric
        logger.info(
            "AutoML will optimize for metric: {0}".format(self._optimize_metric)
        )
        print(f"AutoML will optimize for metric: {self._optimize_metric}")

    def _check_imbalanced(self, y):
        v = y.value_counts()
        # at least 10 samples of each class
        ii = v < 10
        if np.sum(ii):
            raise AutoMLException(
                f"There need to be at least 10 samples of each class, for class {list(v[ii].index)} there is {v[ii].values} samples"
            )
        # at least 1% of all samples for each class
        v = y.value_counts(normalize=True) * 100.0
        ii = v < 1.0
        if np.sum(ii):
            raise AutoMLException(
                f"There need to be at least 1% of samples of each class, for class {list(v[ii].index)} there is {v[ii].values} % of samples"
            )

    def _initial_prep(self, X_train, y_train, X_validation=None, y_validation=None):

        if not isinstance(X_train, pd.DataFrame):
            X_train = pd.DataFrame(X_train)

        if not isinstance(X_train.columns[0], str):
            X_train.columns = [str(c) for c in X_train.columns]

        X_train.reset_index(drop=True, inplace=True)

        y_train = pd.Series(np.array(y_train), name="target")

        X_train, y_train = ExcludeRowsMissingTarget.transform(
            X_train, y_train, warn=True
        )

        return X_train, y_train, X_validation, y_validation

    def _save_data(self, X_train, y_train, X_validation=None, y_validation=None):

        self._X_train_path = os.path.join(self._results_path, "X_train.parquet")
        self._y_train_path = os.path.join(self._results_path, "y_train.parquet")

        X_train.to_parquet(self._X_train_path, index=False)

        pd.DataFrame({"target": y_train}).to_parquet(self._y_train_path, index=False)

        self._validation["X_train_path"] = self._X_train_path
        self._validation["y_train_path"] = self._y_train_path
        self._validation["results_path"] = self._results_path

        self._data_info = {
            "columns": X_train.columns.tolist(),
            "rows": X_train.shape[0],
            "cols": X_train.shape[1],
            "target_is_numeric": pd.api.types.is_numeric_dtype(y_train),
        }
        data_info_path = os.path.join(self._results_path, "data_info.json")
        with open(data_info_path, "w") as fout:
            fout.write(json.dumps(self._data_info, indent=4))

    def _del_data_variables(self, X_train, y_train):

        X_train.drop(X_train.columns, axis=1, inplace=True)

    def _load_data_variables(self, X_train):
        X = pd.read_parquet(self._X_train_path)

        for c in X.columns:
            X_train.insert(loc=X_train.shape[1], column=c, value=X[c])

        os.remove(self._X_train_path)
        os.remove(self._y_train_path)

    def fit(self, X_train, y_train, X_validation=None, y_validation=None):
        """
        Fit AutoML
        
        :param X_train: Pandas DataFrame with training data.
        :param y_train: Numpy Array with target training data.
        
        :param X_validation: Pandas DataFrame with validation data. (Not implemented yet)
        :param y_validation: Numpy Array with target of validation data. (Not implemented yet)
        
        """
        try:
            if self._best_model is not None:
                print("Best model is already set, no need to run fit. Skipping ...")
                return

            start_time = time.time()
            if not isinstance(X_train, pd.DataFrame):
                raise AutoMLException(
                    "AutoML needs X_train matrix to be a Pandas DataFrame"
                )

            if X_train is not None:
                X_train = X_train.copy(deep=False)

            X_train, y_train, X_validation, y_validation = self._initial_prep(
                X_train, y_train, X_validation, y_validation
            )
            self._save_data(X_train, y_train, X_validation, y_validation)

            self._set_ml_task(y_train)
            self._set_algorithms()
            self._set_metric()
            self._estimate_training_times()

            if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]:
                self._check_imbalanced(y_train)

            tuner = MljarTuner(
                self._tuner_params,
                self._algorithms,
                self._ml_task,
                self._validation,
                self._seed,
            )

            # not so random step
            generated_params = tuner.get_not_so_random_params(X_train, y_train)
            self._del_data_variables(X_train, y_train)

            for params in generated_params:
                self.train_model(params)
            # hill climbing
            for params in tuner.get_hill_climbing_params(self._models):
                self.train_model(params)

            self.ensemble_step()

            min_loss = float("inf")
            for m in self._models:
                if m.get_final_loss() < min_loss:
                    self._best_model = m
                    min_loss = m.get_final_loss()

            self.get_additional_metrics()
            self._fit_time = time.time() - start_time
            # self._progress_bar.close()

            with open(os.path.join(self._results_path, "best_model.txt"), "w") as fout:
                fout.write(f"{self._best_model.get_name()}")

            with open(os.path.join(self._results_path, "params.json"), "w") as fout:
                params = {
                    "ml_task": self._ml_task,
                    "optimize_metric": self._optimize_metric,
                    "saved": self._model_paths,
                }
                fout.write(json.dumps(params, indent=4))

            ldb = self.get_leaderboard()
            ldb.to_csv(os.path.join(self._results_path, "leaderboard.csv"), index=False)

            # save report
            ldb["Link"] = [f"[Results link]({m}/README.md)" for m in ldb["name"].values]
            ldb.insert(loc=0, column="Best model", value="")
            ldb.loc[
                ldb.name == self._best_model.get_name(), "Best model"
            ] = "*** the best ***"
            with open(os.path.join(self._results_path, "README.md"), "w") as fout:
                fout.write(f"# AutoML Leaderboard\n\n")
                fout.write(tabulate(ldb.values, ldb.columns, tablefmt="pipe"))
        finally:
            if self._X_train_path is not None:
                self._load_data_variables(X_train)

    def predict(self, X):
        """
        Computes predictions from AutoML best model.

        :param X: The Pandas DataFrame with input data. The input data should have the same columns as data used for training, otherwise the `AutoMLException` will be raised.
        """
        if self._best_model is None:
            return None

        if not isinstance(X.columns[0], str):
            X.columns = [str(c) for c in X.columns]

        input_columns = X.columns.tolist()
        for column in self._data_info["columns"]:
            if column not in input_columns:
                raise AutoMLException(
                    f"Missing column: {column} in input data. Cannot predict"
                )
        X = X[self._data_info["columns"]]

        predictions = self._best_model.predict(X)

        if self._ml_task == BINARY_CLASSIFICATION:
            # need to predict the label based on predictions and threshold
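            # prediction columns are named "prediction_<label>"; the original
            # label is recovered by stripping the 11-character "prediction_" prefix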
            neg_label, pos_label = (
                predictions.columns[0][11:],
                predictions.columns[1][11:],
            )

            if neg_label == "0" and pos_label == "1":
                neg_label, pos_label = 0, 1
            target_is_numeric = self._data_info.get("target_is_numeric", False)
            if target_is_numeric:
                neg_label = int(neg_label)
                pos_label = int(pos_label)
            # assume that it is binary classification
            predictions["label"] = predictions.iloc[:, 1] > self._best_model._threshold
            predictions["label"] = predictions["label"].map(
                {True: pos_label, False: neg_label}
            )
            return predictions
        elif self._ml_task == MULTICLASS_CLASSIFICATION:
            target_is_numeric = self._data_info.get("target_is_numeric", False)
            if target_is_numeric:
                predictions["label"] = predictions["label"].astype(int)
            return predictions
        else:
            return predictions

    def to_json(self):
        if self._best_model is None:
            return None

        return {
            "best_model": self._best_model.to_json(),
            "threshold": self._threshold,
            "ml_task": self._ml_task,
        }

    def from_json(self, json_data):

        if json_data["best_model"]["algorithm_short_name"] == "Ensemble":
            self._best_model = Ensemble()
            self._best_model.from_json(json_data["best_model"])
        else:
            self._best_model = ModelFramework(json_data["best_model"].get("params"))
            self._best_model.from_json(json_data["best_model"])
        self._threshold = json_data.get("threshold")

        self._ml_task = json_data.get("ml_task")
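
# A minimal save/restore sketch for the JSON round trip above
# (hypothetical object names; assumes `automl` is an already fitted instance):
#
#   desc = automl.to_json()     # returns None until a best model is set
#   fresh = AutoML()            # hypothetical: a fresh instance of the same class
#   fresh.from_json(desc)       # rebuilds an Ensemble or a ModelFramework
#   fresh.predict(X_test)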
Example no. 5
class AutoML:
    def __init__(
        self,
        results_path=None,
        total_time_limit=60 * 60,
        algorithms=["Random Forest", "Xgboost"],  # , "Random Forest"],
        start_random_models=10,
        hill_climbing_steps=3,
        top_models_to_improve=5,
        train_ensemble=True,
        verbose=True,
        optimize_metric=None,
        ml_task=None,
        seed=1,
    ):
        logger.debug("AutoML.__init__")

        self._total_time_limit = total_time_limit
        # time limit in seconds for a single learner;
        # placeholder, recomputed in _estimate_training_times()
        self._time_limit = 1

        self._train_ensemble = train_ensemble
        self._models = []  # instances of iterative learner framework or ensemble

        # it is instance of model framework or ensemble
        self._best_model = None
        # default validation
        self._validation = {"validation_type": "kfold", "k_folds": 5, "shuffle": True}
        self._start_random_models = start_random_models
        self._hill_climbing_steps = hill_climbing_steps
        self._top_models_to_improve = top_models_to_improve
        self._algorithms = algorithms
        self._verbose = verbose

        self._fit_time = None
        self._models_train_time = {}
        self._threshold, self._metrics_details, self._max_metrics, self._confusion_matrix = (
            None,
            None,
            None,
            None,
        )
        self._seed = seed
        self._user_set_optimize_metric = optimize_metric
        self._ml_task = ml_task
        self._tuner_params = {
            "start_random_models": self._start_random_models,
            "hill_climbing_steps": self._hill_climbing_steps,
            "top_models_to_improve": self._top_models_to_improve,
        }

        self._X_train_path, self._y_train_path = None, None
        self._X_validation_path, self._y_validation_path = None, None

        self._data_info = None
        self._model_paths = []

        self._results_path = results_path
        self._set_results_dir()

    def _set_results_dir(self):
        if self._results_path is None:
            found = False
            for i in range(1, 101):
                self._results_path = f"AutoML_{i}"
                if not os.path.exists(self._results_path):
                    found = True
                    break
            if not found:
                raise AutoMLException("Cannot create directory for AutoML results")

        if os.path.exists(self._results_path):
            print(f"Directory {self._results_path} already exists")
            self.load()
        elif self._results_path is not None:
            print(f"Create directory {self._results_path}")
            try:
                os.mkdir(self._results_path)
            except Exception as e:
                raise AutoMLException(
                    f"Cannot create directory {self._results_path}"
                ) from e

    def load(self):
        logger.info("Loading AutoML models ...")

        with open(os.path.join(self._results_path, "params.json")) as fin:
            params = json.load(fin)

        self._model_paths = params["saved"]
        self._ml_task = params["ml_task"]
        self._optimize_metric = params["optimize_metric"]

        models_map = {}
        for model_path in self._model_paths:
            if model_path.endswith("ensemble"):
                ens = Ensemble.load(model_path, models_map)
                self._models += [ens]
                models_map[ens.get_name()] = ens
            else:
                m = ModelFramework.load(model_path)
                self._models += [m]
                models_map[m.get_name()] = m

        best_model_name = None
        with open(os.path.join(self._results_path, "best_model.txt"), "r") as fin:
            best_model_name = fin.read()

        self._best_model = models_map[best_model_name]

        data_info_path = os.path.join(self._results_path, "data_info.json")
        with open(data_info_path) as fin:
            self._data_info = json.load(fin)
        print("data info", self._data_info)

    def _estimate_training_times(self):
        # single models including models in the folds
        self._estimated_models_to_check = (
            len(self._algorithms) * self._start_random_models
            + self._top_models_to_improve * self._hill_climbing_steps * 2
        )

        if self._total_time_limit is not None:
            # set the time limit for training a single model;
            # 0.85 is a safety factor so the total time limit is not exceeded
            k = self._validation.get("k_folds", 1.0)
            self._time_limit = (
                self._total_time_limit * 0.85 / self._estimated_models_to_check / k
            )
        print(
            f"AutoML will try to check about {int(self._estimated_models_to_check)} models"
        )
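        # Worked example (hypothetical settings): with 2 algorithms,
        # start_random_models=10, top_models_to_improve=5, hill_climbing_steps=3,
        # the estimate is 2 * 10 + 5 * 3 * 2 = 50 models; with
        # total_time_limit=3600 and k_folds=5, each model gets about
        # 3600 * 0.85 / 50 / 5 = 12.24 seconds per fold.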

    def get_leaderboard(self):
        ldb = {
            "name": [],
            "model_type": [],
            "metric_type": [],
            "metric_value": [],
            "train_time": [],
        }
        for m in self._models:
            ldb["name"] += [m.get_name()]
            ldb["model_type"] += [m.get_type()]
            ldb["metric_type"] += [self._optimize_metric]
            ldb["metric_value"] += [m.get_final_loss()]
            ldb["train_time"] += [np.round(m.get_train_time(), 2)]
        return pd.DataFrame(ldb)

    def get_additional_metrics(self):
        # 'target' - the target after preprocessing, used for model training
        # 'prediction' - out-of-folds predictions of the model
        additional_metrics = self._best_model.get_additional_metrics()
        if self._ml_task == BINARY_CLASSIFICATION:

            self._metrics_details = additional_metrics["metric_details"]
            self._max_metrics = additional_metrics["max_metrics"]
            self._confusion_matrix = additional_metrics["confusion_matrix"]
            self._threshold = additional_metrics["threshold"]
            logger.info(
                "Metric details:\n{}\n\nConfusion matrix:\n{}".format(
                    self._max_metrics.transpose(), self._confusion_matrix
                )
            )
            with open(
                os.path.join(self._results_path, "best_model_metrics.txt"), "w"
            ) as fout:
                fout.write(
                    "Metric details:\n{}\n\nConfusion matrix:\n{}".format(
                        self._max_metrics.transpose(), self._confusion_matrix
                    )
                )

        elif self._ml_task == MULTICLASS_CLASSIFICATION:

            max_metrics = additional_metrics["max_metrics"]
            confusion_matrix = additional_metrics["confusion_matrix"]

            logger.info(
                "Metric details:\n{}\nConfusion matrix:\n{}".format(
                    max_metrics, confusion_matrix
                )
            )
            with open(
                os.path.join(self._results_path, "best_model_metrics.txt"), "w"
            ) as fout:
                fout.write("Metric details:\n{}\n\n".format(max_metrics.transpose()))
                fout.write("Confusion matrix:\n{}".format(confusion_matrix))

    def keep_model(self, model):
        if model is None:
            return
        self._models += [model]
        self.verbose_print(
            "{} final {}: {}, train time: {} seconds".format(
                model.get_type(),
                self._optimize_metric,
                model.get_final_loss(),
                np.round(model.get_train_time(), 2),
            )
        )
        self.log_train_time(model.get_type(), model.get_train_time())

    def train_model(self, params):
        model_path = os.path.join(self._results_path, f"model_{len(self._models)+1}")

        early_stop = EarlyStopping(
            {"metric": {"name": self._optimize_metric}, "log_to_dir": model_path}
        )
        time_constraint = TimeConstraint({"train_seconds_time_limit": self._time_limit})
        mf = ModelFramework(params, callbacks=[early_stop, time_constraint])

        if self._enough_time_to_train(mf.get_type()):
            logger.info(f"Train model #{len(self._models)+1}")

            try:
                os.mkdir(model_path)
            except Exception as e:
                raise AutoMLException(f"Cannot create directory {model_path}") from e

            mf.train()

            mf.save(model_path)
            self._model_paths += [model_path]

            self.keep_model(mf)

        else:
            logger.info(
                f"Cannot check more models of {mf.get_type()} because of time constraint"
            )
        # self._progress_bar.update(1)

    def verbose_print(self, msg):
        if self._verbose:
            # self._progress_bar.write(msg)
            print(msg)

    def log_train_time(self, model_type, train_time):
        if model_type in self._models_train_time:
            self._models_train_time[model_type] += [train_time]
        else:
            self._models_train_time[model_type] = [train_time]

    def _enough_time_to_train(self, model_type):
        # no time limit set, just train
        if self._total_time_limit is None:
            return True

        total_time_already_spent = (
            0
            if model_type not in self._models_train_time
            else np.sum(self._models_train_time[model_type])
        )
        mean_time_already_spent = (
            0
            if model_type not in self._models_train_time
            else np.mean(self._models_train_time[model_type])
        )

        return (
            total_time_already_spent + mean_time_already_spent
            < 0.85 * self._total_time_limit / float(len(self._algorithms))
        )
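        # Worked example (hypothetical settings): with total_time_limit=3600
        # and 2 algorithms, a model type keeps training while the time already
        # spent on it plus its mean train time stays below 0.85 * 3600 / 2 = 1530 s.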

    def ensemble_step(self):
        if self._train_ensemble:
            self.ensemble = Ensemble(self._optimize_metric, self._ml_task)
            oofs, target = self.ensemble.get_oof_matrix(self._models)
            self.ensemble.fit(oofs, target)
            self.keep_model(self.ensemble)

            ensemble_path = os.path.join(self._results_path, "ensemble")
            try:
                os.mkdir(ensemble_path)
            except Exception as e:
                raise AutoMLException(f"Cannot create directory {ensemble_path}") from e
            self.ensemble.save(ensemble_path)
            self._model_paths += [ensemble_path]

    def _set_ml_task(self, y):
        """ Set and validate the ML task.
        
        If ML task is not set, it trys to guess ML task based on count of unique values in the target. 
        Then it performs validation.
        """
        # if not set, guess
        if self._ml_task is None:
            target_unique_cnt = len(np.unique(y[~pd.isnull(y)]))
            if target_unique_cnt == 2:
                self._ml_task = BINARY_CLASSIFICATION
            elif target_unique_cnt <= 20:
                self._ml_task = MULTICLASS_CLASSIFICATION
            else:
                self._ml_task = REGRESSION
        # validation
        if self._ml_task not in AlgorithmsRegistry.get_supported_ml_tasks():
            raise AutoMLException(
                "Unknown Machine Learning task {}."
                " Supported tasks are: {}".format(
                    self._ml_task, AlgorithmsRegistry.get_supported_ml_tasks()
                )
            )
        logger.info("AutoML task to be solved: {}".format(self._ml_task))
        print(f"AutoML task to be solved: { self._ml_task}")

    def _set_algorithms(self):
        """ Set and validate available algorithms.

        If algorithms are not set, all algorithms from the registry are used.
        Then the algorithms are validated.
        """
        if len(self._algorithms) == 0:
            self._algorithms = list(AlgorithmsRegistry.registry[self._ml_task].keys())

        for a in self._algorithms:
            if a not in list(AlgorithmsRegistry.registry[self._ml_task].keys()):
                raise AutoMLException(
                    "The algorithm {} is not allowed to use for ML task: {}. Allowed algorithms: {}".format(
                        a, self._ml_task, list(AlgorithmsRegistry.registry[self._ml_task].keys())
                    )
                )
        logger.info("AutoML will use algorithms: {}".format(self._algorithms))
        print(f"AutoML will use algorithms: {self._algorithms}")

    def _set_metric(self):
        """ Set and validate the metric to be optimized. """
        if self._ml_task == BINARY_CLASSIFICATION:
            if self._user_set_optimize_metric is None:
                self._optimize_metric = "logloss"
            elif self._user_set_optimize_metric not in ["logloss", "auc"]:
                raise AutoMLException(
                    "Metric {} is not allowed in ML task: {}".format(
                        self._user_set_optimize_metric, self._ml_task
                    )
                )
            else:
                self._optimize_metric = self._user_set_optimize_metric
        elif self._ml_task == MULTICLASS_CLASSIFICATION:
            if self._user_set_optimize_metric is None:
                self._optimize_metric = "logloss"
            elif self._user_set_optimize_metric not in ["logloss"]:
                raise AutoMLException(
                    "Metric {} is not allowed in ML task: {}".format(
                        self._user_set_optimize_metric, self._ml_task
                    )
                )
            else:
                self._optimize_metric = self._user_set_optimize_metric
        elif self._ml_task == REGRESSION:
            if self._user_set_optimize_metric is None:
                self._optimize_metric = "mse"
            elif self._user_set_optimize_metric not in ["mse"]:
                raise AutoMLException(
                    "Metric {} is not allowed in ML task: {}".format(
                        self._user_set_optimize_metric, self._ml_task
                    )
                )
            else:
                self._optimize_metric = self._user_set_optimize_metric
        logger.info(
            "AutoML will optimize for metric: {0}".format(self._optimize_metric)
        )
        print(f"AutoML will optimize for metric: {self._optimize_metric}")

    def _check_imbalanced(self, y):
        v = y.value_counts()
        # at least 10 samples of each class
        ii = v < 10
        if np.sum(ii):
            raise AutoMLException(
                f"There need to be at least 10 samples of each class, for class {list(v[ii].index)} there is {v[ii].values} samples"
            )
        # at least 1% of all samples for each class
        v = y.value_counts(normalize=True) * 100.0
        ii = v < 1.0
        if np.sum(ii):
            raise AutoMLException(
                f"There need to be at least 1% of samples of each class, for class {list(v[ii].index)} there is {v[ii].values} % of samples"
            )

    def _initial_prep(self, X_train, y_train, X_validation=None, y_validation=None):

        if not isinstance(X_train, pd.DataFrame):
            X_train = pd.DataFrame(X_train)

        if not isinstance(X_train.columns[0], str):
            X_train.columns = [str(c) for c in X_train.columns]

        X_train.reset_index(drop=True, inplace=True)

        if not isinstance(y_train, pd.DataFrame):
            y_train = pd.DataFrame({"target": np.array(y_train)})
        else:
            if "target" not in y_train.columns:
                raise AutoMLException("There should be target column in y_train")
        y_train.reset_index(drop=True, inplace=True)

        return X_train, y_train["target"], X_validation, y_validation

    def _save_data(self, X_train, y_train, X_validation=None, y_validation=None):

        self._X_train_path = os.path.join(self._results_path, "X_train.parquet")
        self._y_train_path = os.path.join(self._results_path, "y_train.parquet")

        X_train.to_parquet(self._X_train_path, index=False)

        pd.DataFrame({"target": y_train}).to_parquet(self._y_train_path, index=False)

        self._validation["X_train_path"] = self._X_train_path
        self._validation["y_train_path"] = self._y_train_path
        self._validation["results_path"] = self._results_path

        self._data_info = {
            "columns": X_train.columns.tolist(),
            "rows": X_train.shape[0],
            "cols": X_train.shape[1],
        }
        data_info_path = os.path.join(self._results_path, "data_info.json")
        with open(data_info_path, "w") as fout:
            fout.write(json.dumps(self._data_info, indent=4))

    def _del_data_variables(self, X_train, y_train):
        
        X_train.drop(X_train.columns, axis=1, inplace=True)
        
    def _load_data_variables(self, X_train):
        X = pd.read_parquet(self._X_train_path)

        for c in X.columns:
            X_train.insert(loc=X_train.shape[1], column=c, value=X[c])
        
        os.remove(self._X_train_path)
        os.remove(self._y_train_path)

    def fit(self, X_train, y_train, X_validation=None, y_validation=None):

        if self._best_model is not None:
            print("Best model is already set, no need to run fit. Skipping ...")
            return

        start_time = time.time()
        if not isinstance(X_train, pd.DataFrame):
            raise AutoMLException(
                "AutoML needs X_train matrix to be a Pandas DataFrame"
            )

        if X_train is not None:
            X_train = X_train.copy(deep=False)

        X_train, y_train, X_validation, y_validation = self._initial_prep(
            X_train, y_train, X_validation, y_validation
        )
        self._save_data(X_train, y_train, X_validation, y_validation)

        self._set_ml_task(y_train)
        self._set_algorithms()
        self._set_metric()
        self._estimate_training_times()

        if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]:
            self._check_imbalanced(y_train)

        tuner = MljarTuner(
            self._tuner_params,
            self._algorithms,
            self._ml_task,
            self._validation,
            self._seed,
        )

        # not so random step
        generated_params = tuner.get_not_so_random_params(X_train, y_train)
        self._del_data_variables(X_train, y_train)

        for params in generated_params:
            self.train_model(params)
        # hill climbing
        for params in tuner.get_hill_climbing_params(self._models):
            self.train_model(params)

        self.ensemble_step()

        min_loss = float("inf")
        for m in self._models:
            if m.get_final_loss() < min_loss:
                self._best_model = m
                min_loss = m.get_final_loss()

        self.get_additional_metrics()
        self._fit_time = time.time() - start_time
        # self._progress_bar.close()

        with open(os.path.join(self._results_path, "best_model.txt"), "w") as fout:
            fout.write(f"{self._best_model.get_name()}")

        with open(os.path.join(self._results_path, "params.json"), "w") as fout:
            params = {
                "ml_task": self._ml_task,
                "optimize_metric": self._optimize_metric,
                "saved": self._model_paths,
            }
            fout.write(json.dumps(params, indent=4))

        ldb = self.get_leaderboard()
        ldb.to_csv(os.path.join(self._results_path, "leaderboard.csv"), index=False)

        # save report
        ldb["Link"] = [f"[Results link]({m}/README.md)" for m in ldb["name"].values]
        ldb.insert(loc=0, column="Best model", value="")
        ldb["Best model"][ldb.name == self._best_model.get_name()] = "*** the best ***"
        with open(os.path.join(self._results_path, "README.md"), "w") as fout:
            fout.write(f"# AutoML Leaderboard\n\n")
            fout.write(tabulate(ldb.values, ldb.columns, tablefmt="pipe"))


        self._load_data_variables(X_train)

    def predict(self, X):
        if self._best_model is None:
            return None

        if not isinstance(X.columns[0], str):
            X.columns = [str(c) for c in X.columns]

        input_columns = X.columns.tolist()
        for column in self._data_info["columns"]:
            if column not in input_columns:
                raise AutoMLException(
                    f"Missing column: {column} in input data. Cannot predict"
                )
        X = X[self._data_info["columns"]]

        predictions = self._best_model.predict(X)

        if self._ml_task == BINARY_CLASSIFICATION:
            # need to predict the label based on predictions and threshold
            neg_label, pos_label = (
                predictions.columns[0][11:],
                predictions.columns[1][11:],
            )
            if neg_label == "0" and pos_label == "1":
                neg_label, pos_label = 0, 1
            # assume that it is binary classification
            predictions["label"] = predictions.iloc[:, 1] > self._best_model._threshold
            predictions["label"] = predictions["label"].map(
                {True: pos_label, False: neg_label}
            )
            return predictions
        elif self._ml_task == MULTICLASS_CLASSIFICATION:

            return predictions
        else:
            return predictions

    def to_json(self):
        if self._best_model is None:
            return None

        return {
            "best_model": self._best_model.to_json(),
            "threshold": self._threshold,
            "ml_task": self._ml_task,
        }

    def from_json(self, json_data):
        # pretty sure that this can be easily refactored
        if json_data["best_model"]["algorithm_short_name"] == "Ensemble":
            self._best_model = Ensemble()
            self._best_model.from_json(json_data["best_model"])
        else:
            self._best_model = ModelFramework(json_data["best_model"].get("params"))
            self._best_model.from_json(json_data["best_model"])
        self._threshold = json_data.get("threshold")

        self._ml_task = json_data.get("ml_task")
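
# A minimal end-to-end sketch for the AutoML class above (hypothetical data;
# assumes scikit-learn is installed and the module's other imports resolve):
if __name__ == "__main__":
    from sklearn.datasets import make_classification

    # build a small, balanced binary classification dataset
    X, y = make_classification(n_samples=200, n_features=5, random_state=1)
    X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(5)])

    automl = AutoML(total_time_limit=60, algorithms=["Xgboost"], seed=1)
    automl.fit(X, y)
    print(automl.predict(X).head())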