예제 #1
0
    def fit(self, X_train, y_train, X_validation=None, y_validation=None):
        """
        Fit AutoML
        
        :param X_train: Pandas DataFrame with training data.
        :param y_train: Numpy Array with target training data.
        
        :param X_validation: Pandas DataFrame with validation data. (Not implemented yet)
        :param y_validation: Numpy Array with target of validation data. (Not implemented yet)
        
        """
        try:
            if self._best_model is not None:
                print("Best model is already set, no need to run fit. Skipping ...")
                return

            start_time = time.time()
            if not isinstance(X_train, pd.DataFrame):
                raise AutoMLException(
                    "AutoML needs X_train matrix to be a Pandas DataFrame"
                )

            if X_train is not None:
                X_train = X_train.copy(deep=False)

            X_train, y_train, X_validation, y_validation = self._initial_prep(
                X_train, y_train, X_validation, y_validation
            )
            self._save_data(X_train, y_train, X_validation, y_validation)

            self._set_ml_task(y_train)
            self._set_algorithms()
            self._set_metric()
            self._estimate_training_times()

            if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]:
                self._check_imbalanced(y_train)

            tuner = MljarTuner(
                self._tuner_params,
                self._algorithms,
                self._ml_task,
                self._validation,
                self._seed,
            )

            # not so random step
            generated_params = tuner.get_not_so_random_params(X_train, y_train)
            self._del_data_variables(X_train, y_train)

            for params in generated_params:
                self.train_model(params)
            # hill climbing
            for params in tuner.get_hill_climbing_params(self._models):
                self.train_model(params)

            self.ensemble_step()

            max_loss = 10e12
            for i, m in enumerate(self._models):
                if m.get_final_loss() < max_loss:
                    self._best_model = m
                    max_loss = m.get_final_loss()

            self.get_additional_metrics()
            self._fit_time = time.time() - start_time
            # self._progress_bar.close()

            with open(os.path.join(self._results_path, "best_model.txt"), "w") as fout:
                fout.write(f"{self._best_model.get_name()}")

            with open(os.path.join(self._results_path, "params.json"), "w") as fout:
                params = {
                    "ml_task": self._ml_task,
                    "optimize_metric": self._optimize_metric,
                    "saved": self._model_paths,
                }
                fout.write(json.dumps(params, indent=4))

            ldb = self.get_leaderboard()
            ldb.to_csv(os.path.join(self._results_path, "leaderboard.csv"), index=False)

            # save report
            ldb["Link"] = [f"[Results link]({m}/README.md)" for m in ldb["name"].values]
            ldb.insert(loc=0, column="Best model", value="")
            ldb.loc[
                ldb.name == self._best_model.get_name(), "Best model"
            ] = "*** the best ***"
            with open(os.path.join(self._results_path, "README.md"), "w") as fout:
                fout.write(f"# AutoML Leaderboard\n\n")
                fout.write(tabulate(ldb.values, ldb.columns, tablefmt="pipe"))
        except Exception as e:
            raise e
        finally:
            if self._X_train_path is not None:
                self._load_data_variables(X_train)
예제 #2
0
    def fit(self, X_train, y_train, X_validation=None, y_validation=None):
        """
        Fit AutoML
        
        :param X_train: Pandas DataFrame with training data.
        :param y_train: Numpy Array with target training data.
        
        :param X_validation: Pandas DataFrame with validation data. (Not implemented yet)
        :param y_validation: Numpy Array with target of validation data. (Not implemented yet)
        
        """
        try:

            if self._best_model is not None:
                print(
                    "Best model is already set, no need to run fit. Skipping ..."
                )
                return

            self._start_time = time.time()

            if not isinstance(X_train, pd.DataFrame):
                raise AutoMLException(
                    "AutoML needs X_train matrix to be a Pandas DataFrame")

            self._set_ml_task(y_train)

            if X_train is not None:
                X_train = X_train.copy(deep=False)

            X_train, y_train, X_validation, y_validation = self._initial_prep(
                X_train, y_train, X_validation, y_validation)
            self._save_data(X_train, y_train, X_validation, y_validation)

            self._set_algorithms()
            self._set_metric()
            # self._estimate_training_times()

            if self._ml_task in [
                    BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION
            ]:
                self._check_imbalanced(y_train)

            tuner = MljarTuner(
                self._tuner_params,
                self._algorithms,
                self._ml_task,
                self._validation,
                self._explain_level,
                self._data_info,
                self._seed,
            )
            self.tuner = tuner
            self._time_spend = {}
            self._time_start = {}

            # 1. Check simple algorithms
            self._fit_level = "simple_algorithms"
            start = time.time()
            self._time_start[self._fit_level] = start
            for params in tuner.simple_algorithms_params():
                self.train_model(params)
            self._time_spend["simple_algorithms"] = np.round(
                time.time() - start, 2)

            # 2. Default parameters
            self._fit_level = "default_algorithms"
            start = time.time()
            self._time_start[self._fit_level] = start
            for params in tuner.default_params(len(self._models)):
                self.train_model(params)
            self._time_spend["default_algorithms"] = np.round(
                time.time() - start, 2)

            # 3. The not-so-random step
            self._fit_level = "not_so_random"
            start = time.time()
            self._time_start[self._fit_level] = start
            generated_params = tuner.get_not_so_random_params(len(
                self._models))
            for params in generated_params:
                self.train_model(params)
            self._time_spend["not_so_random"] = np.round(
                time.time() - start, 2)

            # 4. The hill-climbing step
            self._fit_level = "hill_climbing"
            start = time.time()
            self._time_start[self._fit_level] = start
            for params in tuner.get_hill_climbing_params(self._models):
                self.train_model(params)
            self._time_spend["hill_climbing"] = np.round(
                time.time() - start, 2)

            # 5. Ensemble unstacked models
            self._fit_level = "ensemble_unstacked"
            start = time.time()
            self._time_start[self._fit_level] = start
            self.ensemble_step()
            self._time_spend["ensemble_unstacked"] = np.round(
                time.time() - start, 2)

            if self._stack:
                # 6. Stack best models
                self._fit_level = "stack"
                start = time.time()
                self._time_start[self._fit_level] = start
                self.stacked_ensemble_step()
                self._time_spend["stack"] = np.round(time.time() - start, 2)

                # 7. Ensemble all models (original and stacked)
                any_stacked = False
                for m in self._models:
                    if m._is_stacked:
                        any_stacked = True
                        break
                if any_stacked:
                    self._fit_level = "ensemble_all"
                    start = time.time()
                    self.ensemble_step(is_stacked=True)
                    self._time_spend["ensemble_all"] = np.round(
                        time.time() - start, 2)

            self._fit_time = time.time() - self._start_time

            logger.info(f"AutoML fit time: {self._fit_time}")

        except Exception as e:
            raise e
        finally:
            if self._X_train_path is not None:
                self._load_data_variables(X_train)
예제 #3
0
    def fit(self, X_train, y_train, X_validation=None, y_validation=None):
        """
        Fit AutoML
        
        :param X_train: Pandas DataFrame with training data.
        :param y_train: Numpy Array with target training data.
        
        :param X_validation: Pandas DataFrame with validation data. (Not implemented yet)
        :param y_validation: Numpy Array with target of validation data. (Not implemented yet)
        
        """
        try:

            if self._best_model is not None:
                print(
                    "Best model is already set, no need to run fit. Skipping ..."
                )
                return

            self._start_time = time.time()
            if not isinstance(X_train, pd.DataFrame):
                raise AutoMLException(
                    "AutoML needs X_train matrix to be a Pandas DataFrame")

            if X_train is not None:
                X_train = X_train.copy(deep=False)

            X_train, y_train, X_validation, y_validation = self._initial_prep(
                X_train, y_train, X_validation, y_validation)
            self._save_data(X_train, y_train, X_validation, y_validation)

            self._set_ml_task(y_train)
            self._set_algorithms()
            self._set_metric()
            self._estimate_training_times()

            if self._ml_task in [
                    BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION
            ]:
                self._check_imbalanced(y_train)

            tuner = MljarTuner(
                self._tuner_params,
                self._algorithms,
                self._ml_task,
                self._validation,
                self._explain_level,
                self._seed,
            )

            # not so random step
            generated_params = tuner.get_not_so_random_params(X_train, y_train)
            self._del_data_variables(X_train, y_train)

            # Shuffle generated params
            # do not shuffle Baseline, Linear and Decision Trees
            dont_shuffle = []
            to_shuffle = []
            for p in generated_params:
                if p["learner"]["model_type"] in [
                        "Baseline",
                        "Linear",
                        "Decision Tree",
                ]:
                    dont_shuffle += [p]
                else:
                    to_shuffle += [p]

            np.random.shuffle(to_shuffle)
            generated_params = dont_shuffle + to_shuffle

            for params in generated_params:
                self.train_model(params)
            # hill climbing
            for params in tuner.get_hill_climbing_params(self._models):
                self.train_model(params)

            self.ensemble_step()

            self._fit_time = time.time() - self._start_time

        except Exception as e:
            raise e
        finally:
            if self._X_train_path is not None:
                self._load_data_variables(X_train)