Пример #1
0
    def test_extensive_eda(self):
        """
        Test for the extensive_eda feature.

        Runs the EDA on a regression and a classification dataset and checks
        that the expected artifacts (one target plot per feature, the
        correlation heatmap, and the markdown report) are written to the
        results directory.
        """

        def _run_and_check(X, y):
            # Wrap the raw arrays into the pandas structures EDA expects.
            X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
            y = pd.Series(y, name="class")

            results_path = self.automl_dir
            EDA.extensive_eda(X, y, results_path)
            result_files = os.listdir(results_path)

            # One plot of each feature against the target.
            for col in X.columns:
                self.assertTrue(f"{col}_target.png" in result_files)
            self.assertTrue("heatmap.png" in result_files)
            self.assertTrue("Extensive_EDA.md" in result_files)

        # Regression target.
        _run_and_check(*datasets.make_regression(n_samples=100, n_features=5))
        # Classification target (same checks, different task type).
        _run_and_check(
            *datasets.make_classification(n_samples=100, n_features=5))

        self.tearDown()
Пример #2
0
 def test_naughty_column_name_to_filename(self):
     """ Test with naughty strings.
         String from https://github.com/minimaxir/big-list-of-naughty-strings

         Each column name must be mapped by EDA.plot_path to a filename
         that can actually be created on disk. """
     os.mkdir(self.automl_dir)
     naughty_columns = [
         "feature_1",
         "*",
         "😍",
         # raw string: "\_" is an invalid escape sequence in a plain literal
         r"¯\_(ツ)_/¯",
         "表",
         "𠜎𠜱𠝹𠱓",
         "عاملة بولندا",
         # NOTE(review): the next pairs were previously merged by implicit
         # string concatenation (missing commas); each is its own test case.
         "Ṱ̺̺̕o͞ ̷",
         "🇸🇦🇫🇦🇲",
         "⁰⁴⁵",
         "∆˚¬…æ",
         "!@#$%^&*()`~",
         "onfocus=JaVaSCript:alert(123) autofocus",
         "`\"'><img src=xxx:x \x20onerror=javascript:alert(1)>",
         'System("ls -al /")',
         'Kernel.exec("ls -al /")',
         "لُلُصّبُلُل",
         "{% print 'x' * 64 * 1024**3 %}",
         '{{ "".__class__.__mro__[2].__subclasses__()[40]("/etc/passwd").read() }}',
         "ÜBER Über German Umlaut",
         "影師嗎",
         "C'est déjà l'été.",
         "Nín hǎo. Wǒ shì zhōng guó rén",
         "Компьютер",
         "jaja---lol-méméméoo--a",
     ]
     for col in naughty_columns:
         fname = EDA.plot_path(self.automl_dir, col)
         # Writing proves the sanitized path is a valid, creatable filename.
         with open(fname, "w") as fout:
             fout.write("ok")
Пример #3
0
    def test_symbol_feature(self):
        """
        Test for columns with forbidden filenames
        """

        features, target = datasets.make_regression(
            n_samples=100, n_features=5)

        frame = pd.DataFrame(
            features, columns=[f"f_{i}" for i in range(features.shape[1])])
        # Inject characters that are illegal in filenames on most platforms.
        frame.rename({"f_0": "ff*", "f_1": "fg/"}, axis=1, inplace=True)
        series = pd.Series(target, name="class")

        output_dir = self.automl_dir
        EDA.extensive_eda(frame, series, output_dir)
        produced = os.listdir(output_dir)

        # Filenames must be sanitized via EDA.plot_fname before comparison.
        for column in frame.columns:
            self.assertTrue(EDA.plot_fname(f"{column}_target") in produced)
        self.assertTrue("heatmap.png" in produced)
        self.assertTrue("Extensive_EDA.md" in produced)

        self.tearDown()
Пример #4
0
    def test_extensive_eda_missing(self):
        """
        Test for dataframe with missing values.

        Builds a regression dataset, injects NaN values into one feature,
        and checks that extensive_eda still produces all report files.
        """

        def _run_and_check():
            X, y = datasets.make_regression(n_samples=100, n_features=5)

            X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
            y = pd.Series(y, name="class")

            # Add some NaN values at random rows (duplicates are fine).
            X.loc[np.random.randint(0, 100, 20), "f_0"] = np.nan

            results_path = self.automl_dir
            EDA.extensive_eda(X, y, results_path)
            result_files = os.listdir(results_path)

            for col in X.columns:
                self.assertTrue(f"{col}_target.png" in result_files)
            self.assertTrue("heatmap.png" in result_files)
            self.assertTrue("Extensive_EDA.md" in result_files)

        # The original test ran the identical scenario twice; keep both runs
        # so overwriting previously generated report files is also exercised.
        for _ in range(2):
            _run_and_check()

        self.tearDown()
Пример #5
0
    def _fit(self, X, y):
        """Fit the AutoML model with training data.

        Resolves all configuration attributes from the constructor
        parameters, optionally runs automatic EDA, then iterates over the
        tuner steps, training models until every step is done or the time
        budget runs out. Progress is persisted after every model so an
        interrupted fit can be resumed.

        :param X: training features (converted via ``_build_dataframe``).
        :param y: training target (converted via ``_build_dataframe``).
        :return: ``self`` (or ``None`` when the model was already fitted).
        """
        # A finished model must not be re-fitted into the same results_path.
        if self._fit_level == "finished":
            print(
                "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new a 'fit()'."
            )
            return
        # Validate input and build dataframes
        X, y = self._build_dataframe(X, y)

        self.n_features_in_ = X.shape[1]
        # Count of distinct non-null target values (classes for classification).
        self.n_classes = len(np.unique(y[~pd.isnull(y)]))

        # Get attributes (__init__ params)
        self._mode = self._get_mode()
        self._ml_task = self._get_ml_task()
        self._results_path = self._get_results_path()
        self._total_time_limit = self._get_total_time_limit()
        self._model_time_limit = self._get_model_time_limit()
        self._algorithms = self._get_algorithms()
        self._train_ensemble = self._get_train_ensemble()
        self._stack_models = self._get_stack_models()
        self._eval_metric = self._get_eval_metric()
        self._validation_strategy = self._get_validation_strategy()
        self._verbose = self._get_verbose()
        self._explain_level = self._get_explain_level()
        self._golden_features = self._get_golden_features()
        self._feature_selection = self._get_feature_selection()
        self._start_random_models = self._get_start_random_models()
        self._hill_climbing_steps = self._get_hill_climbing_steps()
        self._top_models_to_improve = self._get_top_models_to_improve()
        self._random_state = self._get_random_state()

        try:

            # Resume from a previous, possibly interrupted, fit if possible.
            self.load_progress()
            if self._fit_level == "finished":
                print(
                    "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'."
                )
                return
            self._check_can_load()

            self.verbose_print(f"AutoML directory: {self._results_path}")
            self.verbose_print(
                f"The task is {self._ml_task} with evaluation metric {self._eval_metric}"
            )
            self.verbose_print(
                f"AutoML will use algorithms: {self._algorithms}")
            if self._stack_models:
                self.verbose_print("AutoML will stack models")
            if self._train_ensemble:
                self.verbose_print("AutoML will ensemble availabe models")

            self._start_time = time.time()
            # Credit time already spent in a resumed run against the budget.
            if self._time_ctrl is not None:
                self._start_time -= self._time_ctrl.already_spend()

            # Automatic Exploratory Data Analysis
            if self._explain_level == 2:
                EDA.compute(X, y, os.path.join(self._results_path, "EDA"))

            # Save data (shallow copy avoids mutating the caller's frame).
            self._save_data(X.copy(deep=False), y)

            tuner = MljarTuner(
                self._get_tuner_params(
                    self._start_random_models,
                    self._hill_climbing_steps,
                    self._top_models_to_improve,
                ),
                self._algorithms,
                self._ml_task,
                self._validation_strategy,
                self._explain_level,
                self._data_info,
                self._golden_features,
                self._feature_selection,
                self._train_ensemble,
                self._stack_models,
                self._random_state,
            )
            self.tuner = tuner

            steps = tuner.steps()
            self.verbose_print(f"AutoML steps: {steps}")
            if self._time_ctrl is None:
                self._time_ctrl = TimeController(
                    self._start_time,
                    self._total_time_limit,
                    self._model_time_limit,
                    steps,
                    self._algorithms,
                )

            self._time_ctrl.log_time(
                "prepare_data",
                "prepare_data",
                "prepare_data",
                time.time() - self._start_time,
            )

            for step in steps:
                self._fit_level = step
                start = time.time()
                # self._time_start[step] = start

                if step == "stack":
                    self.prepare_for_stacking()

                # Reuse params recorded by a previous run for this step,
                # otherwise ask the tuner to generate fresh ones.
                generated_params = []
                if step in self._all_params:
                    generated_params = self._all_params[step]
                else:
                    generated_params = tuner.generate_params(
                        step, self._models, self._results_path,
                        self._stacked_models)

                if generated_params is None or not generated_params:
                    self.verbose_print(
                        f"Skip {step} because no parameters were generated.")
                    continue
                if generated_params:
                    # Skip the whole step when there is not enough time left
                    # for its first model type.
                    if "learner" in generated_params[
                            0] and not self._time_ctrl.enough_time(
                                generated_params[0]["learner"]["model_type"],
                                self._fit_level):
                        self.verbose_print(
                            f"Skip {step} because of the time limit.")
                    else:
                        model_str = "models" if len(
                            generated_params) > 1 else "model"
                        self.verbose_print(
                            f"* Step {step} will try to check up to {len(generated_params)} {model_str}"
                        )

                for params in generated_params:
                    # Already handled in a previous (resumed) run.
                    if params.get("status",
                                  "") in ["trained", "skipped", "error"]:
                        self.verbose_print(
                            f"{params['name']}: {params['status']}.")
                        continue

                    try:
                        trained = False
                        if "ensemble" in step:
                            trained = self.ensemble_step(
                                is_stacked=params["is_stacked"])
                        else:
                            trained = self.train_model(params)
                        params["status"] = "trained" if trained else "skipped"
                        params["final_loss"] = self._models[-1].get_final_loss(
                        )
                        params["train_time"] = self._models[-1].get_train_time(
                        )
                    except Exception as e:
                        # A single failed model must not abort the whole fit;
                        # record the error and carry on with the next one.
                        self._update_errors_report(params.get("name"), str(e))
                        params["status"] = "error"

                    # Persist progress after each model so a crash is resumable.
                    self.save_progress(step, generated_params)

            self._fit_level = "finished"
            self.save_progress()

            self.verbose_print(
                f"AutoML fit time: {np.round(time.time() - self._start_time,2)} seconds"
            )

        except Exception as e:
            raise e
        finally:
            # Restore in-memory data variables even when the fit failed.
            if self._X_path is not None:
                self._load_data_variables(X)

        return self
Пример #6
0
    def fit(self, X_train, y_train, X_validation=None, y_validation=None):
        """
        Fit AutoML.

        Resumes any previously saved progress, optionally runs automatic
        EDA, prepares the data, and then trains models step by step as
        generated by the tuner, saving progress after every model.

        :param X_train: Pandas DataFrame with training data.
        :param y_train: Numpy Array with target training data.

        :param X_validation: Pandas DataFrame with validation data. (Not implemented yet)
        :param y_validation: Numpy Array with target of validation data. (Not implemented yet)
        """

        try:

            # Resume from a previous, possibly interrupted, fit if possible.
            self.load_progress()

            if self._fit_level == "finished":
                print("AutoML is trained. Skipping fit step ...")
                return

            # if self._best_model is not None:
            #    print("Best model is already set, no need to run fit. Skipping ...")
            #    return

            self._start_time = time.time()
            # Credit time already spent in a resumed run against the budget.
            if self._time_ctrl is not None:
                self._start_time -= self._time_ctrl.already_spend()

            if not isinstance(X_train, pd.DataFrame):
                raise AutoMLException(
                    "AutoML needs X_train matrix to be a Pandas DataFrame"
                )

            # Automatic Exploratory Data Analysis
            if self._explain_level == 2:
                EDA.compute(X_train, y_train, os.path.join(self._results_path, "EDA"))

            self._set_ml_task(y_train)

            # Shallow copy avoids mutating the caller's DataFrame.
            if X_train is not None:
                X_train = X_train.copy(deep=False)

            X_train, y_train, X_validation, y_validation = self._initial_prep(
                X_train, y_train, X_validation, y_validation
            )
            self._save_data(X_train, y_train, X_validation, y_validation)
            self._set_algorithms()
            self._set_metric()

            if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]:
                self._check_imbalanced(y_train)

            tuner = MljarTuner(
                self._tuner_params,
                self._algorithms,
                self._ml_task,
                self._validation,
                self._explain_level,
                self._data_info,
                self._golden_features,
                self._feature_selection,
                self._train_ensemble,
                self._stack_models,
                self._seed,
            )
            self.tuner = tuner

            steps = tuner.steps()

            if self._time_ctrl is None:
                self._time_ctrl = TimeController(
                    self._start_time,
                    self._total_time_limit,
                    self._model_time_limit,
                    steps,
                    self._algorithms,
                )

            self._time_ctrl.log_time(
                "prepare_data", "prepare_data", "prepare_data", time.time() - self._start_time
            )

            for step in steps:
                self._fit_level = step
                start = time.time()
                # self._time_start[step] = start

                if step == "stack":
                    self.prepare_for_stacking()

                # Reuse params recorded by a previous run for this step,
                # otherwise ask the tuner to generate fresh ones.
                generated_params = []
                if step in self._all_params:
                    generated_params = self._all_params[step]
                else:
                    generated_params = tuner.generate_params(
                        step, self._models, self._results_path, self._stacked_models
                    )

                if generated_params is None:
                    continue
                if generated_params:
                    print("-" * 72)
                    print(f"{step} with {len(generated_params)} models to train ...")

                for params in generated_params:
                    # Skip models already handled in a previous (resumed) run.
                    if params.get("status", "") == "trained":
                        print(f"Skipping {params['name']}, already trained.")
                        continue
                    if params.get("status", "") == "skipped":
                        print(f"Skipped {params['name']}.")
                        continue

                    trained = False
                    if "ensemble" in step:
                        trained = self.ensemble_step(is_stacked=params["is_stacked"])
                    else:
                        trained = self.train_model(params)

                    params["status"] = "trained" if trained else "skipped"
                    params["final_loss"] = self._models[-1].get_final_loss()
                    params["train_time"] = self._models[-1].get_train_time()
                    # Persist progress after each model so a crash is resumable.
                    self.save_progress(step, generated_params)

            self._fit_level = "finished"
            self.save_progress()

            print(f"AutoML fit time: {time.time() - self._start_time}")

        except Exception as e:
            raise e
        finally:
            # Restore in-memory data variables even when the fit failed.
            if self._X_train_path is not None:
                self._load_data_variables(X_train)
Пример #7
0
 def test_column_name_to_filename(self):
     """A valid feature name must pass through EDA.prepare unchanged."""
     feature_name = "feature_1"
     self.assertEqual(EDA.prepare(feature_name), feature_name)
Пример #8
0
    def _fit(self, X, y):
        """Fit the AutoML model with training data.

        Resolves configuration attributes from the constructor parameters,
        optionally runs automatic EDA, then trains models for each tuner
        step, saving progress after every model so an interrupted fit can
        be resumed.

        :param X: training features (converted via ``_build_dataframe``).
        :param y: training target (converted via ``_build_dataframe``).
        :return: ``self`` (or ``None`` when the model was already fitted).
        """
        # A finished model must not be re-fitted into the same results_path.
        if self._fit_level == "finished":
            return print(
                "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new a 'fit()'."
            )
        # Validate input and build dataframes
        X, y = self._build_dataframe(X, y)

        self.n_features = X.shape[1]
        # Count of distinct non-null target values (classes for classification).
        self.n_classes = len(np.unique(y[~pd.isnull(y)]))

        # Get attributes (__init__ params)
        self._mode = self._get_mode()
        self._ml_task = self._get_ml_task()
        self._tuning_mode = self._get_tuning_mode()
        self._results_path = self._get_results_path()
        self._total_time_limit = self._get_total_time_limit()
        self._model_time_limit = self._get_model_time_limit()
        self._algorithms = self._get_algorithms()
        self._train_ensemble = self._get_train_ensemble()
        self._stack_models = self._get_stack_models()
        self._eval_metric = self._get_eval_metric()
        self._validation_strategy = self._get_validation_strategy()
        self._verbose = self._get_verbose()
        self._explain_level = self._get_explain_level()
        self._golden_features = self._get_golden_features()
        self._feature_selection = self._get_feature_selection()
        self._start_random_models = self._get_start_random_models()
        self._hill_climbing_steps = self._get_hill_climbing_steps()
        self._top_models_to_improve = self._get_top_models_to_improve()
        self._random_state = self._get_random_state()

        try:

            # Resume from a previous, possibly interrupted, fit if possible.
            self.load_progress()
            if self._fit_level == "finished":
                print(
                    "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'."
                )
                return

            # Validate input and build dataframes
            X, y = self._build_dataframe(X, y)

            self.n_features = X.shape[1]
            self.n_classes = len(np.unique(y[~pd.isnull(y)]))

            self.verbose_print(
                f"AutoML current directory: {self._results_path}")
            self.verbose_print(f"AutoML current task: {self._ml_task}")
            self.verbose_print(
                f"AutoML will use algorithms : {self._algorithms}")
            self.verbose_print(
                f"AutoML will optimize for metric : {self._eval_metric}")

            self._start_time = time.time()
            # Credit time already spent in a resumed run against the budget.
            if self._time_ctrl is not None:
                self._start_time -= self._time_ctrl.already_spend()

            # Automatic Exploratory Data Analysis
            if self._explain_level == 2:
                EDA.compute(X, y, os.path.join(self._results_path, "EDA"))

            # Save data
            self._save_data(X, y)

            tuner = MljarTuner(
                self._get_tuner_params(
                    self._start_random_models,
                    self._hill_climbing_steps,
                    self._top_models_to_improve,
                ),
                self._algorithms,
                self._ml_task,
                self._validation_strategy,
                self._explain_level,
                self._data_info,
                self._golden_features,
                self._feature_selection,
                self._train_ensemble,
                self._stack_models,
                self._random_state,
            )
            self.tuner = tuner

            steps = tuner.steps()
            self.verbose_print(f"AutoML steps: {steps}")
            if self._time_ctrl is None:
                self._time_ctrl = TimeController(
                    self._start_time,
                    self._total_time_limit,
                    self._model_time_limit,
                    steps,
                    self._algorithms,
                )

            self._time_ctrl.log_time(
                "prepare_data",
                "prepare_data",
                "prepare_data",
                time.time() - self._start_time,
            )

            for step in steps:
                self._fit_level = step
                start = time.time()
                # self._time_start[step] = start

                if step == "stack":
                    self.prepare_for_stacking()

                # Reuse params recorded by a previous run for this step,
                # otherwise ask the tuner to generate fresh ones.
                generated_params = []
                if step in self._all_params:
                    generated_params = self._all_params[step]
                else:
                    generated_params = tuner.generate_params(
                        step, self._models, self._results_path,
                        self._stacked_models)

                if generated_params is None:
                    continue
                if generated_params:
                    self.verbose_print("-" * 72)
                    self.verbose_print(
                        f"{step} with {len(generated_params)} models to train ..."
                    )

                for params in generated_params:
                    # Skip models already handled in a previous (resumed) run.
                    if params.get("status", "") == "trained":
                        self.verbose_print(
                            f"Skipping {params['name']}, already trained.")
                        continue
                    if params.get("status", "") == "skipped":
                        self.verbose_print(f"Skipped {params['name']}.")
                        continue

                    trained = False
                    if "ensemble" in step:
                        trained = self.ensemble_step(
                            is_stacked=params["is_stacked"])
                    else:
                        trained = self.train_model(params)

                    params["status"] = "trained" if trained else "skipped"
                    params["final_loss"] = self._models[-1].get_final_loss()
                    params["train_time"] = self._models[-1].get_train_time()
                    # Persist progress after each model so a crash is resumable.
                    self.save_progress(step, generated_params)

            self._fit_level = "finished"
            self.save_progress()

            self.verbose_print(
                f"AutoML fit time: {time.time() - self._start_time}")

        except Exception as e:
            raise e
        finally:
            # Restore in-memory data variables even when the fit failed.
            if self._X_path is not None:
                self._load_data_variables(X)

        return self