예제 #1
0
 def test_breast_cancer_dataset(self):
     """Fit AutoML on the breast cancer data (binary classification).

     The model is fitted and scored on the same data; the test only
     requires the resulting score to be greater than 0.5.
     """
     settings = dict(
         explain_level=0,
         verbose=0,
         random_state=1,
         results_path=self.automl_dir,
     )
     model = AutoML(**settings)
     fitted = model.fit(breast_cancer.data, breast_cancer.target)
     score = fitted.score(breast_cancer.data, breast_cancer.target)
     self.assertGreater(score, 0.5)
    def test_set_model_time_limit(self):
        """Check that training stays allowed when only `model_time_limit` is set.

        Logs twelve training runs of 10 each for a single algorithm and
        expects `_enough_time_to_train` to be true after every one.
        """
        algorithm = "Xgboost"
        automl = AutoML(
            results_path=self.automl_dir,
            model_time_limit=10,
            algorithms=[algorithm],
        )

        runs_logged = 0
        while runs_logged < 12:
            automl.log_train_time(algorithm, 10)
            # Expected to hold after every logged run
            self.assertTrue(automl._enough_time_to_train(algorithm))
            runs_logged += 1
예제 #3
0
    def test_too_small_time_limit(self):
        """Expect `fit()` to raise AutoMLException with `total_time_limit=1`.

        The input is a 100000 x 100 random matrix with a random 0/1 target.
        """
        n_rows = 100000
        features = np.random.uniform(size=(n_rows, 100))
        target = np.random.randint(0, 2, size=(n_rows, ))

        automl = AutoML(
            results_path=self.automl_dir,
            total_time_limit=1,
            train_ensemble=False,
        )
        # Callable form of assertRaises: fit() itself must raise
        self.assertRaises(AutoMLException, automl.fit, features, target)
예제 #4
0
    def test_score_without_y(self):
        """Check that `score()` called without `y` raises AutoMLException.

        The exception message is expected to contain "y must be specified".
        """
        model = AutoML(explain_level=0, verbose=0, random_state=1)
        with self.assertRaises(AutoMLException) as raised:
            fitted = model.fit(breast_cancer.data, breast_cancer.target)
            # Score with features only; the target is deliberately left out
            fitted.score(breast_cancer.data)

        self.assertIn("y must be specified", str(raised.exception))
    def test_custom_init(self):
        """Check that `_update_errors_report` records the error in errors.md.

        After reporting "bad error" for "model_1", the file `errors.md` must
        exist inside the results directory and contain that message.
        """
        automl = AutoML(results_path=self.automl_dir)
        automl._update_errors_report("model_1", "bad error")

        errors_filename = os.path.join(self.automl_dir, "errors.md")
        self.assertTrue(os.path.exists(errors_filename))
        # Read through a context manager so the file handle is always closed
        with open(errors_filename) as report:
            self.assertIn("bad error", report.read())
예제 #6
0
 def __init__(self, n_folds_validation: int, shuffle_data: bool,
              max_rand: int) -> None:
     """Initialise the base class and create the AutoML estimator.

     All three arguments are forwarded unchanged to the parent
     constructor. The estimator is then configured from
     `self._random_state`, `self._n_folds_validation` and
     `self._shuffle_data`.
     NOTE(review): those attributes are presumably set by the parent
     constructor - confirm against the base class.
     """
     super().__init__(n_folds_validation, shuffle_data, max_rand)
     # AutoML in "Compete" mode with k-fold validation, stored in self.estimator
     automl_options = {
         "mode": "Compete",
         "explain_level": 0,
         "random_state": self._random_state,
         "validation_strategy": {
             "validation_type": "kfold",
             "k_folds": self._n_folds_validation,
             "shuffle": self._shuffle_data,
         },
     }
     self.estimator = AutoML(**automl_options)
예제 #7
0
 def test_new_directory(self):
     """Check that the results directory is created when it does not exist."""
     # Precondition: nothing exists at the results path yet
     self.assertFalse(os.path.exists(self.automl_dir))

     model = AutoML(results_path=self.automl_dir)
     features, target = datasets.make_classification(n_samples=30)

     # fit() is called because AutoML only validates constructor params there
     model.fit(features, target)

     # Postcondition: the directory is now present
     self.assertTrue(os.path.exists(self.automl_dir))
예제 #8
0
    def test_bin_class_01(self):
        """Check `_start_random_models` after `_estimate_training_times()`.

        With `tuning_mode="Insane"`, `total_time_limit=1` and Xgboost only,
        the test expects `_start_random_models` to equal 15.
        """
        # NOTE(review): X and y are built but never passed to AutoML here.
        feature_names = [f"f{i}" for i in range(3)]
        X = pd.DataFrame(np.random.rand(self.rows, 3), columns=feature_names)
        y = np.random.randint(0, 2, self.rows)

        settings = dict(
            results_path=self.automl_dir,
            total_time_limit=1,
            tuning_mode="Insane",
            algorithms=["Xgboost"],
        )
        automl = AutoML(**settings)
        automl._estimate_training_times()
        self.assertEqual(automl._start_random_models, 15)
    def test_set_model_time_limit_omit_total_time(self):
        """Check that training stays allowed when both time limits are given.

        Both `model_time_limit` and `total_time_limit` are set to 10. The test
        expects `_enough_time_to_train` to remain true across twelve logged
        runs of 10 each, i.e. the total limit is expected to be ignored.
        """
        algorithm = "Xgboost"
        settings = dict(
            results_path=self.automl_dir,
            model_time_limit=10,
            total_time_limit=10,  # expected to be ignored
            algorithms=[algorithm],
        )
        automl = AutoML(**settings)

        for _run in range(12):
            automl.log_train_time(algorithm, 10)
            # Expected to hold after every logged run
            self.assertTrue(automl._enough_time_to_train(algorithm))
예제 #10
0
    def test_tune_only_default(self):
        """Check that a second fit with an extra algorithm adds models.

        Steps:
          1. Fit with "Decision Tree" only and record the model count.
          2. Rewrite `progress.json` so that `fit_level` is
             "default_algorithms".
          3. Fit again from the same results directory with "Xgboost" added
             and expect more models than after the first fit.
        """
        X = np.random.rand(self.rows, 3)
        X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)])
        y = np.random.randint(0, 2, self.rows)

        automl = AutoML(
            results_path=self.automl_dir,
            total_time_limit=3,
            tuning_mode="Explain",
            algorithms=["Decision Tree"],
            explain_level=0,
            train_ensemble=False,
        )
        automl.fit(X, y)

        iter_1_models_cnt = len(automl._models)

        progress_path = os.path.join(self.automl_dir, "progress.json")
        # Read through a context manager so the file handle is closed
        # before the same file is reopened for writing.
        with open(progress_path, "r") as fin:
            progress = json.load(fin)
        progress["fit_level"] = "default_algorithms"

        with open(progress_path, "w") as fout:
            fout.write(json.dumps(progress, indent=4))

        automl = AutoML(
            results_path=self.automl_dir,
            total_time_limit=3,
            tuning_mode="Explain",
            algorithms=["Decision Tree", "Xgboost"],
            explain_level=0,
            train_ensemble=False,
        )
        automl.fit(X, y)

        self.assertGreater(len(automl._models), iter_1_models_cnt)
예제 #11
0
    def test_bin_class_AB_missing_targets(self):
        """Fit a binary task with labels "a"/"B" where some targets are missing.

        Three target values are blanked out before fitting. The prediction
        frame must contain `prediction_a`, `prediction_B` and `label`, and
        the predicted labels must be drawn from at most two distinct values,
        at least one of which is "a" or "B".
        """
        X = np.random.rand(self.rows, 3)
        X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)])
        y = pd.Series(np.random.permutation(["a", "B"] * int(self.rows / 2)),
                      name="target")

        # Blank out a few targets. `np.nan` is used for both NaN cases
        # because the `np.NaN` alias was removed in NumPy 2.0.
        y.iloc[1] = None
        y.iloc[3] = np.nan
        y.iloc[13] = np.nan

        automl = AutoML(
            results_path=self.automl_dir,
            total_time_limit=1,
            algorithms=["Xgboost"],
            train_ensemble=False,
        )
        automl.set_advanced(start_random_models=1)
        automl.fit(X, y)
        # Predict once; the original ran predict twice and discarded one result
        pred = automl.predict(X)

        columns = pred.columns.tolist()
        for col in ["prediction_a", "prediction_B", "label"]:
            self.assertIn(col, columns)
        u = np.unique(pred["label"].values)
        self.assertTrue("a" in u or "B" in u)
        self.assertLessEqual(len(u), 2)
    def test_compute_predictions_after_dir_change(self):
        """Check predictions are unchanged after the results directory moves.

        Test for https://github.com/mljar/mljar-supervised/issues/384
        """
        for parent_dir in (self.automl_dir_a, self.automl_dir_b):
            self.create_dir(parent_dir)

        original_path = os.path.join(self.automl_dir_a, self.automl_dir)
        moved_path = os.path.join(self.automl_dir_b, self.automl_dir)

        dataset_options = dict(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_targets=1,
            shuffle=False,
            random_state=0,
        )
        X, y = datasets.make_regression(**dataset_options)

        trained = AutoML(
            results_path=original_path,
            explain_level=0,
            ml_task="regression",
            total_time_limit=10,
        )
        trained.fit(X, y)
        before_move = trained.predict(X[:3])

        # Relocate the results, then load them from the new location
        shutil.move(original_path, moved_path)

        reloaded = AutoML(results_path=moved_path)
        after_move = reloaded.predict(X[:3])

        for idx in range(3):
            assert_almost_equal(before_move[idx], after_move[idx])
    def test_enough_time_to_train(self):
        """Check `_enough_time_to_train` before each of five logged runs.

        Two algorithms are configured with `total_time_limit=10`; only the
        first is queried and has training time (1 per run) logged.
        """
        first_algo, second_algo = "Xgboost", "LightGBM"

        automl = AutoML(
            results_path=self.automl_dir,
            total_time_limit=10,
            algorithms=[first_algo, second_algo],
        )

        for _run in range(5):
            # Expected to be true before every logged run
            self.assertTrue(automl._enough_time_to_train(first_algo))
            automl.log_train_time(first_algo, 1)
예제 #14
0
    def test_tune_only_default(self):
        """Expect exactly one model after fitting with `total_time_limit=1`.

        Uses `tuning_mode="Insane"` with Xgboost only on random binary data.
        """
        feature_names = [f"f{i}" for i in range(3)]
        features = pd.DataFrame(np.random.rand(self.rows, 3),
                                columns=feature_names)
        target = np.random.randint(0, 2, self.rows)

        settings = dict(
            results_path=self.automl_dir,
            total_time_limit=1,
            tuning_mode="Insane",
            algorithms=["Xgboost"],
        )
        automl = AutoML(**settings)

        automl.fit(features, target)
        self.assertEqual(len(automl._models), 1)
예제 #15
0
    def test_save_load(self):
        """Check that a reloaded AutoML reproduces the original predictions.

        A model is fitted and used for prediction. A second AutoML pointing
        at the same results directory must then predict identical values.
        """
        trained = AutoML(
            results_path=self.automl_dir,
            total_time_limit=10,
            explain_level=0,
            mode="Explain",
            train_ensemble=True,
            start_random_models=1,
        )

        dataset_options = dict(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_redundant=1,
            n_classes=2,
            n_clusters_per_class=3,
            n_repeated=0,
            shuffle=False,
            random_state=0,
        )
        X, y = datasets.make_classification(**dataset_options)
        column_names = [f"f_{i}" for i in range(X.shape[1])]
        X = pd.DataFrame(X, columns=column_names)

        trained.fit(X, y)
        original_pred = trained.predict(X)

        reloaded = AutoML(results_path=self.automl_dir)
        reloaded_pred = reloaded.predict(X)

        matches = original_pred == reloaded_pred
        self.assertTrue(matches.all())
예제 #16
0
    def test_tune_only_default(self):
        """Check that a second fit with an extra algorithm adds model entries.

        Models are counted as entries of the results directory whose name
        starts with a digit. After the first fit, `progress.json` is
        rewritten with `fit_level` set to "default_algorithms". A second
        fit with "Xgboost" added must then leave more such entries.
        """
        X = np.random.rand(self.rows, 3)
        X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)])
        y = np.random.randint(0, 2, self.rows)

        automl = AutoML(
            results_path=self.automl_dir,
            total_time_limit=3,
            algorithms=["Decision Tree"],
            explain_level=0,
            train_ensemble=False,
        )
        automl.fit(X, y)

        # Get number of starting models
        n1 = len([x for x in os.listdir(self.automl_dir) if x[0].isdigit()])

        progress_path = os.path.join(self.automl_dir, "progress.json")
        # Read through a context manager so the file handle is closed
        # before the same file is reopened for writing.
        with open(progress_path, "r") as fin:
            progress = json.load(fin)
        progress["fit_level"] = "default_algorithms"

        with open(progress_path, "w") as fout:
            fout.write(json.dumps(progress, indent=4))

        automl = AutoML(
            results_path=self.automl_dir,
            total_time_limit=3,
            algorithms=["Decision Tree", "Xgboost"],
            explain_level=0,
            train_ensemble=False,
        )
        automl.fit(X, y)
        # Get number of models after second fit
        n2 = len([x for x in os.listdir(self.automl_dir) if x[0].isdigit()])
        self.assertGreater(n2, n1)
    def test_set_model_time_limit(self):
        """Check `TimeController.enough_time` under a per-model time limit.

        A TimeController is installed on the AutoML instance by hand. Twelve
        runs of 10 each are logged for the "not_so_random" step, and
        `enough_time` is expected to be true after every one.
        """
        algorithm = "Xgboost"
        step = "not_so_random"
        automl = AutoML(
            results_path=self.automl_dir,
            model_time_limit=10,
            algorithms=[algorithm],
        )
        now = time.time()
        automl._time_ctrl = TimeController(
            now, None, 10, ["simple_algorithms", "not_so_random"], "Xgboost")

        for idx in range(12):
            model_name = f"Xgboost_{idx}"
            automl._time_ctrl.log_time(model_name, algorithm, step, 10)
            # Expected to hold after every logged run
            has_time = automl._time_ctrl.enough_time(algorithm, step)
            self.assertTrue(has_time)
예제 #18
0
    def test_encoding_strange_characters(self):
        """Fit a Baseline model whose class labels are non-ASCII characters.

        The test has no assertions: it passes when `fit()` completes
        without raising.
        """
        feature_names = [f"f{i}" for i in range(3)]
        features = pd.DataFrame(np.random.rand(self.rows, 3),
                                columns=feature_names)
        labels = ["ɛ", "🂲"] * int(self.rows / 2)
        target = np.random.permutation(labels)

        settings = dict(
            results_path=self.automl_dir,
            total_time_limit=1,
            algorithms=["Baseline"],
            train_ensemble=False,
            explain_level=0,
            start_random_models=1,
        )
        AutoML(**settings).fit(features, target)
예제 #19
0
    def test_one_column_input_regression(self):
        """Fit and predict on a regression dataset with a single feature.

        Predictions must be a numpy array with one entry per input row.
        """
        settings = dict(
            results_path=self.automl_dir,
            total_time_limit=5,
            explain_level=0,
            start_random_models=1,
        )
        model = AutoML(**settings)

        features, target = datasets.make_regression(n_features=1)

        model.fit(features, target)
        predictions = model.predict(features)

        self.assertIsInstance(predictions, np.ndarray)
        self.assertEqual(len(predictions), features.shape[0])
    def test_disable_stack_models_adjusted_validation(self):
        """Expect stacking to be off in "Compete" mode with a small time limit.

        Both feature columns are overwritten with the target (and its
        negation). After fitting with `total_time_limit=5`, the stacking
        flags on the AutoML object, its tuner and its time controller must
        all be false.
        """
        n_rows = 100
        X = np.random.uniform(size=(n_rows, 2))
        y = np.random.randint(0, 2, size=(n_rows, ))
        X[:, 0] = y
        X[:, 1] = -y

        settings = dict(
            results_path=self.automl_dir,
            total_time_limit=5,
            mode="Compete",
        )
        automl = AutoML(**settings)
        automl.fit(X, y)

        # Stacking is expected to be disabled because of the small time limit
        self.assertFalse(automl._stack_models)
        self.assertFalse(automl.tuner._stack_models)
        self.assertFalse(automl._time_ctrl._is_stacking)
예제 #21
0
    def test_one_column_input_bin_class(self):
        """Fit and predict a binary task on a one-column DataFrame.

        Predictions must be a numpy array with one entry per input row.
        """
        settings = dict(
            results_path=self.automl_dir,
            total_time_limit=5,
            explain_level=0,
            start_random_models=1,
        )
        model = AutoML(**settings)

        features = pd.DataFrame({"feature_1": np.random.rand(100)})
        # Random 0/1 target, independent of the feature
        draws = np.random.rand(features.shape[0])
        target = (draws > 0.5).astype(int)

        model.fit(features, target)
        predictions = model.predict(features)

        self.assertIsInstance(predictions, np.ndarray)
        self.assertEqual(len(predictions), features.shape[0])
예제 #22
0
 def test_empty_directory(self):
     """Check that an existing, empty results directory is accepted."""
     # Precondition: nothing exists at the results path yet
     self.assertFalse(os.path.exists(self.automl_dir))

     # Create the empty directory up front and confirm it is there
     os.mkdir(self.automl_dir)
     self.assertTrue(os.path.exists(self.automl_dir))

     model = AutoML(results_path=self.automl_dir)
     features, target = datasets.make_classification(n_samples=30)

     # fit() is called because AutoML only validates constructor params there
     model.fit(features, target)
     self.assertTrue(os.path.exists(self.automl_dir))
    def test_expected_learners_cnt(self):
        """Check `_expected_learners_cnt()` for several validation strategies.

        Expected counts: 42 for 7 folds with 6 repeats, 7 for 7 folds
        alone, and 1 for an empty strategy.
        """
        automl = AutoML(results_path=self.automl_dir)
        cases = (
            ({"k_folds": 7, "repeats": 6}, 42),
            ({"k_folds": 7}, 7),
            ({}, 1),
        )
        for strategy, expected_count in cases:
            automl._validation_strategy = strategy
            self.assertEqual(automl._expected_learners_cnt(), expected_count)
    def test_disable_stack_models(self):
        """Expect stacking to be off in "Compete" mode with split validation.

        Both feature columns are overwritten with the target (and its
        negation). After fitting, the stacking flags on the AutoML object,
        its tuner and its time controller must all be false.
        """
        n_rows = 100
        X = np.random.uniform(size=(n_rows, 2))
        y = np.random.randint(0, 2, size=(n_rows, ))
        X[:, 0] = y
        X[:, 1] = -y

        settings = dict(
            results_path=self.automl_dir,
            total_time_limit=5,
            mode="Compete",
            validation_strategy={"validation_type": "split"},
        )
        automl = AutoML(**settings)
        automl.fit(X, y)

        self.assertFalse(automl._stack_models)
        self.assertFalse(automl.tuner._stack_models)
        self.assertFalse(automl._time_ctrl._is_stacking)
예제 #25
0
    def test_different_input_types(self):
        """Test the different data input types for AutoML.

        The same regression data is passed as numpy arrays, as pandas
        DataFrames and as nested Python lists. A fresh model is built for
        each case, and predictions must always be a numpy array with one
        entry per input row.
        """
        X, y = datasets.make_regression()
        X_pandas = pd.DataFrame(X)
        y_pandas = pd.DataFrame(y)
        # The list case previously built these lists but then fitted and
        # predicted with the pandas objects, so list input was never tested.
        X_list = X_pandas.values.tolist()
        y_list = y_pandas.values.tolist()

        cases = (
            ("numpy", X, y),
            ("pandas", X_pandas, y_pandas),
            ("list", X_list, y_list),
        )
        for input_type, X_in, y_in in cases:
            # subTest reports which input type failed
            with self.subTest(input_type=input_type):
                model = AutoML(
                    total_time_limit=10,
                    explain_level=0,
                    start_random_models=1,
                    algorithms=["Linear"],
                    verbose=0,
                )
                pred = model.fit(X_in, y_in).predict(X_in)

                self.assertIsInstance(pred, np.ndarray)
                self.assertEqual(len(pred), X.shape[0])
예제 #26
0
 def test_dont_use_directory_if_non_empty_exists_without_params_json(self):
     """Expect AutoMLException for a non-empty results directory.

     The directory holds a single file named "test.file". The exception
     message is expected to contain "not empty".
     """
     os.mkdir(self.automl_dir)
     stray_file = os.path.join(self.automl_dir, "test.file")
     # Create an empty file so the directory is no longer empty
     with open(stray_file, "w"):
         pass
     self.assertTrue(os.path.exists(self.automl_dir))

     with self.assertRaises(AutoMLException) as raised:
         AutoML(results_path=self.automl_dir)
     self.assertIn("not empty", str(raised.exception))
예제 #27
0
    def test_category_data_type(self):
        """Fit CatBoost on data where one column has pandas "category" dtype.

        The test has no assertions: it passes when `fit()` completes
        without raising.
        """
        feature_names = [f"f{i}" for i in range(3)]
        features = pd.DataFrame(np.random.rand(self.rows, 3),
                                columns=feature_names)
        target = np.random.randint(0, 2, self.rows)

        # Convert one column to the categorical dtype
        features["f1"] = features["f1"].astype("category")

        settings = dict(
            results_path=self.automl_dir,
            total_time_limit=1,
            algorithms=["CatBoost"],
            train_ensemble=False,
            explain_level=0,
            start_random_models=1,
        )
        AutoML(**settings).fit(features, target)
예제 #28
0
    def test_repeated_kfold(self):
        """Check that repeated k-fold validation writes one learner per split.

        With 2 folds and 3 repeats, the "1_Default_RandomForest" directory
        must hold a `.random_forest` file and a `_training.log` file for
        every (fold, repeat) pair, 6 learners in total.
        """
        REPEATS = 3
        FOLDS = 2

        a = AutoML(
            results_path=self.automl_dir,
            total_time_limit=10,
            algorithms=["Random Forest"],
            train_ensemble=False,
            validation_strategy={
                "validation_type": "kfold",
                "k_folds": FOLDS,
                "repeats": REPEATS,
                "shuffle": True,
                "stratify": True,
            },
            start_random_models=1,
        )

        X, y = datasets.make_classification(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_redundant=1,
            n_classes=2,
            n_clusters_per_class=3,
            n_repeated=0,
            shuffle=False,
            random_state=0,
        )
        X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])

        a.fit(X, y)

        result_files = os.listdir(
            os.path.join(self.automl_dir, "1_Default_RandomForest"))

        cnt = 0
        for repeat in range(REPEATS):
            for fold in range(FOLDS):
                learner_name = construct_learner_name(fold, repeat, REPEATS)
                self.assertIn(f"{learner_name}.random_forest", result_files)
                self.assertIn(f"{learner_name}_training.log", result_files)
                cnt += 1
        # assertEqual, not assertTrue: assertTrue(cnt, 6) used 6 as the
        # failure message and passed for any non-zero count.
        self.assertEqual(cnt, REPEATS * FOLDS)
예제 #29
0
    def test_multi_class_0123(self):
        """Fit a four-class task with integer targets 0..3 and check output.

        The prediction frame must have one `prediction_<class>` column per
        class plus `label`, and at most four distinct predicted labels.
        """
        n_rows = self.rows * 4
        X = np.random.rand(n_rows, 3)
        X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)])
        y = np.random.randint(0, 4, n_rows)

        automl = AutoML(
            results_path=self.automl_dir,
            total_time_limit=1,
            algorithms=["Xgboost"],
            train_ensemble=False,
        )
        automl.set_advanced(start_random_models=1)
        automl.fit(X, y)
        pred = automl.predict(X)

        expected_columns = [
            "prediction_0",
            "prediction_1",
            "prediction_2",
            "prediction_3",
            "label",
        ]
        for col in expected_columns:
            self.assertTrue(col in pred.columns.tolist())
        u = np.unique(pred["label"].values)

        # NOTE(review): labels are compared as strings although y is
        # integer - confirm the dtype of the "label" column.
        self.assertTrue(any(lbl in u for lbl in ("0", "1", "2", "3")))
        self.assertTrue(len(u) <= 4)
예제 #30
0
    def test_multi_class_abcd_missing_target(self):
        """Fit a four-class task with string labels and one missing target.

        The prediction frame must have one `prediction_<class>` column per
        class plus `label`. The predicted labels must overlap the known
        classes and have at most four distinct values.
        """
        class_names = ["a", "B", "CC", "d"]
        X = np.random.rand(self.rows * 4, 3)
        X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)])
        shuffled = np.random.permutation(["a", "B", "CC", "d"] * self.rows)
        y = pd.Series(shuffled, name="target")

        # Blank out a single target value
        y.iloc[1] = None
        settings = dict(
            results_path=self.automl_dir,
            total_time_limit=1,
            algorithms=["Xgboost"],
            train_ensemble=False,
        )
        automl = AutoML(**settings)
        automl.set_advanced(start_random_models=1)
        automl.fit(X, y)
        pred = automl.predict(X)

        expected_columns = [
            "prediction_a",
            "prediction_B",
            "prediction_CC",
            "prediction_d",
            "label",
        ]
        for col in expected_columns:
            self.assertTrue(col in pred.columns.tolist())
        u = np.unique(pred["label"].values)

        overlap = np.intersect1d(u, class_names)
        self.assertTrue(overlap.shape[0] > 0)
        self.assertTrue(len(u) <= 4)