def test_breast_cancer_dataset(self):
    """Fit AutoML on the breast cancer data (binary classification) and check the score."""
    model = AutoML(
        explain_level=0,
        verbose=0,
        random_state=1,
        results_path=self.automl_dir,
    )
    features = breast_cancer.data
    target = breast_cancer.target

    # Score is computed on the same data the model was fitted on.
    fitted = model.fit(features, target)
    score = fitted.score(features, target)

    self.assertGreater(score, 0.5)
def test_set_model_time_limit(self):
    """Expect training to stay allowed when only `model_time_limit` is set."""
    algorithm = "Xgboost"
    automl = AutoML(
        results_path=self.automl_dir,
        model_time_limit=10,
        algorithms=[algorithm],
    )

    logged = 0
    while logged < 12:
        automl.log_train_time(algorithm, 10)
        # should be always true
        self.assertTrue(automl._enough_time_to_train(algorithm))
        logged += 1
def test_too_small_time_limit(self):
    """Expect AutoMLException when fitting 100000x100 data with `total_time_limit=1`."""
    n_rows = 100000
    X = np.random.uniform(size=(n_rows, 100))
    y = np.random.randint(0, 2, size=(n_rows,))

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        train_ensemble=False,
    )

    # Callable form of assertRaises: fit(X, y) must raise AutoMLException.
    self.assertRaises(AutoMLException, automl.fit, X, y)
def test_score_without_y(self):
    """Tests the use of `score()` without passing y. Should raise AutoMLException"""
    model = AutoML(explain_level=0, verbose=0, random_state=1)

    # Fit outside the assertRaises block so that only an exception coming
    # from `score()` can satisfy the assertion.
    fitted = model.fit(breast_cancer.data, breast_cancer.target)

    # Assert that an exception is raised when scoring without passing 'y'
    with self.assertRaises(AutoMLException) as context:
        fitted.score(breast_cancer.data)

    # assertIn reports both operands on failure, unlike assertTrue(a in b).
    self.assertIn("y must be specified", str(context.exception))
def test_custom_init(self):
    """Check that `_update_errors_report` writes the message to `errors.md`."""
    automl = AutoML(results_path=self.automl_dir)
    automl._update_errors_report("model_1", "bad error")

    errors_filename = os.path.join(self.automl_dir, "errors.md")
    self.assertTrue(os.path.exists(errors_filename))

    # Read through a context manager so the file handle is always closed.
    with open(errors_filename) as fin:
        report = fin.read()
    self.assertIn("bad error", report)
def __init__(self, n_folds_validation: int, shuffle_data: bool, max_rand: int) -> None:
    """Initialise the parent class and create the AutoML estimator.

    All three arguments are forwarded unchanged to the parent constructor.
    The estimator is configured from `self._random_state`,
    `self._n_folds_validation` and `self._shuffle_data`
    (presumably set by the parent constructor -- not visible here).
    """
    super().__init__(n_folds_validation, shuffle_data, max_rand)

    seed = self._random_state
    kfold_validation = {
        "validation_type": "kfold",
        "k_folds": self._n_folds_validation,
        "shuffle": self._shuffle_data,
    }

    # The estimator is an AutoML instance running in "Compete" mode.
    self.estimator = AutoML(
        mode="Compete",
        explain_level=0,
        random_state=seed,
        validation_strategy=kfold_validation,
    )
def test_new_directory(self):
    """ Directory does not exist, create it """
    results_dir = self.automl_dir

    # Precondition: nothing exists at the results path yet.
    self.assertFalse(os.path.exists(results_dir))

    model = AutoML(results_path=results_dir)
    X, y = datasets.make_classification(n_samples=30)

    # AutoML only validates constructor params on `fit()` call
    model.fit(X, y)

    # The directory must exist after fitting.
    self.assertTrue(os.path.exists(results_dir))
def test_bin_class_01(self):
    """Check `_start_random_models` after `_estimate_training_times()`."""
    # Random binary-classification data; it is built but not passed to AutoML here.
    raw = np.random.rand(self.rows, 3)
    X = pd.DataFrame(raw, columns=[f"f{i}" for i in range(3)])
    y = np.random.randint(0, 2, self.rows)

    settings = dict(
        results_path=self.automl_dir,
        total_time_limit=1,
        tuning_mode="Insane",
        algorithms=["Xgboost"],
    )
    automl = AutoML(**settings)
    automl._estimate_training_times()

    self.assertEqual(automl._start_random_models, 15)
def test_set_model_time_limit_omit_total_time(self):
    """Expect training to stay allowed when both time limits are passed."""
    algorithm = "Xgboost"
    automl = AutoML(
        results_path=self.automl_dir,
        model_time_limit=10,
        # this parameter setting should be omitted
        total_time_limit=10,
        algorithms=[algorithm],
    )

    remaining = 12
    while remaining:
        automl.log_train_time(algorithm, 10)
        # should be always true
        self.assertTrue(automl._enough_time_to_train(algorithm))
        remaining -= 1
def test_tune_only_default(self):
    """Refit in the same directory with an extra algorithm and expect more models.

    After the first fit, `fit_level` in `progress.json` is rewritten to
    "default_algorithms" before a second AutoML with an additional
    algorithm is fitted on the same results path.
    """
    X = np.random.rand(self.rows, 3)
    X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)])
    y = np.random.randint(0, 2, self.rows)

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=3,
        tuning_mode="Explain",
        algorithms=["Decision Tree"],
        explain_level=0,
        train_ensemble=False,
    )
    automl.fit(X, y)
    iter_1_models_cnt = len(automl._models)

    # Rewrite the saved fit level; both file handles are closed by `with`.
    progress_path = os.path.join(self.automl_dir, "progress.json")
    with open(progress_path, "r") as fin:
        progress = json.load(fin)
    progress["fit_level"] = "default_algorithms"
    with open(progress_path, "w") as fout:
        fout.write(json.dumps(progress, indent=4))

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=3,
        tuning_mode="Explain",
        algorithms=["Decision Tree", "Xgboost"],
        explain_level=0,
        train_ensemble=False,
    )
    automl.fit(X, y)

    # assertGreater reports both counts on failure.
    self.assertGreater(len(automl._models), iter_1_models_cnt)
def test_bin_class_AB_missing_targets(self):
    """Fit a binary task with string labels where some targets are missing.

    Three target entries are blanked out (one `None`, two NaN) before
    fitting; the prediction frame must contain the per-class columns and
    a `label` column holding at most two distinct values.
    """
    X = np.random.rand(self.rows, 3)
    X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)])
    y = pd.Series(
        np.random.permutation(["a", "B"] * int(self.rows / 2)), name="target"
    )

    # Inject missing targets. `np.nan` is used throughout because the
    # `np.NaN` alias was removed in NumPy 2.0.
    y.iloc[1] = None
    y.iloc[3] = np.nan
    y.iloc[13] = np.nan

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
    )
    automl.set_advanced(start_random_models=1)
    automl.fit(X, y)

    pred = automl.predict(X)

    columns = pred.columns.tolist()
    for col in ["prediction_a", "prediction_B", "label"]:
        self.assertIn(col, columns)

    u = np.unique(pred["label"].values)
    self.assertTrue("a" in u or "B" in u)
    self.assertLessEqual(len(u), 2)
def test_compute_predictions_after_dir_change(self):
    """Predictions must match after the results directory is moved.

    test for https://github.com/mljar/mljar-supervised/issues/384
    """
    for parent_dir in (self.automl_dir_a, self.automl_dir_b):
        self.create_dir(parent_dir)

    path_a = os.path.join(self.automl_dir_a, self.automl_dir)
    path_b = os.path.join(self.automl_dir_b, self.automl_dir)

    regression_kwargs = dict(
        n_samples=100,
        n_features=5,
        n_informative=4,
        n_targets=1,
        shuffle=False,
        random_state=0,
    )
    X, y = datasets.make_regression(**regression_kwargs)

    automl = AutoML(
        results_path=path_a,
        explain_level=0,
        ml_task="regression",
        total_time_limit=10,
    )
    automl.fit(X, y)
    head = X[:3]
    p = automl.predict(head)

    # Relocate the results, then load them from the new location.
    shutil.move(path_a, path_b)
    automl2 = AutoML(results_path=path_b)
    p2 = automl2.predict(head)

    for idx in (0, 1, 2):
        assert_almost_equal(p[idx], p2[idx])
def test_enough_time_to_train(self):
    """Expect enough time for Xgboost before each of five logged one-unit trainings."""
    first_algorithm = "Xgboost"
    second_algorithm = "LightGBM"
    automl = AutoML(
        results_path=self.automl_dir,
        # this parameter setting should be omitted
        total_time_limit=10,
        algorithms=[first_algorithm, second_algorithm],
    )

    for _ in range(5):
        # should be always true
        has_time = automl._enough_time_to_train(first_algorithm)
        self.assertTrue(has_time)
        automl.log_train_time(first_algorithm, 1)
def test_tune_only_default(self):
    """Expect exactly one model after fitting Xgboost with `total_time_limit=1`."""
    feature_names = [f"f{i}" for i in range(3)]
    X = pd.DataFrame(np.random.rand(self.rows, 3), columns=feature_names)
    y = np.random.randint(0, 2, self.rows)

    settings = dict(
        results_path=self.automl_dir,
        total_time_limit=1,
        tuning_mode="Insane",
        algorithms=["Xgboost"],
    )
    automl = AutoML(**settings)
    automl.fit(X, y)

    trained = len(automl._models)
    self.assertEqual(trained, 1)
def test_save_load(self):
    """A second AutoML created on the same results path must predict identically."""
    a = AutoML(
        results_path=self.automl_dir,
        total_time_limit=10,
        explain_level=0,
        mode="Explain",
        train_ensemble=True,
        start_random_models=1,
    )

    data_kwargs = dict(
        n_samples=100,
        n_features=5,
        n_informative=4,
        n_redundant=1,
        n_classes=2,
        n_clusters_per_class=3,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )
    X, y = datasets.make_classification(**data_kwargs)
    n_features = X.shape[1]
    X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(n_features)])

    a.fit(X, y)
    p = a.predict(X)

    # A fresh instance pointing at the same directory predicts without fit().
    a2 = AutoML(results_path=self.automl_dir)
    p2 = a2.predict(X)

    identical = (p == p2).all()
    self.assertTrue(identical)
def test_tune_only_default(self):
    """Refit in the same directory with an extra algorithm and expect more model dirs.

    Models are counted as the entries of the results directory whose name
    starts with a digit. Between the two fits, `fit_level` in
    `progress.json` is rewritten to "default_algorithms".
    """

    def count_model_dirs():
        # Entries whose name begins with a digit are counted as models.
        return len([x for x in os.listdir(self.automl_dir) if x[0].isdigit()])

    X = np.random.rand(self.rows, 3)
    X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)])
    y = np.random.randint(0, 2, self.rows)

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=3,
        algorithms=["Decision Tree"],
        explain_level=0,
        train_ensemble=False,
    )
    automl.fit(X, y)

    # Get number of starting models
    n1 = count_model_dirs()

    # Rewrite the saved fit level; both file handles are closed by `with`.
    progress_path = os.path.join(self.automl_dir, "progress.json")
    with open(progress_path, "r") as fin:
        progress = json.load(fin)
    progress["fit_level"] = "default_algorithms"
    with open(progress_path, "w") as fout:
        fout.write(json.dumps(progress, indent=4))

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=3,
        algorithms=["Decision Tree", "Xgboost"],
        explain_level=0,
        train_ensemble=False,
    )
    automl.fit(X, y)

    # Get number of models after second fit
    n2 = count_model_dirs()
    self.assertGreater(n2, n1)
def test_set_model_time_limit(self):
    """Expect `enough_time` to stay true while training times are logged on a TimeController."""
    algorithm = "Xgboost"
    fit_level = "not_so_random"

    automl = AutoML(
        results_path=self.automl_dir,
        model_time_limit=10,
        algorithms=[algorithm],
    )
    # Replace the controller with one built directly from known arguments.
    automl._time_ctrl = TimeController(
        time.time(),
        None,
        10,
        ["simple_algorithms", "not_so_random"],
        "Xgboost",
    )
    controller = automl._time_ctrl

    for model_name in (f"Xgboost_{i}" for i in range(12)):
        controller.log_time(model_name, algorithm, fit_level, 10)
        # should be always true
        self.assertTrue(controller.enough_time(algorithm, fit_level))
def test_encoding_strange_characters(self):
    """Fit with non-ASCII class labels; the test passes if `fit` does not raise."""
    feature_names = [f"f{i}" for i in range(3)]
    X = pd.DataFrame(np.random.rand(self.rows, 3), columns=feature_names)

    labels = ["ɛ", "🂲"]
    y = np.random.permutation(labels * int(self.rows / 2))

    settings = dict(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Baseline"],
        train_ensemble=False,
        explain_level=0,
        start_random_models=1,
    )
    AutoML(**settings).fit(X, y)
def test_one_column_input_regression(self):
    """Fit and predict on single-feature regression data; check prediction type and length."""
    settings = dict(
        results_path=self.automl_dir,
        total_time_limit=5,
        explain_level=0,
        start_random_models=1,
    )
    a = AutoML(**settings)

    X, y = datasets.make_regression(n_features=1)
    a.fit(X, y)
    predictions = a.predict(X)

    n_rows = X.shape[0]
    self.assertIsInstance(predictions, np.ndarray)
    self.assertEqual(len(predictions), n_rows)
def test_disable_stack_models_adjusted_validation(self):
    """Expect all stacking flags to be off after a Compete fit with `total_time_limit=5`."""
    n_rows = 100
    X = np.random.uniform(size=(n_rows, 2))
    y = np.random.randint(0, 2, size=(n_rows,))
    # Both feature columns are overwritten with the target and its negation.
    X[:, 0] = y
    X[:, 1] = -y

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=5,
        mode="Compete",
    )
    automl.fit(X, y)

    # the stacking should be disabled
    # because of small time limit
    stacking_flags = (
        automl._stack_models,
        automl.tuner._stack_models,
        automl._time_ctrl._is_stacking,
    )
    for flag in stacking_flags:
        self.assertFalse(flag)
def test_one_column_input_bin_class(self):
    """Fit and predict on a one-column frame with a 0/1 target; check prediction type and length."""
    settings = dict(
        results_path=self.automl_dir,
        total_time_limit=5,
        explain_level=0,
        start_random_models=1,
    )
    a = AutoML(**settings)

    X = pd.DataFrame({"feature_1": np.random.rand(100)})
    n_rows = X.shape[0]
    # Threshold uniform noise at 0.5 to obtain an integer 0/1 target.
    y = (np.random.rand(n_rows) > 0.5).astype(int)

    a.fit(X, y)
    predictions = a.predict(X)

    self.assertIsInstance(predictions, np.ndarray)
    self.assertEqual(len(predictions), n_rows)
def test_empty_directory(self):
    """ Directory exists and is empty, use it """
    results_dir = self.automl_dir

    # Start from a clean state, then create the empty directory ourselves.
    self.assertFalse(os.path.exists(results_dir))
    os.mkdir(results_dir)
    self.assertTrue(os.path.exists(results_dir))

    model = AutoML(results_path=results_dir)
    X, y = datasets.make_classification(n_samples=30)

    # AutoML only validates constructor params on `fit()` call
    model.fit(X, y)

    self.assertTrue(os.path.exists(results_dir))
def test_expected_learners_cnt(self):
    """Check `_expected_learners_cnt()` for three validation-strategy dictionaries."""
    automl = AutoML(results_path=self.automl_dir)

    # (validation strategy, expected learner count)
    cases = (
        ({"k_folds": 7, "repeats": 6}, 42),
        ({"k_folds": 7}, 7),
        ({}, 1),
    )
    for strategy, expected in cases:
        automl._validation_strategy = strategy
        self.assertEqual(automl._expected_learners_cnt(), expected)
def test_disable_stack_models(self):
    """Expect all stacking flags to be off after a Compete fit with "split" validation."""
    n_rows = 100
    X = np.random.uniform(size=(n_rows, 2))
    y = np.random.randint(0, 2, size=(n_rows,))
    # Both feature columns are overwritten with the target and its negation.
    X[:, 0] = y
    X[:, 1] = -y

    settings = dict(
        results_path=self.automl_dir,
        total_time_limit=5,
        mode="Compete",
        validation_strategy={"validation_type": "split"},
    )
    automl = AutoML(**settings)
    automl.fit(X, y)

    stacking_flags = (
        automl._stack_models,
        automl.tuner._stack_models,
        automl._time_ctrl._is_stacking,
    )
    for flag in stacking_flags:
        self.assertFalse(flag)
def test_different_input_types(self):
    """ Test the different data input types for AutoML

    The same regression data is fitted three times, each with a fresh
    model: as numpy arrays, as pandas DataFrames and as Python lists.
    Every run must return a numpy array with one prediction per row.
    """

    def new_model():
        # Each scenario gets its own, identically configured model.
        return AutoML(
            total_time_limit=10,
            explain_level=0,
            start_random_models=1,
            algorithms=["Linear"],
            verbose=0,
        )

    X, y = datasets.make_regression()
    n_rows = X.shape[0]

    # First test - X and y as numpy arrays
    pred = new_model().fit(X, y).predict(X)
    self.assertIsInstance(pred, np.ndarray)
    self.assertEqual(len(pred), n_rows)

    # Second test - X and y as pandas dataframe
    X_pandas = pd.DataFrame(X)
    y_pandas = pd.DataFrame(y)
    pred_pandas = new_model().fit(X_pandas, y_pandas).predict(X_pandas)
    self.assertIsInstance(pred_pandas, np.ndarray)
    self.assertEqual(len(pred_pandas), n_rows)

    # Third test - X and y as lists
    # NOTE(review): this scenario used to fit/predict on the pandas objects,
    # so list input was never exercised. Confirm AutoML accepts the nested
    # list produced for y.
    X_list = pd.DataFrame(X).values.tolist()
    y_list = pd.DataFrame(y).values.tolist()
    pred_list = new_model().fit(X_list, y_list).predict(X_list)
    self.assertIsInstance(pred_list, np.ndarray)
    self.assertEqual(len(pred_list), n_rows)
def test_dont_use_directory_if_non_empty_exists_without_params_json(self):
    """ Directory exists and is not empty, dont use it, raise exception """
    results_dir = self.automl_dir
    os.mkdir(results_dir)

    # Drop an empty placeholder file so the directory is not empty.
    placeholder = os.path.join(results_dir, "test.file")
    with open(placeholder, "w"):
        pass
    self.assertTrue(os.path.exists(results_dir))

    with self.assertRaises(AutoMLException) as context:
        AutoML(results_path=results_dir)

    message = str(context.exception)
    self.assertIn("not empty", message)
def test_category_data_type(self):
    """Fit CatBoost on data with a `category` dtype column; passes if `fit` does not raise."""
    feature_names = [f"f{i}" for i in range(3)]
    X = pd.DataFrame(np.random.rand(self.rows, 3), columns=feature_names)
    y = np.random.randint(0, 2, self.rows)

    # Convert one column to the pandas "category" dtype.
    X["f1"] = X["f1"].astype("category")

    settings = dict(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["CatBoost"],
        train_ensemble=False,
        explain_level=0,
        start_random_models=1,
    )
    AutoML(**settings).fit(X, y)
def test_repeated_kfold(self):
    """Check that repeated k-fold validation stores one learner per (repeat, fold).

    After fitting a Random Forest with FOLDS folds and REPEATS repeats, the
    `1_Default_RandomForest` directory must contain a model file and a
    training log for every learner name built by `construct_learner_name`.
    """
    REPEATS = 3
    FOLDS = 2

    a = AutoML(
        results_path=self.automl_dir,
        total_time_limit=10,
        algorithms=["Random Forest"],
        train_ensemble=False,
        validation_strategy={
            "validation_type": "kfold",
            "k_folds": FOLDS,
            "repeats": REPEATS,
            "shuffle": True,
            "stratify": True,
        },
        start_random_models=1,
    )

    X, y = datasets.make_classification(
        n_samples=100,
        n_features=5,
        n_informative=4,
        n_redundant=1,
        n_classes=2,
        n_clusters_per_class=3,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )
    X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])

    a.fit(X, y)

    result_files = os.listdir(
        os.path.join(self.automl_dir, "1_Default_RandomForest")
    )

    cnt = 0
    for repeat in range(REPEATS):
        for fold in range(FOLDS):
            learner_name = construct_learner_name(fold, repeat, REPEATS)
            self.assertIn(f"{learner_name}.random_forest", result_files)
            self.assertIn(f"{learner_name}_training.log", result_files)
            cnt += 1

    # The former `assertTrue(cnt, 6)` treated 6 as the failure message and
    # passed for any non-zero count; compare the values instead.
    self.assertEqual(cnt, REPEATS * FOLDS)
def test_multi_class_0123(self):
    """Fit a four-class task with integer targets 0..3 and check the prediction frame."""
    feature_names = [f"f{i}" for i in range(3)]
    X = pd.DataFrame(np.random.rand(self.rows * 4, 3), columns=feature_names)
    y = np.random.randint(0, 4, self.rows * 4)

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
    )
    automl.set_advanced(start_random_models=1)
    automl.fit(X, y)
    pred = automl.predict(X)

    # One probability column per class plus the predicted label.
    expected_columns = [f"prediction_{k}" for k in range(4)] + ["label"]
    for col in expected_columns:
        self.assertIn(col, pred.columns.tolist())

    u = np.unique(pred["label"].values)
    self.assertTrue(any(name in u for name in ("0", "1", "2", "3")))
    self.assertLessEqual(len(u), 4)
def test_multi_class_abcd_missing_target(self):
    """Fit a four-class task with string labels and one missing target; check the prediction frame."""
    class_names = ["a", "B", "CC", "d"]

    feature_names = [f"f{i}" for i in range(3)]
    X = pd.DataFrame(np.random.rand(self.rows * 4, 3), columns=feature_names)
    y = pd.Series(np.random.permutation(class_names * self.rows), name="target")
    # Blank out a single target value.
    y.iloc[1] = None

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
    )
    automl.set_advanced(start_random_models=1)
    automl.fit(X, y)
    pred = automl.predict(X)

    # One probability column per class plus the predicted label.
    expected_columns = [f"prediction_{name}" for name in class_names] + ["label"]
    for col in expected_columns:
        self.assertIn(col, pred.columns.tolist())

    u = np.unique(pred["label"].values)
    overlap = np.intersect1d(u, ["a", "B", "CC", "d"])
    self.assertGreater(overlap.shape[0], 0)
    self.assertLessEqual(len(u), 4)