def test_extensive_eda(self):
    """Run extensive EDA on a regression and a classification dataset.

    For each dataset the EDA output directory must contain one
    ``<feature>_target.png`` plot per column, the correlation heatmap and
    the markdown report.
    """
    # Same checks for both task types: regression first, then classification.
    for make_dataset in (datasets.make_regression, datasets.make_classification):
        X, y = make_dataset(n_samples=100, n_features=5)
        X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
        y = pd.Series(y, name="class")
        results_path = self.automl_dir
        EDA.extensive_eda(X, y, results_path)
        produced = os.listdir(results_path)
        for col in X.columns:
            self.assertTrue(f"{col}_target.png" in produced)
        self.assertTrue("heatmap.png" in produced)
        self.assertTrue("Extensive_EDA.md" in produced)
    self.tearDown()
def test_naughty_column_name_to_filename(self):
    """Test that naughty column names map to writable file paths.

    Strings taken from
    https://github.com/minimaxir/big-list-of-naughty-strings

    Fix: the original list was missing three commas, so adjacent string
    literals were implicitly concatenated and six intended entries were
    silently merged into three. The commas are restored so every naughty
    string is exercised individually.
    """
    os.mkdir(self.automl_dir)
    naughty_columns = [
        "feature_1",
        "*",
        "😍",
        "¯\_(ツ)_/¯",
        "表",
        "𠜎𠜱𠝹𠱓",
        "عاملة بولندا",
        "Ṱ̺̺̕o͞ ̷",
        "🇸🇦🇫🇦🇲",
        "⁰⁴⁵",
        "∆˚¬…æ",
        "!@#$%^&*()`~",
        "onfocus=JaVaSCript:alert(123) autofocus",
        "`\"'><img src=xxx:x \x20onerror=javascript:alert(1)>",
        'System("ls -al /")',
        'Kernel.exec("ls -al /")',
        "لُلُصّبُلُل",
        "{% print 'x' * 64 * 1024**3 %}",
        '{{ "".__class__.__mro__[2].__subclasses__()[40]("/etc/passwd").read() }}',
        "ÜBER Über German Umlaut",
        "影師嗎",
        "C'est déjà l'été.",
        "Nín hǎo. Wǒ shì zhōng guó rén",
        "Компьютер",
        "jaja---lol-méméméoo--a",
    ]
    for col in naughty_columns:
        # plot_path must sanitize the column name into a valid filename;
        # the open() below fails if it does not.
        fname = EDA.plot_path(self.automl_dir, col)
        with open(fname, "w") as fout:
            fout.write("ok")
def test_symbol_feature(self):
    """Columns whose names contain filename-forbidden symbols still get plots."""
    X, y = datasets.make_regression(n_samples=100, n_features=5)
    X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
    # Inject names that are illegal (or problematic) in file systems.
    X.rename({"f_0": "ff*", "f_1": "fg/"}, axis=1, inplace=True)
    y = pd.Series(y, name="class")
    results_path = self.automl_dir
    EDA.extensive_eda(X, y, results_path)
    produced = os.listdir(results_path)
    # Each column's plot file name is derived via EDA.plot_fname sanitization.
    for col in X.columns:
        self.assertTrue(EDA.plot_fname(f"{col}_target") in produced)
    for expected in ("heatmap.png", "Extensive_EDA.md"):
        self.assertTrue(expected in produced)
    self.tearDown()
def test_extensive_eda_missing(self):
    """Test extensive EDA on a dataframe containing missing values.

    The original test ran the identical regression scenario twice,
    copy-pasted back to back; the second pass also exercises writing the
    report into an already-populated results directory. The repetition is
    kept but expressed as a loop to remove the duplication.
    """
    for _ in range(2):
        X, y = datasets.make_regression(n_samples=100, n_features=5)
        X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
        y = pd.Series(y, name="class")
        # add some nan values to one feature
        X.loc[np.random.randint(0, 100, 20), "f_0"] = np.nan
        results_path = self.automl_dir
        EDA.extensive_eda(X, y, results_path)
        result_files = os.listdir(results_path)
        for col in X.columns:
            self.assertTrue(f"{col}_target.png" in result_files)
        self.assertTrue("heatmap.png" in result_files)
        self.assertTrue("Extensive_EDA.md" in result_files)
    self.tearDown()
def _fit(self, X, y):
    """Fits the AutoML model with data.

    Orchestrates the whole training pipeline: resolves the effective
    configuration from the constructor parameters, optionally runs the
    automatic EDA, persists the training data, then walks the
    tuner-generated steps training (or skipping) each candidate model
    while tracking elapsed time and saving progress after every model.

    :param X: training features (converted via ``_build_dataframe``).
    :param y: training target (converted via ``_build_dataframe``).
    :return: ``self``.
    """
    # Fast exit: calling fit() again on a finished model is a no-op.
    if self._fit_level == "finished":
        print(
            "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new a 'fit()'."
        )
        return
    # Validate input and build dataframes
    X, y = self._build_dataframe(X, y)
    self.n_features_in_ = X.shape[1]  # sklearn-style fitted attribute
    # Count of distinct non-null target values (class count for classification).
    self.n_classes = len(np.unique(y[~pd.isnull(y)]))
    # Get attributes (__init__ params); each _get_* resolves "auto" defaults
    # into concrete values for this run.
    self._mode = self._get_mode()
    self._ml_task = self._get_ml_task()
    self._results_path = self._get_results_path()
    self._total_time_limit = self._get_total_time_limit()
    self._model_time_limit = self._get_model_time_limit()
    self._algorithms = self._get_algorithms()
    self._train_ensemble = self._get_train_ensemble()
    self._stack_models = self._get_stack_models()
    self._eval_metric = self._get_eval_metric()
    self._validation_strategy = self._get_validation_strategy()
    self._verbose = self._get_verbose()
    self._explain_level = self._get_explain_level()
    self._golden_features = self._get_golden_features()
    self._feature_selection = self._get_feature_selection()
    self._start_random_models = self._get_start_random_models()
    self._hill_climbing_steps = self._get_hill_climbing_steps()
    self._top_models_to_improve = self._get_top_models_to_improve()
    self._random_state = self._get_random_state()
    try:
        # Resume from a previous (possibly interrupted) run, if any.
        self.load_progress()
        if self._fit_level == "finished":
            print(
                "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'."
            )
            return
        self._check_can_load()

        self.verbose_print(f"AutoML directory: {self._results_path}")
        self.verbose_print(
            f"The task is {self._ml_task} with evaluation metric {self._eval_metric}"
        )
        self.verbose_print(f"AutoML will use algorithms: {self._algorithms}")
        if self._stack_models:
            self.verbose_print("AutoML will stack models")
        if self._train_ensemble:
            self.verbose_print("AutoML will ensemble availabe models")

        self._start_time = time.time()
        # Back-date the start time by time already spent in a resumed run
        # so total-time budgeting stays accurate.
        if self._time_ctrl is not None:
            self._start_time -= self._time_ctrl.already_spend()

        # Automatic Exploratory Data Analysis (only at the highest explain level)
        if self._explain_level == 2:
            EDA.compute(X, y, os.path.join(self._results_path, "EDA"))

        # Save data (shallow copy: columns are shared, frame object is not)
        self._save_data(X.copy(deep=False), y)

        tuner = MljarTuner(
            self._get_tuner_params(
                self._start_random_models,
                self._hill_climbing_steps,
                self._top_models_to_improve,
            ),
            self._algorithms,
            self._ml_task,
            self._validation_strategy,
            self._explain_level,
            self._data_info,
            self._golden_features,
            self._feature_selection,
            self._train_ensemble,
            self._stack_models,
            self._random_state,
        )
        self.tuner = tuner

        steps = tuner.steps()
        self.verbose_print(f"AutoML steps: {steps}")
        if self._time_ctrl is None:
            self._time_ctrl = TimeController(
                self._start_time,
                self._total_time_limit,
                self._model_time_limit,
                steps,
                self._algorithms,
            )

        self._time_ctrl.log_time(
            "prepare_data",
            "prepare_data",
            "prepare_data",
            time.time() - self._start_time,
        )

        for step in steps:
            self._fit_level = step
            start = time.time()
            # self._time_start[step] = start

            if step == "stack":
                self.prepare_for_stacking()

            # Reuse parameters from a resumed run when available,
            # otherwise generate fresh ones for this step.
            generated_params = []
            if step in self._all_params:
                generated_params = self._all_params[step]
            else:
                generated_params = tuner.generate_params(
                    step, self._models, self._results_path, self._stacked_models
                )

            if generated_params is None or not generated_params:
                self.verbose_print(f"Skip {step} because no parameters were generated.")
                continue
            if generated_params:
                # Skip the whole step if the time budget cannot fit one more
                # model of this type.
                if "learner" in generated_params[0] and not self._time_ctrl.enough_time(
                    generated_params[0]["learner"]["model_type"], self._fit_level
                ):
                    self.verbose_print(f"Skip {step} because of the time limit.")
                else:
                    model_str = "models" if len(generated_params) > 1 else "model"
                    self.verbose_print(
                        f"* Step {step} will try to check up to {len(generated_params)} {model_str}"
                    )

            for params in generated_params:
                # Already handled in a previous (resumed) run.
                if params.get("status", "") in ["trained", "skipped", "error"]:
                    self.verbose_print(f"{params['name']}: {params['status']}.")
                    continue

                try:
                    trained = False
                    if "ensemble" in step:
                        trained = self.ensemble_step(is_stacked=params["is_stacked"])
                    else:
                        trained = self.train_model(params)
                    params["status"] = "trained" if trained else "skipped"
                    params["final_loss"] = self._models[-1].get_final_loss()
                    params["train_time"] = self._models[-1].get_train_time()
                except Exception as e:
                    # A failing model must not abort the whole search;
                    # record the error and move on.
                    self._update_errors_report(params.get("name"), str(e))
                    params["status"] = "error"

                # Persist after every model so an interrupted run can resume.
                self.save_progress(step, generated_params)

        self._fit_level = "finished"
        self.save_progress()
        self.verbose_print(
            f"AutoML fit time: {np.round(time.time() - self._start_time,2)} seconds"
        )
    except Exception as e:
        raise e
    finally:
        # Release the in-memory copies of X/y that _save_data swapped out.
        if self._X_path is not None:
            self._load_data_variables(X)

    return self
def fit(self, X_train, y_train, X_validation=None, y_validation=None):
    """
    Fit AutoML

    :param X_train: Pandas DataFrame with training data.
    :param y_train: Numpy Array with target training data.

    :param X_validation: Pandas DataFrame with validation data. (Not implemented yet)
    :param y_validation: Numpy Array with target of validation data. (Not implemented yet)
    """
    try:
        # Resume from a previous (possibly interrupted) run, if any.
        self.load_progress()
        if self._fit_level == "finished":
            print("AutoML is trained. Skipping fit step ...")
            return
        # if self._best_model is not None:
        #     print("Best model is already set, no need to run fit. Skipping ...")
        #     return
        self._start_time = time.time()
        # Back-date the start time by time already spent in a resumed run
        # so total-time budgeting stays accurate.
        if self._time_ctrl is not None:
            self._start_time -= self._time_ctrl.already_spend()

        if not isinstance(X_train, pd.DataFrame):
            raise AutoMLException(
                "AutoML needs X_train matrix to be a Pandas DataFrame"
            )

        # Automatic Exploratory Data Analysis (only at the highest explain level)
        if self._explain_level == 2:
            EDA.compute(X_train, y_train, os.path.join(self._results_path, "EDA"))

        # Infer the ML task from the target values.
        self._set_ml_task(y_train)
        # Shallow copy so _initial_prep mutations do not touch the caller's frame.
        if X_train is not None:
            X_train = X_train.copy(deep=False)
        X_train, y_train, X_validation, y_validation = self._initial_prep(
            X_train, y_train, X_validation, y_validation
        )
        self._save_data(X_train, y_train, X_validation, y_validation)
        self._set_algorithms()
        self._set_metric()
        # Warn/adjust on imbalanced targets for classification tasks.
        if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]:
            self._check_imbalanced(y_train)

        tuner = MljarTuner(
            self._tuner_params,
            self._algorithms,
            self._ml_task,
            self._validation,
            self._explain_level,
            self._data_info,
            self._golden_features,
            self._feature_selection,
            self._train_ensemble,
            self._stack_models,
            self._seed,
        )
        self.tuner = tuner

        steps = tuner.steps()
        if self._time_ctrl is None:
            self._time_ctrl = TimeController(
                self._start_time,
                self._total_time_limit,
                self._model_time_limit,
                steps,
                self._algorithms,
            )

        self._time_ctrl.log_time(
            "prepare_data", "prepare_data", "prepare_data", time.time() - self._start_time
        )

        for step in steps:
            self._fit_level = step
            start = time.time()
            # self._time_start[step] = start

            if step == "stack":
                self.prepare_for_stacking()

            # Reuse parameters from a resumed run when available,
            # otherwise generate fresh ones for this step.
            generated_params = []
            if step in self._all_params:
                generated_params = self._all_params[step]
            else:
                generated_params = tuner.generate_params(
                    step, self._models, self._results_path, self._stacked_models
                )

            if generated_params is None:
                continue
            if generated_params:
                print("-" * 72)
                print(f"{step} with {len(generated_params)} models to train ...")

            for params in generated_params:
                # Already handled in a previous (resumed) run.
                if params.get("status", "") == "trained":
                    print(f"Skipping {params['name']}, already trained.")
                    continue
                if params.get("status", "") == "skipped":
                    print(f"Skipped {params['name']}.")
                    continue

                trained = False
                if "ensemble" in step:
                    trained = self.ensemble_step(is_stacked=params["is_stacked"])
                else:
                    trained = self.train_model(params)
                params["status"] = "trained" if trained else "skipped"
                params["final_loss"] = self._models[-1].get_final_loss()
                params["train_time"] = self._models[-1].get_train_time()
                # Persist after every model so an interrupted run can resume.
                self.save_progress(step, generated_params)

        self._fit_level = "finished"
        self.save_progress()
        print(f"AutoML fit time: {time.time() - self._start_time}")
    except Exception as e:
        raise e
    finally:
        # Release the in-memory copies of the data that _save_data swapped out.
        if self._X_train_path is not None:
            self._load_data_variables(X_train)
def test_column_name_to_filename(self):
    """A well-behaved feature name must pass through EDA.prepare unchanged."""
    name = "feature_1"
    self.assertEqual(EDA.prepare(name), name)
def _fit(self, X, y):
    """Fits the AutoML model with data.

    Orchestrates the training pipeline: resolves the effective
    configuration from the constructor parameters, optionally runs the
    automatic EDA, persists the training data, then walks the
    tuner-generated steps training (or skipping) each candidate model
    while tracking elapsed time and saving progress after every model.

    :param X: training features (converted via ``_build_dataframe``).
    :param y: training target (converted via ``_build_dataframe``).
    :return: ``self``.
    """
    # Fast exit: calling fit() again on a finished model is a no-op.
    # Note: `return print(...)` prints the message and returns None.
    if self._fit_level == "finished":
        return print(
            "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new a 'fit()'."
        )
    # Validate input and build dataframes
    X, y = self._build_dataframe(X, y)
    self.n_features = X.shape[1]
    # Count of distinct non-null target values (class count for classification).
    self.n_classes = len(np.unique(y[~pd.isnull(y)]))
    # Get attributes (__init__ params); each _get_* resolves "auto" defaults
    # into concrete values for this run.
    self._mode = self._get_mode()
    self._ml_task = self._get_ml_task()
    self._tuning_mode = self._get_tuning_mode()
    self._results_path = self._get_results_path()
    self._total_time_limit = self._get_total_time_limit()
    self._model_time_limit = self._get_model_time_limit()
    self._algorithms = self._get_algorithms()
    self._train_ensemble = self._get_train_ensemble()
    self._stack_models = self._get_stack_models()
    self._eval_metric = self._get_eval_metric()
    self._validation_strategy = self._get_validation_strategy()
    self._verbose = self._get_verbose()
    self._explain_level = self._get_explain_level()
    self._golden_features = self._get_golden_features()
    self._feature_selection = self._get_feature_selection()
    self._start_random_models = self._get_start_random_models()
    self._hill_climbing_steps = self._get_hill_climbing_steps()
    self._top_models_to_improve = self._get_top_models_to_improve()
    self._random_state = self._get_random_state()
    try:
        # Resume from a previous (possibly interrupted) run, if any.
        self.load_progress()
        if self._fit_level == "finished":
            print(
                "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'."
            )
            return

        # Validate input and build dataframes
        X, y = self._build_dataframe(X, y)
        self.n_features = X.shape[1]
        self.n_classes = len(np.unique(y[~pd.isnull(y)]))

        self.verbose_print(f"AutoML current directory: {self._results_path}")
        self.verbose_print(f"AutoML current task: {self._ml_task}")
        self.verbose_print(f"AutoML will use algorithms : {self._algorithms}")
        self.verbose_print(f"AutoML will optimize for metric : {self._eval_metric}")

        self._start_time = time.time()
        # Back-date the start time by time already spent in a resumed run
        # so total-time budgeting stays accurate.
        if self._time_ctrl is not None:
            self._start_time -= self._time_ctrl.already_spend()

        # Automatic Exploratory Data Analysis (only at the highest explain level)
        if self._explain_level == 2:
            EDA.compute(X, y, os.path.join(self._results_path, "EDA"))

        # Save data
        self._save_data(X, y)

        tuner = MljarTuner(
            self._get_tuner_params(
                self._start_random_models,
                self._hill_climbing_steps,
                self._top_models_to_improve,
            ),
            self._algorithms,
            self._ml_task,
            self._validation_strategy,
            self._explain_level,
            self._data_info,
            self._golden_features,
            self._feature_selection,
            self._train_ensemble,
            self._stack_models,
            self._random_state,
        )
        self.tuner = tuner

        steps = tuner.steps()
        self.verbose_print(f"AutoML steps: {steps}")
        if self._time_ctrl is None:
            self._time_ctrl = TimeController(
                self._start_time,
                self._total_time_limit,
                self._model_time_limit,
                steps,
                self._algorithms,
            )

        self._time_ctrl.log_time(
            "prepare_data",
            "prepare_data",
            "prepare_data",
            time.time() - self._start_time,
        )

        for step in steps:
            self._fit_level = step
            start = time.time()
            # self._time_start[step] = start

            if step == "stack":
                self.prepare_for_stacking()

            # Reuse parameters from a resumed run when available,
            # otherwise generate fresh ones for this step.
            generated_params = []
            if step in self._all_params:
                generated_params = self._all_params[step]
            else:
                generated_params = tuner.generate_params(
                    step, self._models, self._results_path, self._stacked_models
                )

            if generated_params is None:
                continue
            if generated_params:
                self.verbose_print("-" * 72)
                self.verbose_print(
                    f"{step} with {len(generated_params)} models to train ..."
                )

            for params in generated_params:
                # Already handled in a previous (resumed) run.
                if params.get("status", "") == "trained":
                    self.verbose_print(f"Skipping {params['name']}, already trained.")
                    continue
                if params.get("status", "") == "skipped":
                    self.verbose_print(f"Skipped {params['name']}.")
                    continue

                trained = False
                if "ensemble" in step:
                    trained = self.ensemble_step(is_stacked=params["is_stacked"])
                else:
                    trained = self.train_model(params)
                params["status"] = "trained" if trained else "skipped"
                params["final_loss"] = self._models[-1].get_final_loss()
                params["train_time"] = self._models[-1].get_train_time()
                # Persist after every model so an interrupted run can resume.
                self.save_progress(step, generated_params)

        self._fit_level = "finished"
        self.save_progress()
        self.verbose_print(f"AutoML fit time: {time.time() - self._start_time}")
    except Exception as e:
        raise e
    finally:
        # Release the in-memory copies of X/y that _save_data swapped out.
        if self._X_path is not None:
            self._load_data_variables(X)

    return self