def test_hill_climbing(self):
    """Hill climbing must generate model names whose numeric prefix keeps increasing."""

    def make_params():
        # A fresh dict per model, mirroring the separate literals used per mock.
        return {
            "learner": {"max_features": 0.4, "model_type": "Random Forest"},
            "preprocessing": {},
        }

    models = [
        ModelMock("121_RandomForest", "Random Forest", 0.1, make_params()),
        ModelMock("1_RandomForest", "Random Forest", 0.1, make_params()),
    ]

    tuner = MljarTuner(
        {
            "start_random_models": 0,
            "hill_climbing_steps": 1,
            "top_models_to_improve": 2,
        },
        algorithms=["Random Foresrt"],
        ml_task="binary_classification",
        validation_strategy={},
        explain_level=2,
        data_info={"columns_info": [], "target_info": []},
        golden_features=False,
        features_selection=False,
        train_ensemble=False,
        stack_models=False,
        adjust_validation=False,
        seed=12,
    )

    expected_ind = 121
    loss = 0.1
    for _ in range(5):
        for candidate in tuner.get_hill_climbing_params(models):
            models.append(ModelMock(candidate["name"], "Random Forest", loss, candidate))
            loss *= 0.1
            # Name format is "<index>_<algorithm>"; index must exceed the highest seen.
            self.assertTrue(int(candidate["name"].split("_")[0]) > expected_ind)
            expected_ind += 1
def test_hill_climbing(self):
    """Hill climbing must generate model names with ever-increasing numeric suffix."""

    def make_params():
        # A fresh dict per model, mirroring the separate literals used per mock.
        return {
            "learner": {"max_features": 0.4, "model_type": "Random Forest"},
            "preprocessing": {},
        }

    models = [
        ModelMock("model_121", "Random Forest", 0.1, make_params()),
        ModelMock("model_1", "Random Forest", 0.1, make_params()),
    ]

    tuner = MljarTuner(
        {
            "start_random_models": 0,
            "hill_climbing_steps": 1,
            "top_models_to_improve": 2,
        },
        algorithms=["Random Foresrt"],
        ml_task="binary_classification",
        validation={},
        explain_level=2,
        seed=12,
    )

    expected_ind = 121
    loss = 0.1
    for _ in range(5):
        for candidate in tuner.get_hill_climbing_params(models):
            models.append(ModelMock(candidate["name"], "Random Forest", loss, candidate))
            loss *= 0.1
            # Name format is "model_<index>"; the index must exceed the highest seen.
            self.assertTrue(int(candidate["name"].split("_")[1]) > expected_ind)
            expected_ind += 1
def test_key_params(self):
    """Param dicts differing only in key insertion order must map to the same key."""
    first = {
        "preprocessing": {"p1": 1, "p2": 2},
        "learner": {"p1": 1, "p2": 2},
        "validation_strategy": {},
    }
    # Same content as `first`, but the learner keys are inserted in reverse order.
    second = {
        "preprocessing": {"p1": 1, "p2": 2},
        "learner": {"p2": 2, "p1": 1},
        "validation_strategy": {},
    }
    self.assertEqual(
        MljarTuner.get_params_key(first), MljarTuner.get_params_key(second)
    )
def _fit(self, X, y):
    """Fits the AutoML model with data.

    Resolves every configuration attribute from the constructor parameters,
    builds the training plan with ``MljarTuner``, trains each generated model
    under ``TimeController`` budgeting, and persists progress after every
    model so an interrupted run can resume.

    :param X: training features, normalized via ``_build_dataframe``.
    :param y: training target, normalized via ``_build_dataframe``.
    :return: ``self`` (a plain ``return`` — i.e. ``None`` — when already fitted).
    """
    if self._fit_level == "finished":
        print(
            "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new a 'fit()'."
        )
        return
    # Validate input and build dataframes
    X, y = self._build_dataframe(X, y)
    self.n_features_in_ = X.shape[1]
    # Count distinct target values, ignoring missing labels.
    self.n_classes = len(np.unique(y[~pd.isnull(y)]))
    # Get attributes (__init__ params)
    self._mode = self._get_mode()
    self._ml_task = self._get_ml_task()
    self._results_path = self._get_results_path()
    self._total_time_limit = self._get_total_time_limit()
    self._model_time_limit = self._get_model_time_limit()
    self._algorithms = self._get_algorithms()
    self._train_ensemble = self._get_train_ensemble()
    self._stack_models = self._get_stack_models()
    self._eval_metric = self._get_eval_metric()
    self._validation_strategy = self._get_validation_strategy()
    self._verbose = self._get_verbose()
    self._explain_level = self._get_explain_level()
    self._golden_features = self._get_golden_features()
    self._feature_selection = self._get_feature_selection()
    self._start_random_models = self._get_start_random_models()
    self._hill_climbing_steps = self._get_hill_climbing_steps()
    self._top_models_to_improve = self._get_top_models_to_improve()
    self._random_state = self._get_random_state()
    try:
        # Resume from disk; a previous run may already have finished.
        self.load_progress()
        if self._fit_level == "finished":
            print(
                "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'."
            )
            return
        self._check_can_load()
        self.verbose_print(f"AutoML directory: {self._results_path}")
        self.verbose_print(
            f"The task is {self._ml_task} with evaluation metric {self._eval_metric}"
        )
        self.verbose_print(f"AutoML will use algorithms: {self._algorithms}")
        if self._stack_models:
            self.verbose_print("AutoML will stack models")
        if self._train_ensemble:
            self.verbose_print("AutoML will ensemble availabe models")
        self._start_time = time.time()
        # When resuming, back-date the start so elapsed time includes prior runs.
        if self._time_ctrl is not None:
            self._start_time -= self._time_ctrl.already_spend()
        # Automatic Exloratory Data Analysis
        if self._explain_level == 2:
            EDA.compute(X, y, os.path.join(self._results_path, "EDA"))
        # Save data
        self._save_data(X.copy(deep=False), y)
        tuner = MljarTuner(
            self._get_tuner_params(
                self._start_random_models,
                self._hill_climbing_steps,
                self._top_models_to_improve,
            ),
            self._algorithms,
            self._ml_task,
            self._validation_strategy,
            self._explain_level,
            self._data_info,
            self._golden_features,
            self._feature_selection,
            self._train_ensemble,
            self._stack_models,
            self._random_state,
        )
        self.tuner = tuner
        steps = tuner.steps()
        self.verbose_print(f"AutoML steps: {steps}")
        if self._time_ctrl is None:
            self._time_ctrl = TimeController(
                self._start_time,
                self._total_time_limit,
                self._model_time_limit,
                steps,
                self._algorithms,
            )
        self._time_ctrl.log_time(
            "prepare_data",
            "prepare_data",
            "prepare_data",
            time.time() - self._start_time,
        )
        for step in steps:
            self._fit_level = step
            start = time.time()
            # self._time_start[step] = start
            if step == "stack":
                self.prepare_for_stacking()
            generated_params = []
            # Reuse params restored by load_progress(); otherwise generate fresh ones.
            if step in self._all_params:
                generated_params = self._all_params[step]
            else:
                generated_params = tuner.generate_params(
                    step, self._models, self._results_path, self._stacked_models
                )
            if generated_params is None or not generated_params:
                self.verbose_print(
                    f"Skip {step} because no parameters were generated."
                )
                continue
            if generated_params:
                # Budget check uses the first candidate's model type as a proxy
                # for the whole step.
                if "learner" in generated_params[
                    0
                ] and not self._time_ctrl.enough_time(
                    generated_params[0]["learner"]["model_type"], self._fit_level
                ):
                    self.verbose_print(f"Skip {step} because of the time limit.")
                else:
                    model_str = "models" if len(generated_params) > 1 else "model"
                    self.verbose_print(
                        f"* Step {step} will try to check up to {len(generated_params)} {model_str}"
                    )
            for params in generated_params:
                # Candidates already handled in a previous run are reported and skipped.
                if params.get("status", "") in ["trained", "skipped", "error"]:
                    self.verbose_print(f"{params['name']}: {params['status']}.")
                    continue
                try:
                    trained = False
                    if "ensemble" in step:
                        trained = self.ensemble_step(
                            is_stacked=params["is_stacked"]
                        )
                    else:
                        trained = self.train_model(params)
                    params["status"] = "trained" if trained else "skipped"
                    params["final_loss"] = self._models[-1].get_final_loss()
                    params["train_time"] = self._models[-1].get_train_time()
                except Exception as e:
                    # A single failing model is recorded, not fatal for the run.
                    self._update_errors_report(params.get("name"), str(e))
                    params["status"] = "error"
                # Persist after every candidate so the run is resumable.
                self.save_progress(step, generated_params)
        self._fit_level = "finished"
        self.save_progress()
        self.verbose_print(
            f"AutoML fit time: {np.round(time.time() - self._start_time,2)} seconds"
        )
    except Exception as e:
        raise e
    finally:
        # Restore data attributes dropped by _save_data, even on failure.
        if self._X_path is not None:
            self._load_data_variables(X)
    return self
def fit(self, X_train, y_train, X_validation=None, y_validation=None):
    """
    Fit AutoML

    Runs the training pipeline as a fixed sequence of numbered phases
    (simple algorithms, defaults, not-so-random search, hill climbing,
    ensembling, optional stacking), timing each phase in ``_time_spend``.

    :param X_train: Pandas DataFrame with training data.
    :param y_train: Numpy Array with target training data.

    :param X_validation: Pandas DataFrame with validation data. (Not implemented yet)
    :param y_validation: Numpy Array with target of validation data. (Not implemented yet)
    """
    try:
        if self._best_model is not None:
            print("Best model is already set, no need to run fit. Skipping ...")
            return
        self._start_time = time.time()
        if not isinstance(X_train, pd.DataFrame):
            raise AutoMLException(
                "AutoML needs X_train matrix to be a Pandas DataFrame"
            )
        self._set_ml_task(y_train)
        # Shallow copy: keeps the caller's frame intact without duplicating data.
        if X_train is not None:
            X_train = X_train.copy(deep=False)
        X_train, y_train, X_validation, y_validation = self._initial_prep(
            X_train, y_train, X_validation, y_validation
        )
        self._save_data(X_train, y_train, X_validation, y_validation)
        self._set_algorithms()
        self._set_metric()
        # self._estimate_training_times()
        if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]:
            self._check_imbalanced(y_train)
        tuner = MljarTuner(
            self._tuner_params,
            self._algorithms,
            self._ml_task,
            self._validation,
            self._explain_level,
            self._data_info,
            self._seed,
        )
        self.tuner = tuner

        self._time_spend = {}
        self._time_start = {}

        # 1. Check simple algorithms
        self._fit_level = "simple_algorithms"
        start = time.time()
        self._time_start[self._fit_level] = start
        for params in tuner.simple_algorithms_params():
            self.train_model(params)
        self._time_spend["simple_algorithms"] = np.round(time.time() - start, 2)

        # 2. Default parameters
        self._fit_level = "default_algorithms"
        start = time.time()
        self._time_start[self._fit_level] = start
        for params in tuner.default_params(len(self._models)):
            self.train_model(params)
        self._time_spend["default_algorithms"] = np.round(time.time() - start, 2)

        # 3. The not-so-random step
        self._fit_level = "not_so_random"
        start = time.time()
        self._time_start[self._fit_level] = start
        generated_params = tuner.get_not_so_random_params(len(self._models))
        for params in generated_params:
            self.train_model(params)
        self._time_spend["not_so_random"] = np.round(time.time() - start, 2)

        # 4. The hill-climbing step
        self._fit_level = "hill_climbing"
        start = time.time()
        self._time_start[self._fit_level] = start
        for params in tuner.get_hill_climbing_params(self._models):
            self.train_model(params)
        self._time_spend["hill_climbing"] = np.round(time.time() - start, 2)

        # 5. Ensemble unstacked models
        self._fit_level = "ensemble_unstacked"
        start = time.time()
        self._time_start[self._fit_level] = start
        self.ensemble_step()
        self._time_spend["ensemble_unstacked"] = np.round(time.time() - start, 2)

        if self._stack:
            # 6. Stack best models
            self._fit_level = "stack"
            start = time.time()
            self._time_start[self._fit_level] = start
            self.stacked_ensemble_step()
            self._time_spend["stack"] = np.round(time.time() - start, 2)

            # 7. Ensemble all models (original and stacked)
            any_stacked = False
            for m in self._models:
                if m._is_stacked:
                    any_stacked = True
                    break
            if any_stacked:
                self._fit_level = "ensemble_all"
                start = time.time()
                self.ensemble_step(is_stacked=True)
                self._time_spend["ensemble_all"] = np.round(time.time() - start, 2)

        self._fit_time = time.time() - self._start_time
        logger.info(f"AutoML fit time: {self._fit_time}")

    except Exception as e:
        raise e
    finally:
        # Restore data attributes dropped by _save_data, even on failure.
        if self._X_train_path is not None:
            self._load_data_variables(X_train)
def fit(self, X_train, y_train, X_validation=None, y_validation=None):
    """
    Fit AutoML

    Resumable variant: restores progress from disk, builds the list of
    training steps from the tuner, trains each generated model under
    ``TimeController`` budgeting, and saves progress after every model.

    :param X_train: Pandas DataFrame with training data.
    :param y_train: Numpy Array with target training data.

    :param X_validation: Pandas DataFrame with validation data. (Not implemented yet)
    :param y_validation: Numpy Array with target of validation data. (Not implemented yet)
    """
    try:
        # Resume from disk; a previous run may already have finished.
        self.load_progress()
        if self._fit_level == "finished":
            print("AutoML is trained. Skipping fit step ...")
            return
        # if self._best_model is not None:
        #     print("Best model is already set, no need to run fit. Skipping ...")
        #     return
        self._start_time = time.time()
        # When resuming, back-date the start so elapsed time includes prior runs.
        if self._time_ctrl is not None:
            self._start_time -= self._time_ctrl.already_spend()

        if not isinstance(X_train, pd.DataFrame):
            raise AutoMLException(
                "AutoML needs X_train matrix to be a Pandas DataFrame"
            )

        # Automatic Exloratory Data Analysis
        if self._explain_level == 2:
            EDA.compute(X_train, y_train, os.path.join(self._results_path, "EDA"))

        self._set_ml_task(y_train)
        # Shallow copy: keeps the caller's frame intact without duplicating data.
        if X_train is not None:
            X_train = X_train.copy(deep=False)
        X_train, y_train, X_validation, y_validation = self._initial_prep(
            X_train, y_train, X_validation, y_validation
        )
        self._save_data(X_train, y_train, X_validation, y_validation)
        self._set_algorithms()
        self._set_metric()
        if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]:
            self._check_imbalanced(y_train)
        tuner = MljarTuner(
            self._tuner_params,
            self._algorithms,
            self._ml_task,
            self._validation,
            self._explain_level,
            self._data_info,
            self._golden_features,
            self._feature_selection,
            self._train_ensemble,
            self._stack_models,
            self._seed,
        )
        self.tuner = tuner

        steps = tuner.steps()

        if self._time_ctrl is None:
            self._time_ctrl = TimeController(
                self._start_time,
                self._total_time_limit,
                self._model_time_limit,
                steps,
                self._algorithms,
            )

        self._time_ctrl.log_time(
            "prepare_data", "prepare_data", "prepare_data", time.time() - self._start_time
        )

        for step in steps:
            self._fit_level = step
            start = time.time()
            # self._time_start[step] = start
            if step == "stack":
                self.prepare_for_stacking()
            generated_params = []
            # Reuse params restored by load_progress(); otherwise generate fresh ones.
            if step in self._all_params:
                generated_params = self._all_params[step]
            else:
                generated_params = tuner.generate_params(
                    step, self._models, self._results_path, self._stacked_models
                )
            if generated_params is None:
                continue
            if generated_params:
                print("-" * 72)
                print(f"{step} with {len(generated_params)} models to train ...")
            for params in generated_params:
                # Candidates already handled in a previous run are skipped.
                if params.get("status", "") == "trained":
                    print(f"Skipping {params['name']}, already trained.")
                    continue
                if params.get("status", "") == "skipped":
                    print(f"Skipped {params['name']}.")
                    continue
                trained = False
                if "ensemble" in step:
                    trained = self.ensemble_step(is_stacked=params["is_stacked"])
                else:
                    trained = self.train_model(params)
                params["status"] = "trained" if trained else "skipped"
                params["final_loss"] = self._models[-1].get_final_loss()
                params["train_time"] = self._models[-1].get_train_time()
                # Persist after every candidate so the run is resumable.
                self.save_progress(step, generated_params)

        self._fit_level = "finished"
        self.save_progress()
        print(f"AutoML fit time: {time.time() - self._start_time}")

    except Exception as e:
        raise e
    finally:
        # Restore data attributes dropped by _save_data, even on failure.
        if self._X_train_path is not None:
            self._load_data_variables(X_train)
def _fit(self, X, y):
    """Fits the AutoML model with data.

    Resolves configuration attributes, builds the training plan with
    ``MljarTuner``, trains each generated model under ``TimeController``
    budgeting, and persists progress after every model.

    :param X: training features, normalized via ``_build_dataframe``.
    :param y: training target, normalized via ``_build_dataframe``.
    :return: ``self``, except on the early already-fitted exits.
    """
    if self._fit_level == "finished":
        # NOTE(review): this returns print(...)'s None while the normal path
        # returns self — confirm the inconsistent return value is intended.
        return print(
            "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new a 'fit()'."
        )
    # Validate input and build dataframes
    X, y = self._build_dataframe(X, y)
    self.n_features = X.shape[1]
    # Count distinct target values, ignoring missing labels.
    self.n_classes = len(np.unique(y[~pd.isnull(y)]))
    # Get attributes (__init__ params)
    self._mode = self._get_mode()
    self._ml_task = self._get_ml_task()
    self._tuning_mode = self._get_tuning_mode()
    self._results_path = self._get_results_path()
    self._total_time_limit = self._get_total_time_limit()
    self._model_time_limit = self._get_model_time_limit()
    self._algorithms = self._get_algorithms()
    self._train_ensemble = self._get_train_ensemble()
    self._stack_models = self._get_stack_models()
    self._eval_metric = self._get_eval_metric()
    self._validation_strategy = self._get_validation_strategy()
    self._verbose = self._get_verbose()
    self._explain_level = self._get_explain_level()
    self._golden_features = self._get_golden_features()
    self._feature_selection = self._get_feature_selection()
    self._start_random_models = self._get_start_random_models()
    self._hill_climbing_steps = self._get_hill_climbing_steps()
    self._top_models_to_improve = self._get_top_models_to_improve()
    self._random_state = self._get_random_state()
    try:
        # Resume from disk; a previous run may already have finished.
        self.load_progress()
        if self._fit_level == "finished":
            print(
                "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'."
            )
            return
        # Validate input and build dataframes
        X, y = self._build_dataframe(X, y)
        self.n_features = X.shape[1]
        self.n_classes = len(np.unique(y[~pd.isnull(y)]))

        self.verbose_print(f"AutoML current directory: {self._results_path}")
        self.verbose_print(f"AutoML current task: {self._ml_task}")
        self.verbose_print(f"AutoML will use algorithms : {self._algorithms}")
        self.verbose_print(f"AutoML will optimize for metric : {self._eval_metric}")

        self._start_time = time.time()
        # When resuming, back-date the start so elapsed time includes prior runs.
        if self._time_ctrl is not None:
            self._start_time -= self._time_ctrl.already_spend()

        # Automatic Exloratory Data Analysis
        if self._explain_level == 2:
            EDA.compute(X, y, os.path.join(self._results_path, "EDA"))
        # Save data
        self._save_data(X, y)

        tuner = MljarTuner(
            self._get_tuner_params(
                self._start_random_models,
                self._hill_climbing_steps,
                self._top_models_to_improve,
            ),
            self._algorithms,
            self._ml_task,
            self._validation_strategy,
            self._explain_level,
            self._data_info,
            self._golden_features,
            self._feature_selection,
            self._train_ensemble,
            self._stack_models,
            self._random_state,
        )
        self.tuner = tuner

        steps = tuner.steps()
        self.verbose_print(f"AutoML steps: {steps}")

        if self._time_ctrl is None:
            self._time_ctrl = TimeController(
                self._start_time,
                self._total_time_limit,
                self._model_time_limit,
                steps,
                self._algorithms,
            )

        self._time_ctrl.log_time(
            "prepare_data",
            "prepare_data",
            "prepare_data",
            time.time() - self._start_time,
        )

        for step in steps:
            self._fit_level = step
            start = time.time()
            # self._time_start[step] = start
            if step == "stack":
                self.prepare_for_stacking()
            generated_params = []
            # Reuse params restored by load_progress(); otherwise generate fresh ones.
            if step in self._all_params:
                generated_params = self._all_params[step]
            else:
                generated_params = tuner.generate_params(
                    step, self._models, self._results_path, self._stacked_models
                )
            if generated_params is None:
                continue
            if generated_params:
                self.verbose_print("-" * 72)
                self.verbose_print(
                    f"{step} with {len(generated_params)} models to train ..."
                )
            for params in generated_params:
                # Candidates already handled in a previous run are skipped.
                if params.get("status", "") == "trained":
                    self.verbose_print(
                        f"Skipping {params['name']}, already trained.")
                    continue
                if params.get("status", "") == "skipped":
                    self.verbose_print(f"Skipped {params['name']}.")
                    continue
                trained = False
                if "ensemble" in step:
                    trained = self.ensemble_step(is_stacked=params["is_stacked"])
                else:
                    trained = self.train_model(params)
                params["status"] = "trained" if trained else "skipped"
                params["final_loss"] = self._models[-1].get_final_loss()
                params["train_time"] = self._models[-1].get_train_time()
                # Persist after every candidate so the run is resumable.
                self.save_progress(step, generated_params)

        self._fit_level = "finished"
        self.save_progress()
        self.verbose_print(f"AutoML fit time: {time.time() - self._start_time}")

    except Exception as e:
        raise e
    finally:
        # Restore data attributes dropped by _save_data, even on failure.
        if self._X_path is not None:
            self._load_data_variables(X)
    return self
def fit(self, X_train, y_train, X_validation=None, y_validation=None):
    """
    Fit AutoML

    Runs not-so-random search and hill climbing, ensembles the results,
    selects the lowest-loss model as best, and writes the leaderboard,
    best-model marker, and params.json report to ``_results_path``.

    :param X_train: Pandas DataFrame with training data.
    :param y_train: Numpy Array with target training data.

    :param X_validation: Pandas DataFrame with validation data. (Not implemented yet)
    :param y_validation: Numpy Array with target of validation data. (Not implemented yet)
    """
    try:
        if self._best_model is not None:
            print("Best model is already set, no need to run fit. Skipping ...")
            return
        start_time = time.time()
        if not isinstance(X_train, pd.DataFrame):
            raise AutoMLException(
                "AutoML needs X_train matrix to be a Pandas DataFrame"
            )
        # Shallow copy: keeps the caller's frame intact without duplicating data.
        if X_train is not None:
            X_train = X_train.copy(deep=False)
        X_train, y_train, X_validation, y_validation = self._initial_prep(
            X_train, y_train, X_validation, y_validation
        )
        self._save_data(X_train, y_train, X_validation, y_validation)
        self._set_ml_task(y_train)
        self._set_algorithms()
        self._set_metric()
        self._estimate_training_times()
        if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]:
            self._check_imbalanced(y_train)
        tuner = MljarTuner(
            self._tuner_params,
            self._algorithms,
            self._ml_task,
            self._validation,
            self._seed,
        )
        # not so random step
        generated_params = tuner.get_not_so_random_params(X_train, y_train)
        # Free the in-memory copies; models reload from the saved files.
        self._del_data_variables(X_train, y_train)
        for params in generated_params:
            self.train_model(params)
        # hill climbing
        for params in tuner.get_hill_climbing_params(self._models):
            self.train_model(params)
        self.ensemble_step()

        # Pick the model with the lowest final loss as the best model.
        max_loss = 10e12
        for i, m in enumerate(self._models):
            if m.get_final_loss() < max_loss:
                self._best_model = m
                max_loss = m.get_final_loss()

        self.get_additional_metrics()
        self._fit_time = time.time() - start_time
        # self._progress_bar.close()

        with open(os.path.join(self._results_path, "best_model.txt"), "w") as fout:
            fout.write(f"{self._best_model.get_name()}")

        with open(os.path.join(self._results_path, "params.json"), "w") as fout:
            params = {
                "ml_task": self._ml_task,
                "optimize_metric": self._optimize_metric,
                "saved": self._model_paths,
            }
            fout.write(json.dumps(params, indent=4))

        ldb = self.get_leaderboard()
        ldb.to_csv(os.path.join(self._results_path, "leaderboard.csv"), index=False)

        # save report
        ldb["Link"] = [f"[Results link]({m}/README.md)" for m in ldb["name"].values]
        ldb.insert(loc=0, column="Best model", value="")
        ldb.loc[
            ldb.name == self._best_model.get_name(), "Best model"
        ] = "*** the best ***"

        with open(os.path.join(self._results_path, "README.md"), "w") as fout:
            fout.write(f"# AutoML Leaderboard\n\n")
            fout.write(tabulate(ldb.values, ldb.columns, tablefmt="pipe"))

    except Exception as e:
        raise e
    finally:
        # Restore data attributes dropped by _save_data, even on failure.
        if self._X_train_path is not None:
            self._load_data_variables(X_train)
def fit(self, X_train, y_train, X_validation=None, y_validation=None):
    """
    Fit AutoML

    Runs not-so-random search (with the candidate order shuffled, except
    for the deterministic simple algorithms), then hill climbing and a
    final ensemble step.

    :param X_train: Pandas DataFrame with training data.
    :param y_train: Numpy Array with target training data.

    :param X_validation: Pandas DataFrame with validation data. (Not implemented yet)
    :param y_validation: Numpy Array with target of validation data. (Not implemented yet)
    """
    try:
        if self._best_model is not None:
            print("Best model is already set, no need to run fit. Skipping ...")
            return
        self._start_time = time.time()
        if not isinstance(X_train, pd.DataFrame):
            raise AutoMLException(
                "AutoML needs X_train matrix to be a Pandas DataFrame"
            )
        # Shallow copy: keeps the caller's frame intact without duplicating data.
        if X_train is not None:
            X_train = X_train.copy(deep=False)
        X_train, y_train, X_validation, y_validation = self._initial_prep(
            X_train, y_train, X_validation, y_validation
        )
        self._save_data(X_train, y_train, X_validation, y_validation)
        self._set_ml_task(y_train)
        self._set_algorithms()
        self._set_metric()
        self._estimate_training_times()
        if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]:
            self._check_imbalanced(y_train)
        tuner = MljarTuner(
            self._tuner_params,
            self._algorithms,
            self._ml_task,
            self._validation,
            self._explain_level,
            self._seed,
        )
        # not so random step
        generated_params = tuner.get_not_so_random_params(X_train, y_train)
        # Free the in-memory copies; models reload from the saved files.
        self._del_data_variables(X_train, y_train)

        # Shuffle generated params
        # do not shuffle Baseline, Linear and Decision Trees
        dont_shuffle = []
        to_shuffle = []
        for p in generated_params:
            if p["learner"]["model_type"] in [
                "Baseline",
                "Linear",
                "Decision Tree",
            ]:
                dont_shuffle += [p]
            else:
                to_shuffle += [p]
        np.random.shuffle(to_shuffle)
        # Simple algorithms run first, in their original order.
        generated_params = dont_shuffle + to_shuffle

        for params in generated_params:
            self.train_model(params)
        # hill climbing
        for params in tuner.get_hill_climbing_params(self._models):
            self.train_model(params)
        self.ensemble_step()

        self._fit_time = time.time() - self._start_time
    except Exception as e:
        raise e
    finally:
        # Restore data attributes dropped by _save_data, even on failure.
        if self._X_train_path is not None:
            self._load_data_variables(X_train)