def test_hill_climbing(self):
    models = []
    models += [
        ModelMock(
            "121_RandomForest",
            "Random Forest",
            0.1,
            {
                "learner": {"max_features": 0.4, "model_type": "Random Forest"},
                "preprocessing": {},
            },
        )
    ]
    models += [
        ModelMock(
            "1_RandomForest",
            "Random Forest",
            0.1,
            {
                "learner": {"max_features": 0.4, "model_type": "Random Forest"},
                "preprocessing": {},
            },
        )
    ]
    tuner = MljarTuner(
        {
            "start_random_models": 0,
            "hill_climbing_steps": 1,
            "top_models_to_improve": 2,
        },
        algorithms=["Random Forest"],
        ml_task="binary_classification",
        validation_strategy={},
        explain_level=2,
        data_info={"columns_info": [], "target_info": []},
        golden_features=False,
        features_selection=False,
        train_ensemble=False,
        stack_models=False,
        adjust_validation=False,
        seed=12,
    )
    ind = 121
    score = 0.1
    for _ in range(5):
        for p in tuner.get_hill_climbing_params(models):
            models += [ModelMock(p["name"], "Random Forest", score, p)]
            score *= 0.1
            # each new model must get an index higher than the largest seen so far
            self.assertTrue(int(p["name"].split("_")[0]) > ind)
            ind += 1
def test_hill_climbing(self):
    models = []
    models += [
        ModelMock(
            "model_121",
            "Random Forest",
            0.1,
            {
                "learner": {"max_features": 0.4, "model_type": "Random Forest"},
                "preprocessing": {},
            },
        )
    ]
    models += [
        ModelMock(
            "model_1",
            "Random Forest",
            0.1,
            {
                "learner": {"max_features": 0.4, "model_type": "Random Forest"},
                "preprocessing": {},
            },
        )
    ]
    tuner = MljarTuner(
        {
            "start_random_models": 0,
            "hill_climbing_steps": 1,
            "top_models_to_improve": 2,
        },
        algorithms=["Random Forest"],
        ml_task="binary_classification",
        validation={},
        explain_level=2,
        seed=12,
    )
    ind = 121
    score = 0.1
    for _ in range(5):
        for p in tuner.get_hill_climbing_params(models):
            models += [ModelMock(p["name"], "Random Forest", score, p)]
            score *= 0.1
            # here the index is the name suffix, e.g. "model_122" -> 122
            self.assertTrue(int(p["name"].split("_")[1]) > ind)
            ind += 1
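# Both variants of the test above rely on a ModelMock helper that is not
# shown in this section. Below is a minimal sketch of what such a fixture
# might look like, assuming the tuner only reads a model's name, type, final
# loss and params. The attribute and method names here are assumptions, not
# the repository's actual test fixture.


class ModelMock:
    """Stand-in for a trained model; mocks only what the tuner reads."""

    def __init__(self, name, model_type, final_loss, params):
        self.name = name
        self.model_type = model_type
        self.final_loss = final_loss
        self.params = params

    def get_name(self):
        return self.name

    def get_type(self):
        return self.model_type

    def get_final_loss(self):
        return self.final_loss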
def fit(self, X_train, y_train, X_validation=None, y_validation=None):
    """
    Fit AutoML

    :param X_train: Pandas DataFrame with training data.
    :param y_train: Numpy Array with target training data.
    :param X_validation: Pandas DataFrame with validation data. (Not implemented yet)
    :param y_validation: Numpy Array with target of validation data. (Not implemented yet)
    """
    try:
        if self._best_model is not None:
            print("Best model is already set, no need to run fit. Skipping ...")
            return

        self._start_time = time.time()
        if not isinstance(X_train, pd.DataFrame):
            raise AutoMLException("AutoML needs X_train matrix to be a Pandas DataFrame")

        self._set_ml_task(y_train)

        if X_train is not None:
            X_train = X_train.copy(deep=False)

        X_train, y_train, X_validation, y_validation = self._initial_prep(
            X_train, y_train, X_validation, y_validation
        )
        self._save_data(X_train, y_train, X_validation, y_validation)

        self._set_algorithms()
        self._set_metric()
        # self._estimate_training_times()

        if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]:
            self._check_imbalanced(y_train)

        tuner = MljarTuner(
            self._tuner_params,
            self._algorithms,
            self._ml_task,
            self._validation,
            self._explain_level,
            self._data_info,
            self._seed,
        )
        self.tuner = tuner

        self._time_spend = {}
        self._time_start = {}

        # 1. Check simple algorithms
        self._fit_level = "simple_algorithms"
        start = time.time()
        self._time_start[self._fit_level] = start
        for params in tuner.simple_algorithms_params():
            self.train_model(params)
        self._time_spend["simple_algorithms"] = np.round(time.time() - start, 2)

        # 2. Default parameters
        self._fit_level = "default_algorithms"
        start = time.time()
        self._time_start[self._fit_level] = start
        for params in tuner.default_params(len(self._models)):
            self.train_model(params)
        self._time_spend["default_algorithms"] = np.round(time.time() - start, 2)

        # 3. The not-so-random step
        self._fit_level = "not_so_random"
        start = time.time()
        self._time_start[self._fit_level] = start
        generated_params = tuner.get_not_so_random_params(len(self._models))
        for params in generated_params:
            self.train_model(params)
        self._time_spend["not_so_random"] = np.round(time.time() - start, 2)

        # 4. The hill-climbing step
        self._fit_level = "hill_climbing"
        start = time.time()
        self._time_start[self._fit_level] = start
        for params in tuner.get_hill_climbing_params(self._models):
            self.train_model(params)
        self._time_spend["hill_climbing"] = np.round(time.time() - start, 2)

        # 5. Ensemble unstacked models
        self._fit_level = "ensemble_unstacked"
        start = time.time()
        self._time_start[self._fit_level] = start
        self.ensemble_step()
        self._time_spend["ensemble_unstacked"] = np.round(time.time() - start, 2)

        if self._stack:
            # 6. Stack best models
            self._fit_level = "stack"
            start = time.time()
            self._time_start[self._fit_level] = start
            self.stacked_ensemble_step()
            self._time_spend["stack"] = np.round(time.time() - start, 2)

        # 7. Ensemble all models (original and stacked)
        any_stacked = any(m._is_stacked for m in self._models)
        if any_stacked:
            self._fit_level = "ensemble_all"
            start = time.time()
            self.ensemble_step(is_stacked=True)
            self._time_spend["ensemble_all"] = np.round(time.time() - start, 2)

        self._fit_time = time.time() - self._start_time
        logger.info(f"AutoML fit time: {self._fit_time}")
    except Exception as e:
        raise e
    finally:
        if self._X_train_path is not None:
            self._load_data_variables(X_train)
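# A usage sketch for the fit() above. The import path and the zero-argument
# AutoML() constructor are assumptions; in practice the constructor would
# receive the tuner params, algorithm list, validation setup, and so on.

import pandas as pd
from sklearn.datasets import load_breast_cancer

from supervised.automl import AutoML  # assumed import path

data = load_breast_cancer()
X_train = pd.DataFrame(data.data, columns=data.feature_names)
y_train = data.target  # numpy array, as fit() expects

automl = AutoML()
automl.fit(X_train, y_train)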
def fit(self, X_train, y_train, X_validation=None, y_validation=None):
    """
    Fit AutoML

    :param X_train: Pandas DataFrame with training data.
    :param y_train: Numpy Array with target training data.
    :param X_validation: Pandas DataFrame with validation data. (Not implemented yet)
    :param y_validation: Numpy Array with target of validation data. (Not implemented yet)
    """
    try:
        if self._best_model is not None:
            print("Best model is already set, no need to run fit. Skipping ...")
            return

        start_time = time.time()
        if not isinstance(X_train, pd.DataFrame):
            raise AutoMLException("AutoML needs X_train matrix to be a Pandas DataFrame")

        if X_train is not None:
            X_train = X_train.copy(deep=False)

        X_train, y_train, X_validation, y_validation = self._initial_prep(
            X_train, y_train, X_validation, y_validation
        )
        self._save_data(X_train, y_train, X_validation, y_validation)

        self._set_ml_task(y_train)
        self._set_algorithms()
        self._set_metric()
        self._estimate_training_times()

        if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]:
            self._check_imbalanced(y_train)

        tuner = MljarTuner(
            self._tuner_params,
            self._algorithms,
            self._ml_task,
            self._validation,
            self._seed,
        )

        # not-so-random step
        generated_params = tuner.get_not_so_random_params(X_train, y_train)

        self._del_data_variables(X_train, y_train)

        for params in generated_params:
            self.train_model(params)

        # hill climbing
        for params in tuner.get_hill_climbing_params(self._models):
            self.train_model(params)

        self.ensemble_step()

        # pick the model with the lowest final loss as the best one
        max_loss = 10e12
        for m in self._models:
            if m.get_final_loss() < max_loss:
                self._best_model = m
                max_loss = m.get_final_loss()

        self.get_additional_metrics()
        self._fit_time = time.time() - start_time
        # self._progress_bar.close()

        with open(os.path.join(self._results_path, "best_model.txt"), "w") as fout:
            fout.write(f"{self._best_model.get_name()}")

        with open(os.path.join(self._results_path, "params.json"), "w") as fout:
            params = {
                "ml_task": self._ml_task,
                "optimize_metric": self._optimize_metric,
                "saved": self._model_paths,
            }
            fout.write(json.dumps(params, indent=4))

        ldb = self.get_leaderboard()
        ldb.to_csv(os.path.join(self._results_path, "leaderboard.csv"), index=False)

        # save report
        ldb["Link"] = [f"[Results link]({m}/README.md)" for m in ldb["name"].values]
        ldb.insert(loc=0, column="Best model", value="")
        ldb.loc[ldb.name == self._best_model.get_name(), "Best model"] = "*** the best ***"

        with open(os.path.join(self._results_path, "README.md"), "w") as fout:
            fout.write("# AutoML Leaderboard\n\n")
            fout.write(tabulate(ldb.values, ldb.columns, tablefmt="pipe"))
    except Exception as e:
        raise e
    finally:
        if self._X_train_path is not None:
            self._load_data_variables(X_train)
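# The README writing in the fit() above uses tabulate's "pipe" format, which
# emits a GitHub-flavored Markdown table. A self-contained illustration with
# a made-up two-row leaderboard (output shown approximately):

import pandas as pd
from tabulate import tabulate

ldb = pd.DataFrame(
    {"name": ["1_RandomForest", "2_Xgboost"], "metric_value": [0.1, 0.09]}
)
print(tabulate(ldb.values, ldb.columns, tablefmt="pipe"))
# | name           |   metric_value |
# |:---------------|---------------:|
# | 1_RandomForest |            0.1 |
# | 2_Xgboost      |           0.09 |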
def fit(self, X_train, y_train, X_validation=None, y_validation=None):
    """
    Fit AutoML

    :param X_train: Pandas DataFrame with training data.
    :param y_train: Numpy Array with target training data.
    :param X_validation: Pandas DataFrame with validation data. (Not implemented yet)
    :param y_validation: Numpy Array with target of validation data. (Not implemented yet)
    """
    try:
        if self._best_model is not None:
            print("Best model is already set, no need to run fit. Skipping ...")
            return

        self._start_time = time.time()
        if not isinstance(X_train, pd.DataFrame):
            raise AutoMLException("AutoML needs X_train matrix to be a Pandas DataFrame")

        if X_train is not None:
            X_train = X_train.copy(deep=False)

        X_train, y_train, X_validation, y_validation = self._initial_prep(
            X_train, y_train, X_validation, y_validation
        )
        self._save_data(X_train, y_train, X_validation, y_validation)

        self._set_ml_task(y_train)
        self._set_algorithms()
        self._set_metric()
        self._estimate_training_times()

        if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]:
            self._check_imbalanced(y_train)

        tuner = MljarTuner(
            self._tuner_params,
            self._algorithms,
            self._ml_task,
            self._validation,
            self._explain_level,
            self._seed,
        )

        # not-so-random step
        generated_params = tuner.get_not_so_random_params(X_train, y_train)

        self._del_data_variables(X_train, y_train)

        # Shuffle generated params,
        # but do not shuffle Baseline, Linear and Decision Tree
        dont_shuffle = []
        to_shuffle = []
        for p in generated_params:
            if p["learner"]["model_type"] in ["Baseline", "Linear", "Decision Tree"]:
                dont_shuffle += [p]
            else:
                to_shuffle += [p]
        np.random.shuffle(to_shuffle)
        generated_params = dont_shuffle + to_shuffle

        for params in generated_params:
            self.train_model(params)

        # hill climbing
        for params in tuner.get_hill_climbing_params(self._models):
            self.train_model(params)

        self.ensemble_step()

        self._fit_time = time.time() - self._start_time
    except Exception as e:
        raise e
    finally:
        if self._X_train_path is not None:
            self._load_data_variables(X_train)
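# get_hill_climbing_params() is used by every fit() variant above and is
# exercised by the tests, but its implementation is not shown in this
# section. The sketch below is hypothetical (the function name, the +/-0.1
# step and the bounds are all assumptions); it only illustrates the general
# idea: copy the params of a top model, nudge one hyperparameter in each
# direction, and give each candidate a fresh, strictly increasing index in
# its name.

import copy


def hill_climb(best_params, next_id):
    """Hypothetical illustration; not the actual MljarTuner implementation."""
    for step in (-0.1, 0.1):
        p = copy.deepcopy(best_params)
        # nudge one hyperparameter, clipped to an assumed valid range
        p["learner"]["max_features"] = min(
            1.0, max(0.1, p["learner"]["max_features"] + step)
        )
        p["name"] = f"{next_id}_RandomForest"  # name encodes an increasing index
        next_id += 1
        yield p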