def test_enough_time_for_stacking(self):
    for t in [5, 10, 20]:
        tc = TimeController(
            start_time=time.time(),
            total_time_limit=100,
            model_time_limit=None,
            steps=[
                "default_algorithms",
                "not_so_random",
                "golden_features",
                "insert_random_feature",
                "features_selection",
                "hill_climbing_1",
                "hill_climbing_3",
                "hill_climbing_5",
                "ensemble",
                "stack",
                "ensemble_stacked",
            ],
            algorithms=["Xgboost"],
        )
        tc.log_time("1_Xgboost", "Xgboost", "default_algorithms", t)
        tc.log_time("2_Xgboost", "Xgboost", "not_so_random", t)
        tc.log_time("3_Xgboost", "Xgboost", "insert_random_feature", t)
        tc.log_time("4_Xgboost", "Xgboost", "features_selection", t)
        tc.log_time("5_Xgboost", "Xgboost", "hill_climbing_1", t)
        tc.log_time("6_Xgboost", "Xgboost", "hill_climbing_2", t)
        tc.log_time("7_Xgboost", "Xgboost", "hill_climbing_3", t)

        tc._start_time = time.time() - 7 * t
        assert_almost_equal(tc.already_spend(), 7 * t)

        if t < 20:
            self.assertTrue(tc.enough_time("Xgboost", "stack"))
        else:
            self.assertFalse(tc.enough_time("Xgboost", "stack"))
        self.assertTrue(tc.enough_time("Ensemble_Stacked", "ensemble_stacked"))
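# Worked example of the budget check exercised above (illustrative numbers
# only): with total_time_limit=100 and seven models logged at t seconds
# each, already_spend() == 7 * t. For t = 5 or t = 10 the remaining budget
# still allows the "stack" step; for t = 20 (140 s spent) it does not,
# while the final "ensemble_stacked" step is always allowed.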
class BaseAutoML(BaseEstimator, ABC):
    """
    Automated Machine Learning for supervised tasks
    (binary classification, multiclass classification, regression).

    Warning: This class should not be used directly.
    Use derived classes instead.
    """

    def __init__(self):
        logger.debug("BaseAutoML.__init__")
        self._results_path = None
        self._models = []  # instances of iterative learner framework or ensemble
        self._best_model = None
        self._verbose = True
        self._threshold = None  # used only in classification
        self._metrics_details = None
        self._max_metrics = None
        self._confusion_matrix = None
        self._X_path, self._y_path = None, None
        self._data_info = None
        self._model_paths = []
        self._stacked_models = None
        self._fit_level = None
        self._start_time = time.time()
        self._time_ctrl = None
        self._all_params = {}
        # https://scikit-learn.org/stable/developers/develop.html#universal-attributes
        self.n_features_in_ = None  # for scikit-learn api

    def _get_tuner_params(self, start_random_models, hill_climbing_steps,
                          top_models_to_improve):
        return {
            "start_random_models": start_random_models,
            "hill_climbing_steps": hill_climbing_steps,
            "top_models_to_improve": top_models_to_improve,
        }

    def _check_can_load(self):
        """Checks if AutoML can be loaded from a folder"""
        if self.results_path is not None:
            # Dir exists and can be loaded
            if os.path.exists(self.results_path) and os.path.exists(
                    os.path.join(self.results_path, "params.json")):
                self.load(self.results_path)
                self._results_path = self.results_path

    def load(self, path):
        logger.info("Loading AutoML models ...")
        try:
            params = json.load(open(os.path.join(path, "params.json")))
            self._model_paths = params["saved"]
            self._ml_task = params["ml_task"]
            self._eval_metric = params["eval_metric"]
            stacked_models = params.get("stacked")

            models_map = {}
            for model_path in self._model_paths:
                if model_path.endswith("Ensemble") or model_path.endswith(
                        "Ensemble_Stacked"):
                    ens = Ensemble.load(model_path, models_map)
                    self._models += [ens]
                    models_map[ens.get_name()] = ens
                else:
                    m = ModelFramework.load(model_path)
                    self._models += [m]
                    models_map[m.get_name()] = m

            if stacked_models is not None:
                self._stacked_models = []
                for stacked_model_name in stacked_models:
                    self._stacked_models += [models_map[stacked_model_name]]

            best_model_name = None
            with open(os.path.join(path, "best_model.txt"), "r") as fin:
                best_model_name = fin.read()

            self._best_model = models_map[best_model_name]

            data_info_path = os.path.join(path, "data_info.json")
            self._data_info = json.load(open(data_info_path))
            self.n_features_in_ = self._data_info["n_features"]
            if "n_classes" in self._data_info:
                self.n_classes = self._data_info["n_classes"]

            self._fit_level = "finished"
        except Exception as e:
            raise AutoMLException(f"Cannot load AutoML directory. {str(e)}")
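    # For reference, `load()` expects `params.json` roughly in the shape
    # written by `select_and_save_best()` below (a sketch; the paths are
    # illustrative):
    #
    #   {
    #       "ml_task": "binary_classification",
    #       "eval_metric": "logloss",
    #       "saved": ["AutoML_1/1_Xgboost", "AutoML_1/Ensemble"],
    #       "stacked": ["1_Xgboost"]  # present only when models were stacked
    #   }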
    def get_leaderboard(self):
        ldb = {
            "name": [],
            "model_type": [],
            "metric_type": [],
            "metric_value": [],
            "train_time": [],
        }
        for m in self._models:
            ldb["name"] += [m.get_name()]
            ldb["model_type"] += [m.get_type()]
            ldb["metric_type"] += [self._eval_metric]
            ldb["metric_value"] += [m.get_final_loss()]
            ldb["train_time"] += [np.round(m.get_train_time(), 2)]
        return pd.DataFrame(ldb)

    def keep_model(self, model, model_path):
        if model is None:
            return
        self._models += [model]
        self._model_paths += [model_path]
        self.select_and_save_best()

        self.verbose_print("{} {} {} trained in {} seconds".format(
            model.get_name(),
            self._eval_metric,
            np.round(model.get_final_loss(), 6),
            np.round(model.get_train_time(), 2),
        ))
        self._time_ctrl.log_time(model.get_name(), model.get_type(),
                                 self._fit_level, model.get_train_time())

    def create_dir(self, model_path):
        if not os.path.exists(model_path):
            try:
                os.mkdir(model_path)
            except Exception as e:
                raise AutoMLException(
                    f"Cannot create directory {model_path}. {str(e)}")

    def train_model(self, params):
        # do we have enough time to train?
        # if not, skip
        if not self._time_ctrl.enough_time(params["learner"]["model_type"],
                                           self._fit_level):
            logger.info(
                f"Cannot train {params['name']} because of the time constraint"
            )
            return False

        # let's create directory to log all training artifacts
        model_path = os.path.join(self._results_path, params["name"])
        self.create_dir(model_path)

        # prepare callbacks
        early_stop = EarlyStopping({
            "metric": {"name": self._eval_metric},
            "log_to_dir": model_path,
        })

        learner_time_constraint = LearnerTimeConstraint({
            "learner_time_limit": self._time_ctrl.learner_time_limit(
                params["learner"]["model_type"],
                self._fit_level,
                self._validation_strategy.get("k_folds", 1.0),
            ),
            "min_steps": params["additional"].get("min_steps"),
        })

        total_time_constraint = TotalTimeConstraint({
            "total_time_limit": self._total_time_limit
            if self._model_time_limit is None else None,
            "total_time_start": self._start_time,
        })

        # create model framework
        mf = ModelFramework(
            params,
            callbacks=[
                early_stop, learner_time_constraint, total_time_constraint
            ],
        )

        # start training
        logger.info(
            f"Train model #{len(self._models)+1} / Model name: {params['name']}"
        )
        mf.train(model_path)

        # save the model
        mf.save(model_path)

        # and keep info about the model
        self.keep_model(mf, model_path)
        return True

    def verbose_print(self, msg):
        if self._verbose > 0:
            # self._progress_bar.write(msg)
            print(msg)

    def ensemble_step(self, is_stacked=False):
        if self._train_ensemble and len(self._models) > 1:
            ensemble_path = os.path.join(
                self._results_path,
                "Ensemble_Stacked" if is_stacked else "Ensemble")
            self.create_dir(ensemble_path)

            self.ensemble = Ensemble(self._eval_metric, self._ml_task,
                                     is_stacked=is_stacked)
            oofs, target = self.ensemble.get_oof_matrix(self._models)
            self.ensemble.fit(oofs, target)
            self.ensemble.save(ensemble_path)
            self.keep_model(self.ensemble, ensemble_path)
            return True
        return False

    def can_we_stack_them(self, y):
        # if multiclass and too many classes then No
        return True
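    # `get_stacked_data()` below widens X with each stacked model's
    # predictions; new columns follow "<model_name>_<prediction_col>",
    # e.g. "1_Xgboost_prediction" (a naming sketch based on the loop below).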
    def get_stacked_data(self, X, mode="training"):
        # mode can be `training` or `predict`
        if self._stacked_models is None:
            return X
        all_oofs = []
        for m in self._stacked_models:
            oof = None
            if mode == "training":
                oof = m.get_out_of_folds()
            else:
                oof = m.predict(X)
                if self._ml_task == BINARY_CLASSIFICATION:
                    cols = [f for f in oof.columns if "prediction" in f]
                    if len(cols) == 2:
                        oof = pd.DataFrame({"prediction": oof[cols[1]]})

            cols = [f for f in oof.columns if "prediction" in f]
            oof = oof[cols]
            oof.columns = [f"{m.get_name()}_{c}" for c in cols]
            all_oofs += [oof]

        org_index = X.index.copy()
        X.reset_index(drop=True, inplace=True)
        X_stacked = pd.concat(all_oofs + [X], axis=1)

        X_stacked.index = org_index.copy()
        X.index = org_index.copy()
        return X_stacked

    def _perform_model_stacking(self):
        if self._stacked_models is not None:
            return
        ldb = self.get_leaderboard()
        ldb = ldb.sort_values(by="metric_value", ascending=True)

        models_map = {
            m.get_name(): m for m in self._models if not m._is_stacked
        }
        self._stacked_models = []
        models_limit = 10

        for model_type in np.unique(ldb.model_type):
            if model_type in ["Baseline"]:
                continue
            ds = ldb[ldb.model_type == model_type].copy()
            ds.sort_values(by="metric_value", inplace=True)

            for n in list(ds.name.iloc[:models_limit].values):
                self._stacked_models += [models_map[n]]

        scores = [m.get_final_loss() for m in self._stacked_models]
        self._stacked_models = [
            self._stacked_models[i] for i in np.argsort(scores).tolist()
        ]

    def prepare_for_stacking(self):
        # print("Stacked models ....")
        # do we have enough models?
        if len(self._models) < 5:
            return
        # do we have time?
        if self._total_time_limit is not None:
            time_left = self._total_time_limit - (time.time() - self._start_time)
            # we need at least 60 seconds to do anything
            if time_left < 60:
                return

        self._perform_model_stacking()

        X_stacked_path = os.path.join(self._results_path, "X_stacked.parquet")
        if os.path.exists(X_stacked_path):
            return

        X = pd.read_parquet(self._X_path)
        org_columns = X.columns.tolist()
        X_stacked = self.get_stacked_data(X)
        new_columns = X_stacked.columns.tolist()
        added_columns = [c for c in new_columns if c not in org_columns]

        # save stacked train data
        X_stacked.to_parquet(X_stacked_path, index=False)

        """
        # reuse old params
        for m in self._stacked_models:
            # print(m.get_type())
            # use only Xgboost, LightGBM and CatBoost as stacked models
            if m.get_type() not in ["Xgboost", "LightGBM", "CatBoost"]:
                continue
            params = copy.deepcopy(m.params)
            params["validation"]["X_train_path"] = X_train_stacked_path
            params["name"] = params["name"] + "_Stacked"
            params["is_stacked"] = True
            # print(params)
            if "model_architecture_json" in params["learner"]:
                # the new model will be created with wider input size
                del params["learner"]["model_architecture_json"]
            if self._ml_task == REGRESSION:
                # scale added predictions in regression
                # if the target was scaled (in the case of NN)
                target_preprocessing = params["preprocessing"]["target_preprocessing"]
                scale = None
                if "scale_log_and_normal" in target_preprocessing:
                    scale = "scale_log_and_normal"
                elif "scale_normal" in target_preprocessing:
                    scale = "scale_normal"
                if scale is not None:
                    for col in added_columns:
                        params["preprocessing"]["columns_preprocessing"][col] = [scale]
            self.train_model(params)
        """
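    # `_save_data()` below persists the training frames as parquet files
    # and records dataset metadata in data_info.json, e.g. for a 1000x20
    # frame: rows=1000, cols=20, plus per-column and target info.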
    def _save_data(self, X, y):
        self._X_path = os.path.join(self._results_path, "X.parquet")
        self._y_path = os.path.join(self._results_path, "y.parquet")

        X.to_parquet(self._X_path, index=False)

        # let's check before any conversions
        target_is_numeric = pd.api.types.is_numeric_dtype(y)
        if self._ml_task == MULTICLASS_CLASSIFICATION:
            y = y.astype(str)

        pd.DataFrame({"target": y}).to_parquet(self._y_path, index=False)

        self._validation_strategy["X_path"] = self._X_path
        self._validation_strategy["y_path"] = self._y_path
        self._validation_strategy["results_path"] = self._results_path

        columns_and_target_info = DataInfo.compute(X, y, self._ml_task)

        self.n_features_in_ = X.shape[1]
        self.n_classes = len(np.unique(y[~pd.isnull(y)]))

        self._data_info = {
            "columns": X.columns.tolist(),
            "rows": y.shape[0],
            "cols": X.shape[1],
            "target_is_numeric": target_is_numeric,
            "columns_info": columns_and_target_info["columns_info"],
            "target_info": columns_and_target_info["target_info"],
            "n_features": self.n_features_in_,
        }
        # Add n_classes if not regression
        if self._ml_task != REGRESSION:
            self._data_info["n_classes"] = self.n_classes

        if columns_and_target_info.get("num_class") is not None:
            self._data_info["num_class"] = columns_and_target_info["num_class"]

        data_info_path = os.path.join(self._results_path, "data_info.json")
        with open(data_info_path, "w") as fout:
            fout.write(json.dumps(self._data_info, indent=4))

        self._drop_data_variables(X)

    def _drop_data_variables(self, X):
        X.drop(X.columns, axis=1, inplace=True)

    def _load_data_variables(self, X_train):
        if X_train.shape[1] == 0:
            X = pd.read_parquet(self._X_path)
            for c in X.columns:
                X_train.insert(loc=X_train.shape[1], column=c, value=X[c])

        os.remove(self._X_path)
        os.remove(self._y_path)

    def save_progress(self, step=None, generated_params=None):
        if step is not None and generated_params is not None:
            self._all_params[step] = generated_params

        state = {}
        state["fit_level"] = self._fit_level
        state["time_controller"] = self._time_ctrl.to_json()
        state["all_params"] = self._all_params

        fname = os.path.join(self._results_path, "progress.json")
        with open(fname, "w") as fout:
            fout.write(json.dumps(state, indent=4))

    def load_progress(self):
        state = {}
        fname = os.path.join(self._results_path, "progress.json")
        if not os.path.exists(fname):
            return
        state = json.load(open(fname, "r"))
        self._fit_level = state.get("fit_level", self._fit_level)
        self._all_params = state.get("all_params", self._all_params)
        self._time_ctrl = TimeController.from_json(state.get("time_controller"))

    def _validate_X_predict(self, X):
        """Validate X whenever one tries to predict, apply, predict_proba"""
        # X = check_array(X, ensure_2d=False)
        X = np.atleast_2d(X)
        n_features = X.shape[1]
        if self.n_features_in_ != n_features:
            raise ValueError(
                f"Number of features of the model must match the input. Model n_features_in_ is {self.n_features_in_} and input n_features is {n_features}. Reshape your data."
            )

    # This method builds a pandas.DataFrame from the input. The input can be
    # numpy.ndarray, matrix, or pandas.DataFrame.
    # This method is used to build dataframes in `fit()` and in `predict()`.
    # That's the reason y can be None (`predict()` method).
    def _build_dataframe(self, X, y=None):
        # If input is not a pandas dataframe use scikit-learn validation for X array
        if not isinstance(X, pd.DataFrame):
            # Validate X as array
            X = check_array(X, ensure_2d=False)
            # Force X to be 2D
            X = np.atleast_2d(X)
            # Create pandas dataframe from np.arrays, columns get names
            # with the schema: feature_{index}
            X = pd.DataFrame(
                X,
                columns=["feature_" + str(i) for i in range(1, len(X[0]) + 1)])

        # Enforce column names
        # Enforce X_train columns to be string
        X.columns = X.columns.astype(str)

        X.reset_index(drop=True, inplace=True)

        if y is None:
            return X

        # Check if y is np.ndarray, transform to pd.Series
        if isinstance(y, np.ndarray):
            y = check_array(y, ensure_2d=False)
            y = pd.Series(np.array(y), name="target")
        # if pd.DataFrame, slice first column
        elif isinstance(y, pd.DataFrame):
            y = np.array(y.iloc[:, 0])
            y = check_array(y, ensure_2d=False)
            y = pd.Series(np.array(y), name="target")

        X, y = ExcludeRowsMissingTarget.transform(X, y, warn=True)

        X.reset_index(drop=True, inplace=True)
        y.reset_index(drop=True, inplace=True)

        return X, y
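    # Example of `_build_dataframe` column naming: a bare 2D numpy array
    # with three columns becomes a DataFrame with columns
    # ["feature_1", "feature_2", "feature_3"]; column names are always
    # cast to strings.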
    def _fit(self, X, y):
        """Fits the AutoML model with data"""
        if self._fit_level == "finished":
            print(
                "This model has already been fitted. You can use predict "
                "methods or select a new 'results_path' for a new 'fit()'."
            )
            return
        # Validate input and build dataframes
        X, y = self._build_dataframe(X, y)

        self.n_features_in_ = X.shape[1]
        self.n_classes = len(np.unique(y[~pd.isnull(y)]))

        # Get attributes (__init__ params)
        self._mode = self._get_mode()
        self._ml_task = self._get_ml_task()
        self._results_path = self._get_results_path()
        self._total_time_limit = self._get_total_time_limit()
        self._model_time_limit = self._get_model_time_limit()
        self._algorithms = self._get_algorithms()
        self._train_ensemble = self._get_train_ensemble()
        self._stack_models = self._get_stack_models()
        self._eval_metric = self._get_eval_metric()
        self._validation_strategy = self._get_validation_strategy()
        self._verbose = self._get_verbose()
        self._explain_level = self._get_explain_level()
        self._golden_features = self._get_golden_features()
        self._feature_selection = self._get_feature_selection()
        self._start_random_models = self._get_start_random_models()
        self._hill_climbing_steps = self._get_hill_climbing_steps()
        self._top_models_to_improve = self._get_top_models_to_improve()
        self._random_state = self._get_random_state()

        try:
            self.load_progress()
            if self._fit_level == "finished":
                print(
                    "This model has already been fitted. You can use predict "
                    "methods or select a new 'results_path' for a new 'fit()'."
                )
                return
            self._check_can_load()

            self.verbose_print(f"AutoML directory: {self._results_path}")
            self.verbose_print(
                f"The task is {self._ml_task} with evaluation metric {self._eval_metric}"
            )
            self.verbose_print(f"AutoML will use algorithms: {self._algorithms}")
            if self._stack_models:
                self.verbose_print("AutoML will stack models")
            if self._train_ensemble:
                self.verbose_print("AutoML will ensemble available models")

            self._start_time = time.time()
            if self._time_ctrl is not None:
                self._start_time -= self._time_ctrl.already_spend()

            # Automatic Exploratory Data Analysis
            if self._explain_level == 2:
                EDA.compute(X, y, os.path.join(self._results_path, "EDA"))

            # Save data
            self._save_data(X.copy(deep=False), y)

            tuner = MljarTuner(
                self._get_tuner_params(
                    self._start_random_models,
                    self._hill_climbing_steps,
                    self._top_models_to_improve,
                ),
                self._algorithms,
                self._ml_task,
                self._validation_strategy,
                self._explain_level,
                self._data_info,
                self._golden_features,
                self._feature_selection,
                self._train_ensemble,
                self._stack_models,
                self._random_state,
            )
            self.tuner = tuner

            steps = tuner.steps()
            self.verbose_print(f"AutoML steps: {steps}")
            if self._time_ctrl is None:
                self._time_ctrl = TimeController(
                    self._start_time,
                    self._total_time_limit,
                    self._model_time_limit,
                    steps,
                    self._algorithms,
                )

            self._time_ctrl.log_time(
                "prepare_data",
                "prepare_data",
                "prepare_data",
                time.time() - self._start_time,
            )
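            # The loop below is resumable: for every step the generated
            # params and each model's status are written to progress.json
            # (see save_progress/load_progress above), so a restarted fit()
            # skips models already marked "trained", "skipped" or "error".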
            for step in steps:
                self._fit_level = step
                start = time.time()
                # self._time_start[step] = start

                if step == "stack":
                    self.prepare_for_stacking()

                generated_params = []
                if step in self._all_params:
                    generated_params = self._all_params[step]
                else:
                    generated_params = tuner.generate_params(
                        step, self._models, self._results_path,
                        self._stacked_models)

                if generated_params is None or not generated_params:
                    self.verbose_print(
                        f"Skip {step} because no parameters were generated.")
                    continue
                if generated_params:
                    if "learner" in generated_params[
                            0] and not self._time_ctrl.enough_time(
                                generated_params[0]["learner"]["model_type"],
                                self._fit_level):
                        self.verbose_print(
                            f"Skip {step} because of the time limit.")
                    else:
                        model_str = "models" if len(generated_params) > 1 else "model"
                        self.verbose_print(
                            f"* Step {step} will try to check up to "
                            f"{len(generated_params)} {model_str}"
                        )

                for params in generated_params:
                    if params.get("status", "") in ["trained", "skipped", "error"]:
                        self.verbose_print(f"{params['name']}: {params['status']}.")
                        continue

                    try:
                        trained = False
                        if "ensemble" in step:
                            trained = self.ensemble_step(
                                is_stacked=params["is_stacked"])
                        else:
                            trained = self.train_model(params)
                        params["status"] = "trained" if trained else "skipped"
                        params["final_loss"] = self._models[-1].get_final_loss()
                        params["train_time"] = self._models[-1].get_train_time()
                    except Exception as e:
                        self._update_errors_report(params.get("name"), str(e))
                        params["status"] = "error"

                    self.save_progress(step, generated_params)

            self._fit_level = "finished"
            self.save_progress()

            self.verbose_print(
                f"AutoML fit time: {np.round(time.time() - self._start_time, 2)} seconds"
            )

        except Exception as e:
            raise e
        finally:
            if self._X_path is not None:
                self._load_data_variables(X)

        return self

    def _update_errors_report(self, model_name, error_msg):
        """Append error message to errors.md file."""
        errors_filename = os.path.join(self._get_results_path(), "errors.md")
        with open(errors_filename, "a") as fout:
            self.verbose_print(f"There was an error during {model_name} training.")
            self.verbose_print(f"Please check {errors_filename} for details.")
            fout.write(f"## Error for {model_name}\n\n")
            fout.write(error_msg)
            link = "https://github.com/mljar/mljar-supervised/issues/new"
            fout.write(
                f"\n\nPlease open a GitHub issue with the above error message at: {link}"
            )
            fout.write("\n\n")

    def select_and_save_best(self):
        # Select best model (lowest loss)
        self._best_model = min(self._models, key=lambda x: x.get_final_loss())

        with open(os.path.join(self._results_path, "best_model.txt"), "w") as fout:
            fout.write(f"{self._best_model.get_name()}")

        with open(os.path.join(self._results_path, "params.json"), "w") as fout:
            params = {
                "ml_task": self._ml_task,
                "eval_metric": self._eval_metric,
                "saved": self._model_paths,
            }
            if self._stacked_models is not None:
                params["stacked"] = [m.get_name() for m in self._stacked_models]
            fout.write(json.dumps(params, indent=4))

        ldb = self.get_leaderboard()
        ldb.to_csv(os.path.join(self._results_path, "leaderboard.csv"), index=False)

        # save report
        ldb["Link"] = [f"[Results link]({m}/README.md)" for m in ldb["name"].values]
        ldb.insert(loc=0, column="Best model", value="")
        ldb.loc[ldb.name == self._best_model.get_name(), "Best model"] = "**the best**"

        with open(os.path.join(self._results_path, "README.md"), "w") as fout:
            fout.write(f"# AutoML Leaderboard\n\n")
            fout.write(tabulate(ldb.values, ldb.columns, tablefmt="pipe"))
            LeaderboardPlots.compute(ldb, self._results_path, fout)

    def _check_is_fitted(self):
        # First check if model can be loaded
        self._check_can_load()
        # Check if fitted
        if self._fit_level != "finished":
            raise AutoMLException(
                "This model has not been fitted yet. Please call `fit()` first."
            )
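    # `_base_predict()` returns a DataFrame whose columns depend on the
    # task; for binary classification roughly (a sketch; the
    # "prediction_" prefix is what the [11:] slices below strip):
    #
    #   prediction_0  prediction_1  label
    #   0.92          0.08          0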
    def _base_predict(self, X):
        self._check_is_fitted()

        X = self._build_dataframe(X)
        if not isinstance(X.columns[0], str):
            X.columns = [str(c) for c in X.columns]

        input_columns = X.columns.tolist()
        for column in self._data_info["columns"]:
            if column not in input_columns:
                raise AutoMLException(
                    f"Missing column: {column} in input data. Cannot predict")

        X = X[self._data_info["columns"]]
        self._validate_X_predict(X)

        # is stacked model
        if self._best_model._is_stacked:
            self._perform_model_stacking()
            X_stacked = self.get_stacked_data(X, mode="predict")

            if self._best_model.get_type() == "Ensemble":
                # Ensemble is using both original and stacked data
                predictions = self._best_model.predict(X, X_stacked)
            else:
                predictions = self._best_model.predict(X_stacked)
        else:
            predictions = self._best_model.predict(X)

        if self._ml_task == BINARY_CLASSIFICATION:
            # need to predict the label based on predictions and threshold
            neg_label, pos_label = (
                predictions.columns[0][11:],
                predictions.columns[1][11:],
            )

            if neg_label == "0" and pos_label == "1":
                neg_label, pos_label = 0, 1
            target_is_numeric = self._data_info.get("target_is_numeric", False)
            if target_is_numeric:
                neg_label = int(neg_label)
                pos_label = int(pos_label)
            # assume that it is binary classification
            predictions["label"] = (
                predictions.iloc[:, 1] > self._best_model._threshold)
            predictions["label"] = predictions["label"].map(
                {True: pos_label, False: neg_label})
            return predictions
        elif self._ml_task == MULTICLASS_CLASSIFICATION:
            target_is_numeric = self._data_info.get("target_is_numeric", False)
            if target_is_numeric:
                predictions["label"] = predictions["label"].astype(np.int32)
            return predictions
        # Regression
        else:
            return predictions

    def _predict(self, X):
        predictions = self._base_predict(X)
        # Return predictions
        # If classification task the result is in column 'label'
        # If regression task the result is in column 'prediction'
        return (predictions["label"].to_numpy()
                if self._ml_task != REGRESSION
                else predictions["prediction"].to_numpy())

    def _predict_proba(self, X):
        # Check if task type is correct
        if self._ml_task == REGRESSION:
            raise AutoMLException(
                f"Method `predict_proba()` can only be used in classification tasks. Current task: '{self._ml_task}'."
            )

        # Make and return predictions
        # If classification task the result is in column 'label'
        # Need to drop `label` column.
        return self._base_predict(X).drop(["label"], axis=1).to_numpy()

    def _predict_all(self, X):
        # Check if task type is correct
        if self._ml_task == REGRESSION:
            raise AutoMLException(
                f"Method `predict_all()` can only be used in classification tasks. Current task: '{self._ml_task}'."
            )

        # Make and return predictions
        return self._base_predict(X)

    def _score(self, X, y=None):
        # y default must be None for scikit-learn compatibility

        # Check if y is None
        if y is None:
            raise AutoMLException("y must be specified.")

        predictions = self._predict(X)
        return (r2_score(y, predictions)
                if self._ml_task == REGRESSION
                else accuracy_score(y, predictions))

    def _get_mode(self):
        """Gets the current mode"""
        self._validate_mode()
        return deepcopy(self.mode)
    def _get_ml_task(self):
        """Gets the current ml_task. If "auto" it is determined"""
        self._validate_ml_task()
        if self.ml_task == "auto":
            classes_number = self.n_classes
            if classes_number == 2:
                self._estimator_type = "classifier"  # for sk-learn api
                return BINARY_CLASSIFICATION
            elif classes_number <= 20:
                self._estimator_type = "classifier"  # for sk-learn api
                return MULTICLASS_CLASSIFICATION
            else:
                self._estimator_type = "regressor"  # for sk-learn api
                return REGRESSION
        else:
            return deepcopy(self.ml_task)

    def _get_results_path(self):
        """Gets the current results_path"""
        # if we already have the results path set, please return it
        if self._results_path is not None:
            return self._results_path

        self._validate_results_path()

        path = self.results_path

        if path is None:
            for i in range(1, 10001):
                name = f"AutoML_{i}"
                if not os.path.exists(name):
                    self.create_dir(name)
                    self._results_path = name
                    return name
            # If it got here, could not create, raise exception
            raise AutoMLException("Cannot create directory for AutoML results")
        elif os.path.exists(self.results_path) and os.path.exists(
                os.path.join(self.results_path, "params.json")):
            # AutoML already loaded, return path
            self._results_path = path
            return path
        # Dir does not exist, create it
        elif not os.path.exists(path):
            self.create_dir(path)
            self._results_path = path
            return path
        # Dir exists and is empty, use it
        elif os.path.exists(path) and not len(os.listdir(path)):
            self._results_path = path
            return path
        elif os.path.exists(path) and len(os.listdir(path)):
            raise AutoMLException(
                f"Cannot set directory for AutoML. Directory '{path}' is not empty."
            )

        raise AutoMLException("Cannot set directory for AutoML results")

    def _get_total_time_limit(self):
        """Gets the current total_time_limit"""
        self._validate_total_time_limit()
        return deepcopy(self.total_time_limit)

    def _get_model_time_limit(self):
        """Gets the current model_time_limit"""
        self._validate_model_time_limit()
        return deepcopy(self.model_time_limit)
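    # The "auto" algorithm lists below scale with mode: Explain favours
    # fast, explainable models; Perform and Compete progressively add
    # more algorithms.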
    def _get_algorithms(self):
        """Gets the current algorithms. If "auto" it is determined"""
        self._validate_algorithms()
        if self.algorithms == "auto":
            if self._get_mode() == "Explain":
                return [
                    "Baseline",
                    "Linear",
                    "Decision Tree",
                    "Random Forest",
                    "Xgboost",
                    "Neural Network",
                ]
            if self._get_mode() == "Perform":
                return [
                    "Linear",
                    "Random Forest",
                    "LightGBM",
                    "Xgboost",
                    "CatBoost",
                    "Neural Network",
                ]
            if self._get_mode() == "Compete":
                return [
                    "Linear",
                    "Decision Tree",
                    "Random Forest",
                    "Extra Trees",
                    "LightGBM",
                    "Xgboost",
                    "CatBoost",
                    "Neural Network",
                    "Nearest Neighbors",
                ]
        else:
            return deepcopy(self.algorithms)

    def _get_train_ensemble(self):
        """Gets the current train_ensemble"""
        self._validate_train_ensemble()
        return deepcopy(self.train_ensemble)

    def _get_stack_models(self):
        """Gets the current stack_models"""
        self._validate_stack_models()
        if self.stack_models == "auto":
            return True if self.mode == "Compete" else False
        else:
            return deepcopy(self.stack_models)

    def _get_eval_metric(self):
        """Gets the current eval_metric"""
        self._validate_eval_metric()
        if self.eval_metric == "auto":
            if self._get_ml_task() == BINARY_CLASSIFICATION:
                return "logloss"
            elif self._get_ml_task() == MULTICLASS_CLASSIFICATION:
                return "logloss"
            elif self._get_ml_task() == REGRESSION:
                return "rmse"
        else:
            return deepcopy(self.eval_metric)

    def _get_validation_strategy(self):
        """Gets the current validation_strategy"""
        strat = {}
        self._validate_validation_strategy()
        if self.validation_strategy == "auto":
            if self._get_mode() == "Explain":
                strat = {
                    "validation_type": "split",
                    "train_ratio": 0.75,
                    "shuffle": True,
                    "stratify": True,
                }
            elif self._get_mode() == "Perform":
                strat = {
                    "validation_type": "kfold",
                    "k_folds": 5,
                    "shuffle": True,
                    "stratify": True,
                }
            elif self._get_mode() == "Compete":
                strat = {
                    "validation_type": "kfold",
                    "k_folds": 10,
                    "shuffle": True,
                    "stratify": True,
                }
            if self._get_ml_task() == REGRESSION:
                if "stratify" in strat:
                    # it's better to always check
                    # before delete (trust me)
                    del strat["stratify"]
            return strat
        else:
            strat = deepcopy(self.validation_strategy)
            if "stratify" in strat:
                del strat["stratify"]
            return strat

    def _get_verbose(self):
        """Gets the current verbose"""
        self._validate_verbose()
        return deepcopy(self.verbose)

    def _get_explain_level(self):
        """Gets the current explain_level"""
        self._validate_explain_level()
        if self.explain_level == "auto":
            if self._get_mode() == "Explain":
                return 2
            if self._get_mode() == "Perform":
                return 1
            if self._get_mode() == "Compete":
                return 0
        else:
            return deepcopy(self.explain_level)

    def _get_golden_features(self):
        self._validate_golden_features()
        if self.golden_features == "auto":
            if self._get_mode() == "Explain":
                return False
            if self._get_mode() == "Perform":
                return True
            if self._get_mode() == "Compete":
                return True
        else:
            return deepcopy(self.golden_features)

    def _get_feature_selection(self):
        """Gets the current feature_selection"""
        self._validate_feature_selection()
        if self.feature_selection == "auto":
            if self._get_mode() == "Explain":
                return False
            if self._get_mode() == "Perform":
                return True
            if self._get_mode() == "Compete":
                return True
        else:
            return deepcopy(self.feature_selection)

    def _get_start_random_models(self):
        """Gets the current start_random_models"""
        self._validate_start_random_models()
        if self.start_random_models == "auto":
            if self._get_mode() == "Explain":
                return 1
            if self._get_mode() == "Perform":
                return 5
            if self._get_mode() == "Compete":
                return 10
        else:
            return deepcopy(self.start_random_models)
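    # Combined "auto" tuner defaults per mode (collected from the getters
    # above and below):
    #
    #   mode     start_random_models  hill_climbing_steps  top_models_to_improve
    #   Explain  1                    0                    0
    #   Perform  5                    2                    2
    #   Compete  10                   2                    3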
    def _get_hill_climbing_steps(self):
        """Gets the current hill_climbing_steps"""
        self._validate_hill_climbing_steps()
        if self.hill_climbing_steps == "auto":
            if self._get_mode() == "Explain":
                return 0
            if self._get_mode() == "Perform":
                return 2
            if self._get_mode() == "Compete":
                return 2
        else:
            return deepcopy(self.hill_climbing_steps)

    def _get_top_models_to_improve(self):
        """Gets the current top_models_to_improve"""
        self._validate_top_models_to_improve()
        if self.top_models_to_improve == "auto":
            if self._get_mode() == "Explain":
                return 0
            if self._get_mode() == "Perform":
                return 2
            if self._get_mode() == "Compete":
                return 3
        else:
            return deepcopy(self.top_models_to_improve)

    def _get_random_state(self):
        """Gets the current random_state"""
        self._validate_random_state()
        return deepcopy(self.random_state)

    def _validate_mode(self):
        """Validates mode parameter"""
        valid_modes = ["Explain", "Perform", "Compete"]
        if self.mode not in valid_modes:
            raise ValueError(
                f"Expected 'mode' to be {' or '.join(valid_modes)}, got '{self.mode}'"
            )

    def _validate_ml_task(self):
        """Validates ml_task parameter"""
        if isinstance(self.ml_task, str) and self.ml_task == "auto":
            return
        if self.ml_task not in AlgorithmsRegistry.get_supported_ml_tasks():
            raise ValueError(
                f"Expected 'ml_task' to be {' or '.join(AlgorithmsRegistry.get_supported_ml_tasks())}, got '{self.ml_task}'"
            )

    def _validate_results_path(self):
        """Validates path parameter"""
        if self.results_path is None or isinstance(self.results_path, str):
            return
        raise ValueError(
            f"Expected 'results_path' to be of type string, got '{type(self.results_path)}'"
        )

    def _validate_total_time_limit(self):
        """Validates total_time_limit parameter"""
        check_greater_than_zero_integer(self.total_time_limit, "total_time_limit")

    def _validate_model_time_limit(self):
        """Validates model_time_limit parameter"""
        if self.model_time_limit is not None:
            check_greater_than_zero_integer(self.model_time_limit,
                                            "model_time_limit")

    def _validate_algorithms(self):
        """Validates algorithms parameter"""
        if isinstance(self.algorithms, str) and self.algorithms == "auto":
            return
        for algo in self.algorithms:
            if algo not in list(AlgorithmsRegistry.registry[self._ml_task].keys()):
                raise ValueError(
                    f"The algorithm {algo} is not allowed to use for ML task: {self._ml_task}. Allowed algorithms: {list(AlgorithmsRegistry.registry[self._ml_task].keys())}"
                )

    def _validate_train_ensemble(self):
        """Validates train_ensemble parameter"""
        # `train_ensemble` defaults to True, no further checking required
        check_bool(self.train_ensemble, "train_ensemble")

    def _validate_stack_models(self):
        """Validates stack_models parameter"""
        # `stack_models` defaults to "auto". If "auto" return, else check if it is a valid bool
        if isinstance(self.stack_models, str) and self.stack_models == "auto":
            return
        check_bool(self.stack_models, "stack_models")

    def _validate_eval_metric(self):
        """Validates eval_metric parameter"""
        # `eval_metric` defaults to "auto". If not "auto", check if it is allowed for the task
        if isinstance(self.eval_metric, str) and self.eval_metric == "auto":
            return
        if (self._get_ml_task() == BINARY_CLASSIFICATION
                or self._get_ml_task() == MULTICLASS_CLASSIFICATION
            ) and self.eval_metric != "logloss":
            raise ValueError(
                f"Metric {self.eval_metric} is not allowed in ML task: {self._get_ml_task()}. Use 'logloss'"
            )
        elif self._get_ml_task() == REGRESSION and self.eval_metric != "rmse":
            raise ValueError(
                f"Metric {self.eval_metric} is not allowed in ML task: {self._get_ml_task()}. Use 'rmse'"
            )
    def _validate_validation_strategy(self):
        """Validates validation parameter"""
        if (isinstance(self.validation_strategy, str)
                and self.validation_strategy == "auto"):
            return

        # only validation_type is mandatory
        # other parameters of validations
        # have defaults set in their constructors
        required_keys = ["validation_type"]
        if type(self.validation_strategy) is not dict:
            raise ValueError(
                f"Expected 'validation_strategy' to be a dict, got '{type(self.validation_strategy)}'"
            )
        if not all(key in self.validation_strategy for key in required_keys):
            raise ValueError(
                f"Expected dict with keys: {', '.join(required_keys)}")

    def _validate_verbose(self):
        """Validates verbose parameter"""
        check_positive_integer(self.verbose, "verbose")

    def _validate_explain_level(self):
        """Validates explain_level parameter"""
        if isinstance(self.explain_level, str) and self.explain_level == "auto":
            return
        valid_explain_levels = [0, 1, 2]
        # Check if explain level is 0 or greater integer
        if not (isinstance(self.explain_level, int)
                and self.explain_level in valid_explain_levels):
            raise ValueError(
                f"Expected 'explain_level' to be {' or '.join([str(x) for x in valid_explain_levels])}, got '{self.explain_level}'"
            )

    def _validate_golden_features(self):
        """Validates golden_features parameter"""
        if isinstance(self.golden_features, str) and self.golden_features == "auto":
            return
        check_bool(self.golden_features, "golden_features")

    def _validate_feature_selection(self):
        """Validates feature_selection parameter"""
        if isinstance(self.feature_selection, str) and self.feature_selection == "auto":
            return
        check_bool(self.feature_selection, "feature_selection")

    def _validate_start_random_models(self):
        """Validates start_random_models parameter"""
        if (isinstance(self.start_random_models, str)
                and self.start_random_models == "auto"):
            return
        check_greater_than_zero_integer(self.start_random_models,
                                        "start_random_models")

    def _validate_hill_climbing_steps(self):
        """Validates hill_climbing_steps parameter"""
        if (isinstance(self.hill_climbing_steps, str)
                and self.hill_climbing_steps == "auto"):
            return
        check_positive_integer(self.hill_climbing_steps, "hill_climbing_steps")

    def _validate_top_models_to_improve(self):
        """Validates top_models_to_improve parameter"""
        if (isinstance(self.top_models_to_improve, str)
                and self.top_models_to_improve == "auto"):
            return
        check_positive_integer(self.top_models_to_improve,
                               "top_models_to_improve")

    def _validate_random_state(self):
        """Validates random_state parameter"""
        check_positive_integer(self.random_state, "random_state")

    def to_json(self):
        if self._best_model is None:
            return None

        return {
            "best_model": self._best_model.to_json(),
            "threshold": self._threshold,
            "ml_task": self._ml_task,
        }

    def from_json(self, json_data):
        if json_data["best_model"]["algorithm_short_name"] == "Ensemble":
            self._best_model = Ensemble()
            self._best_model.from_json(json_data["best_model"])
        else:
            self._best_model = ModelFramework(json_data["best_model"].get("params"))
            self._best_model.from_json(json_data["best_model"])
        self._threshold = json_data.get("threshold")
        self._ml_task = json_data.get("ml_task")
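# Minimal usage sketch, assuming the public `AutoML` estimator that the
# `supervised` package derives from BaseAutoML (names are illustrative):
#
#   from supervised import AutoML
#
#   automl = AutoML(mode="Explain", total_time_limit=100)
#   automl.fit(X_train, y_train)
#   predictions = automl.predict(X_test)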