def test_save_and_load(self):
    # sanity check that the raw training data is loaded
    self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))
    early_stop = EarlyStopping({"metric": {"name": "logloss"}})
    metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
    il = IterativeLearner(self.train_params, callbacks=[early_stop, metric_logger])
    il.train(self.data)

    y_predicted = il.predict(self.data["train"]["X"])
    metric = Metric({"name": "logloss"})
    loss_1 = metric(self.data["train"]["y"], y_predicted)

    # serialize the learner and restore it into a fresh instance
    json_desc = il.to_json()
    il2 = IterativeLearner(self.train_params, callbacks=[])
    self.assertTrue(il.uid != il2.uid)
    il2.from_json(json_desc)
    self.assertTrue(il.uid == il2.uid)

    # the restored learner should reproduce the original predictions
    y_predicted_2 = il2.predict(self.data["train"]["X"])
    loss_2 = metric(self.data["train"]["y"], y_predicted_2)
    assert_almost_equal(loss_1, loss_2)

    uids = [i.uid for i in il.learners]
    uids2 = [i.uid for i in il2.learners]
    for u in uids:
        self.assertTrue(u in uids2)
def test_save_and_load(self):
    il = IterativeLearner(self.train_params, callbacks=[])
    il.train(self.data)
    metric = Metric({"name": "logloss"})
    loss = metric(self.y, il.predict(self.X))

    json_desc = il.to_json()
    il2 = IterativeLearner(json_desc.get("params"), callbacks=[])
    self.assertTrue(il.uid != il2.uid)
    il2.from_json(json_desc)
    self.assertTrue(il.uid == il2.uid)

    loss2 = metric(self.y, il2.predict(self.X))
    assert_almost_equal(loss, loss2)

    uids = [i.uid for i in il.learners]
    uids2 = [i.uid for i in il2.learners]
    for u in uids:
        self.assertTrue(u in uids2)
def from_json(self, json_desc):
    self.library_version = json_desc.get("library_version", self.library_version)
    self.algorithm_name = json_desc.get("algorithm_name", self.algorithm_name)
    self.algorithm_short_name = json_desc.get(
        "algorithm_short_name", self.algorithm_short_name
    )
    self.uid = json_desc.get("uid", self.uid)
    self.selected_models = []
    models_json = json_desc.get("models")
    for selected in models_json:
        model = selected["model"]
        repeat = selected["repeat"]
        il = IterativeLearner(model.get("params"))
        il.from_json(model)
        self.selected_models += [{"model": il, "repeat": repeat}]
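# A minimal sketch of the serialization roundtrip this method supports,
# assuming an already fitted Ensemble instance `ensemble`; `Ensemble`,
# `to_json` and `from_json` come from the code above, everything else
# here is illustrative.
ensemble_json = ensemble.to_json()
restored = Ensemble()
restored.from_json(ensemble_json)
# restored.selected_models now holds IterativeLearner instances rebuilt
# from the serialized description, each paired with its "repeat" count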
import copy
import time

import numpy as np
import pandas as pd

# project-internal imports (IterativeLearner, Ensemble, ModelsRegistry,
# callbacks, preprocessing, ...) are assumed to be available in this module


class AutoML:
    def __init__(
        self,
        total_time_limit=60 * 60,
        learner_time_limit=120,
        algorithms=["CatBoost", "Xgboost", "RF", "LightGBM", "NN"],
        start_random_models=10,
        hill_climbing_steps=3,
        top_models_to_improve=5,
        train_ensemble=True,
        verbose=True,
    ):
        self._total_time_limit = total_time_limit
        # time limit in seconds for training a single learner
        self._time_limit = learner_time_limit
        self._train_ensemble = train_ensemble
        # instances of the iterative learner framework or ensemble
        self._models = []
        self._models_params_keys = []
        # the best model, an IterativeLearner or Ensemble instance
        self._best_model = None
        self._validation = {"validation_type": "kfold", "k_folds": 5, "shuffle": True}
        self._start_random_models = start_random_models
        self._hill_climbing_steps = hill_climbing_steps
        self._top_models_to_improve = top_models_to_improve
        self._algorithms = algorithms
        self._verbose = verbose

        if self._total_time_limit is not None:
            estimated_models_to_check = (
                len(self._algorithms)
                * (
                    self._start_random_models
                    + self._top_models_to_improve * self._hill_climbing_steps * 2
                )
                * 5
            )
            # set the time limit for training a single model;
            # 0.85 is a safety factor so the total time limit is not exceeded
            self._time_limit = (
                self._total_time_limit * 0.85 / estimated_models_to_check
            )

        if len(self._algorithms) == 0:
            self._algorithms = list(
                ModelsRegistry.registry[BINARY_CLASSIFICATION].keys()
            )

        self._fit_time = None
        self._models_train_time = {}
        self._threshold = None
        self._metrics_details = None
        self._max_metrics = None
        self._confusion_matrix = None

    def get_additional_metrics(self):
        # "target" - the target after preprocessing, as used for model training
        # "prediction" - out-of-folds predictions of the model
        oof_predictions = self._best_model.get_out_of_folds()
        self._metrics_details, self._max_metrics, self._confusion_matrix = ComputeAdditionalMetrics.compute(
            oof_predictions["target"],
            oof_predictions["prediction"],
            BINARY_CLASSIFICATION,
        )
        self._threshold = self._max_metrics["f1"]["threshold"]

    def _get_model_params(self, model_type, X, y):
        model_info = ModelsRegistry.registry[BINARY_CLASSIFICATION][model_type]
        model_params = RandomParameters.get(model_info["params"])
        required_preprocessing = model_info["required_preprocessing"]
        model_additional = model_info["additional"]
        preprocessing_params = PreprocessingTuner.get(
            required_preprocessing, {"train": {"X": X, "y": y}}, BINARY_CLASSIFICATION
        )
        return {
            "additional": model_additional,
            "preprocessing": preprocessing_params,
            "validation": self._validation,
            "learner": {
                "model_type": model_info["class"].algorithm_short_name,
                **model_params,
            },
        }

    def train_model(self, params, X, y):
        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        time_constraint = TimeConstraint(
            {"train_seconds_time_limit": self._time_limit}
        )
        il = IterativeLearner(params, callbacks=[early_stop, time_constraint])
        il_key = il.get_params_key()
        # skip parameter sets that were already checked
        if il_key in self._models_params_keys:
            return None
        self._models_params_keys += [il_key]
        if self.should_train_next(il.get_name()):
            il.train({"train": {"X": X, "y": y}})
            return il
        return None

    def verbose_print(self, msg):
        if self._verbose:
            print(msg)

    def log_train_time(self, model_type, train_time):
        if model_type in self._models_train_time:
            self._models_train_time[model_type] += [train_time]
        else:
            self._models_train_time[model_type] = [train_time]

    def should_train_next(self, model_type):
        # no time limit, just train, don't ask
        if self._total_time_limit is None:
            return True
        total_time_already_spent = (
            0
            if model_type not in self._models_train_time
            else np.sum(self._models_train_time[model_type])
        )
        mean_time_already_spent = (
            0
            if model_type not in self._models_train_time
            else np.mean(self._models_train_time[model_type])
        )
        # allow training only while this algorithm stays within its share
        # of the (safety-scaled) total time budget
        if (
            total_time_already_spent + mean_time_already_spent
            < 0.85 * self._total_time_limit / float(len(self._algorithms))
        ):
            return True
        return False

    def not_so_random_step(self, X, y):
        for model_type in self._algorithms:
            for i in range(self._start_random_models):
                params = self._get_model_params(model_type, X, y)
                m = self.train_model(params, X, y)
                if m is not None:
                    self._models += [m]
                    self.verbose_print(
                        "Learner {} final loss {} time {}".format(
                            m.get_name(), m.get_final_loss(), m.get_train_time()
                        )
                    )
                    self.log_train_time(m.get_name(), m.get_train_time())

    def hill_climbing_step(self, X, y):
        for hill_climbing in range(self._hill_climbing_steps):
            # get models ordered by loss
            models = []
            for m in self._models:
                models += [(m.callbacks.callbacks[0].final_loss, m)]
            models = sorted(models, key=lambda x: x[0])
            for i in range(min(self._top_models_to_improve, len(models))):
                m = models[i][1]
                for p in HillClimbing.get(m.params.get("learner")):
                    if p is not None:
                        all_params = copy.deepcopy(m.params)
                        all_params["learner"] = p
                        new_model = self.train_model(all_params, X, y)
                        if new_model is not None:
                            self._models += [new_model]
                            self.verbose_print(
                                "Learner {} final loss {} time {}".format(
                                    new_model.get_name(),
                                    new_model.get_final_loss(),
                                    new_model.get_train_time(),
                                )
                            )
                            self.log_train_time(
                                new_model.get_name(), new_model.get_train_time()
                            )

    def ensemble_step(self, y):
        if self._train_ensemble:
            self.ensemble = Ensemble()
            X_oof = self.ensemble.get_oof_matrix(self._models)
            self.ensemble.fit(X_oof, y)
            self._models += [self.ensemble]
            self.verbose_print(
                "Learner {} final loss {} time {}".format(
                    self.ensemble.get_name(),
                    self.ensemble.get_final_loss(),
                    self.ensemble.get_train_time(),
                )
            )
            self.log_train_time(
                self.ensemble.get_name(), self.ensemble.get_train_time()
            )

    def fit(self, X, y):
        start_time = time.time()
        X.reset_index(drop=True, inplace=True)
        y = np.array(y)
        if not isinstance(y, pd.DataFrame):
            y = pd.DataFrame({"target": y})
        y.reset_index(drop=True, inplace=True)
        y = y["target"]
        # drop rows with missing target
        X, y = PreprocessingExcludeMissingValues.transform(X, y)
        # start with not-so-random models
        self.not_so_random_step(X, y)
        # perform hill climbing steps on the best models
        self.hill_climbing_step(X, y)
        # train the ensemble
        self.ensemble_step(y)
        # select the model with the lowest loss
        min_loss = 10e12
        for i, m in enumerate(self._models):
            if m.get_final_loss() < min_loss:
                self._best_model = m
                min_loss = m.get_final_loss()
        self.get_additional_metrics()
        self._fit_time = time.time() - start_time

    def predict(self, X):
        if self._best_model is not None:
            predictions = self._best_model.predict(X)
            # class labels are encoded in the prediction column names;
            # strip the 2-character prefix to recover them
            neg_label, pos_label = (
                predictions.columns[0][2:],
                predictions.columns[1][2:],
            )
            if neg_label == "0" and pos_label == "1":
                neg_label, pos_label = 0, 1
            # assume that it is binary classification
            predictions["label"] = predictions.iloc[:, 1] > self._threshold
            predictions["label"] = predictions["label"].map(
                {True: pos_label, False: neg_label}
            )
            return predictions
        return None

    def to_json(self):
        if self._best_model is None:
            return None
        return {
            "best_model": self._best_model.to_json(),
            "threshold": self._threshold,
        }

    def from_json(self, json_data):
        if json_data["best_model"]["algorithm_short_name"] == "Ensemble":
            self._best_model = Ensemble()
        else:
            self._best_model = IterativeLearner(
                json_data["best_model"].get("params")
            )
        self._best_model.from_json(json_data["best_model"])
        self._threshold = json_data.get("threshold")