def __call__(self, trial):
    try:
        params = {
            "n_neighbors": trial.suggest_int("n_neighbors", 1, 128),
            "weights": trial.suggest_categorical(
                "weights", ["uniform", "distance"]
            ),
            "n_jobs": self.n_jobs,
            "rows_limit": 100000,
            "ml_task": self.ml_task,
        }
        Algorithm = (
            KNeighborsRegressorAlgorithm
            if self.ml_task == REGRESSION
            else KNeighborsAlgorithm
        )
        model = Algorithm(params)
        model.fit(self.X_train, self.y_train, sample_weight=self.sample_weight)

        preds = model.predict(self.X_validation)

        score = self.eval_metric(self.y_validation, preds)
        if Metric.optimize_negative(self.eval_metric.name):
            score *= -1.0
    except optuna.exceptions.TrialPruned as e:
        raise e
    except Exception as e:
        print("Exception in KNNObjective", str(e))
        return None

    return score
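# Usage sketch (not from the source): an objective like the one above returns
# a float for Optuna to maximize, or None on failure (Optuna records such a
# trial as failed and continues). A driver could look roughly like this; the
# constructor arguments and the MedianPruner choice are assumptions (only
# n_warmup_steps = 500 appears in the tuner __init__ further below):
#
#   objective = KNNObjective(...)  # hypothetical construction
#   study = optuna.create_study(
#       direction="maximize",
#       sampler=optuna.samplers.TPESampler(seed=42),
#       pruner=optuna.pruners.MedianPruner(n_warmup_steps=500),
#   )
#   study.optimize(objective, timeout=3600)
#   print(study.best_params)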
def plot_iterations(learner_names,
                    metric_name,
                    model_path,
                    colors,
                    trees_in_iteration=None):
    plt.figure(figsize=(10, 7))
    for ln in learner_names:
        df = pd.read_csv(
            os.path.join(model_path, f"{ln}_training.log"),
            names=["iteration", "train", "test"],
        )
        fold, repeat = learner_name_to_fold_repeat(ln)
        repeat_str = f" Repeat {repeat+1}," if repeat is not None else ""
        # if trees_in_iteration is not None:
        #     df.iteration = df.iteration * trees_in_iteration
        plt.plot(
            df.iteration,
            df.train,
            "--",
            color=colors[fold],
            label=f"Fold {fold+1},{repeat_str} train",
        )
        any_none = np.sum(pd.isnull(df.test))
        if any_none == 0:
            plt.plot(
                df.iteration,
                df.test,
                color=colors[fold],
                label=f"Fold {fold+1},{repeat_str} test",
            )
            best_iter = None
            if Metric.optimize_negative(metric_name):
                best_iter = df.test.argmax()
            else:
                best_iter = df.test.argmin()
            if best_iter is not None and best_iter != -1:
                plt.axvline(best_iter, color=colors[fold], alpha=0.3)

    if trees_in_iteration is not None:
        plt.xlabel("#Trees")
    else:
        plt.xlabel("#Iteration")
    plt.ylabel(metric_name)

    # limit the number of learners in the legend,
    # too many will raise warnings
    if len(learner_names) <= 15:
        plt.legend(loc="best")
    plt.tight_layout(pad=2.0)
    plot_path = os.path.join(model_path, LearningCurves.output_file_name)
    plt.savefig(plot_path)
    plt.close("all")
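# The log parsed above is a headerless CSV appended during training (see
# EarlyStopping.on_iteration_end at the end of this section): one
# "iteration,train,test" row per iteration. Illustrative contents:
#
#   1,0.6931,0.6950
#   2,0.6714,0.6883
#   3,0.6590,0.6871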
def __call__(self, trial):
    param = {
        "objective": self.objective,
        "eval_metric": self.eval_metric_name,
        "tree_method": "hist",
        "booster": "gbtree",
        "eta": trial.suggest_categorical("eta", [0.0125, 0.025, 0.05, 0.1]),
        "max_depth": trial.suggest_int("max_depth", 2, 12),
        "lambda": trial.suggest_float("lambda", EPS, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", EPS, 10.0, log=True),
        "colsample_bytree": min(
            trial.suggest_float("colsample_bytree", 0.3, 1.0 + EPS), 1.0
        ),
        "subsample": min(trial.suggest_float("subsample", 0.3, 1.0 + EPS), 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 100),
        "n_jobs": self.n_jobs,
        "seed": self.seed,
    }
    if self.num_class is not None:
        param["num_class"] = self.num_class

    try:
        pruning_callback = optuna.integration.XGBoostPruningCallback(
            trial, f"validation-{self.eval_metric_name}"
        )
        bst = xgb.train(
            param,
            self.dtrain,
            self.rounds,
            evals=[(self.dvalidation, "validation")],
            early_stopping_rounds=self.early_stopping_rounds,
            callbacks=[pruning_callback],
            verbose_eval=False,
        )
        preds = bst.predict(self.dvalidation, ntree_limit=bst.best_ntree_limit)
        score = self.eval_metric(self.y_validation, preds)
        if Metric.optimize_negative(self.eval_metric.name):
            score *= -1.0
    except optuna.exceptions.TrialPruned as e:
        raise e
    except Exception as e:
        print("Exception in XgboostObjective", str(e))
        return None

    return score
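# Note: `ntree_limit` has been deprecated since XGBoost 1.4; on recent
# XGBoost the equivalent prediction call (a sketch, not from the source) is:
#
#   preds = bst.predict(
#       self.dvalidation,
#       iteration_range=(0, bst.best_iteration + 1),
#   )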
def __init__(
    self,
    results_path,
    ml_task,
    eval_metric,
    time_budget=3600,
    init_params={},
    verbose=True,
    n_jobs=-1,
    random_state=42,
):
    if eval_metric.name not in [
        "auc",
        "logloss",
        "rmse",
        "mse",
        "mae",
        "mape",
        "r2",
        "spearman",
        "pearson",
        "f1",
        "average_precision",
        "accuracy",
        "user_defined_metric",
    ]:
        raise AutoMLException(f"Metric {eval_metric.name} is not supported")

    self.study_dir = os.path.join(results_path, "optuna")
    if not os.path.exists(self.study_dir):
        try:
            os.mkdir(self.study_dir)
        except Exception as e:
            print("Problem while creating directory for optuna studies.", str(e))
    self.tuning_fname = os.path.join(self.study_dir, "optuna.json")
    self.tuning = init_params
    self.eval_metric = eval_metric

    self.direction = (
        "maximize" if Metric.optimize_negative(eval_metric.name) else "minimize"
    )
    self.n_warmup_steps = (
        500  # set large enough to give small learning rates a chance
    )
    self.time_budget = time_budget
    self.verbose = verbose
    self.ml_task = ml_task
    self.n_jobs = n_jobs
    self.random_state = random_state
    self.cat_features_indices = []
    self.load()
    if not self.verbose:
        optuna.logging.set_verbosity(optuna.logging.CRITICAL)
def __init__(
    self,
    results_path,
    ml_task,
    eval_metric,
    time_budget=3600,
    init_params={},
    verbose=True,
    n_jobs=-1,
    random_state=42,
):
    if eval_metric.name not in ["auc", "logloss", "rmse", "mae", "mape"]:
        raise AutoMLException(f"Metric {eval_metric.name} is not supported")

    self.study_dir = os.path.join(results_path, "optuna")
    if not os.path.exists(self.study_dir):
        try:
            os.mkdir(self.study_dir)
        except Exception as e:
            print("Problem while creating directory for optuna studies.", str(e))
    self.tuning_fname = os.path.join(self.study_dir, "optuna.json")
    self.tuning = init_params
    self.eval_metric = eval_metric

    self.direction = (
        "maximize" if Metric.optimize_negative(eval_metric.name) else "minimize"
    )
    self.n_warmup_steps = 500  # set large enough to give small learning rates a chance
    self.time_budget = time_budget
    self.verbose = verbose
    self.ml_task = ml_task
    self.n_jobs = n_jobs
    self.random_state = random_state

    self.cat_features_indices = []
    data_info_fname = os.path.join(results_path, "data_info.json")
    if os.path.exists(data_info_fname):
        data_info = json.loads(open(data_info_fname).read())
        for i, (k, v) in enumerate(data_info["columns_info"].items()):
            if "categorical" in v:
                self.cat_features_indices += [i]

    self.load()
    if not self.verbose:
        optuna.logging.set_verbosity(optuna.logging.CRITICAL)
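# Illustrative fragment of `data_info.json` as consumed above (the exact
# schema is an assumption; the loop only requires that a column's info
# contains "categorical"):
#
#   {
#       "columns_info": {
#           "age": ["scale"],
#           "color": ["categorical"]
#       }
#   }
#
# With this input, `cat_features_indices` would be [1], the position of
# "color" in the columns' insertion order.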
def compute(ldb, model_path, fout):
    if ldb.shape[0] < 2:
        return

    # Scatter plot
    plt.figure(figsize=(10, 7))
    plt.plot(ldb.metric_value, "*")
    plt.xlabel("#Iteration")
    plt.ylabel(ldb.metric_type.iloc[0])
    plt.title("AutoML Performance")
    plt.tight_layout(pad=2.0)
    plot_path = os.path.join(model_path, LeaderboardPlots.performance_fname)
    plt.savefig(plot_path)
    plt.close("all")

    fout.write("\n\n### AutoML Performance\n")
    fout.write(f"![AutoML Performance]({LeaderboardPlots.performance_fname})")

    # Boxplot
    by = "model_type"
    column = "metric_value"
    df2 = pd.DataFrame({col: vals[column] for col, vals in ldb.groupby(by)})

    ascending_sort = Metric.optimize_negative(ldb.metric_type.iloc[0])
    mins = df2.min().sort_values(ascending=ascending_sort)

    plt.figure(figsize=(10, 7))
    # plt.title("")
    plt.ylabel(ldb.metric_type.iloc[0])
    df2[mins.index].boxplot(rot=90, fontsize=12)
    plt.tight_layout(pad=2.0)
    plot_path = os.path.join(model_path, LeaderboardPlots.performance_boxplot_fname)
    plt.savefig(plot_path)
    plt.close("all")

    fout.write("\n\n### AutoML Performance Boxplot\n")
    fout.write(
        f"![AutoML Performance Boxplot]({LeaderboardPlots.performance_boxplot_fname})"
    )
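# Shape of the `ldb` leaderboard frame assumed by the code above
# (illustrative rows, not from the source): one row per trained model with
# at least `model_type`, `metric_type`, and `metric_value` columns, e.g.
#
#   model_type  metric_type  metric_value
#   Baseline    logloss      0.6931
#   Xgboost     logloss      0.3412
#   LightGBM    logloss      0.3296
#
# The groupby step pivots `metric_value` into one boxplot column per
# `model_type`, ordered by each type's minimum value (sort direction
# depends on the metric).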
def __call__(self, trial):
    try:
        Algorithm = (
            MLPRegressorAlgorithm if self.ml_task == REGRESSION else MLPAlgorithm
        )
        params = {
            "dense_1_size": trial.suggest_int("dense_1_size", 4, 100),
            "dense_2_size": trial.suggest_int("dense_2_size", 2, 100),
            "learning_rate": trial.suggest_categorical(
                "learning_rate", [0.005, 0.01, 0.05, 0.1, 0.2]
            ),
            "learning_rate_type": trial.suggest_categorical(
                "learning_rate_type", ["constant", "adaptive"]
            ),
            "alpha": trial.suggest_float("alpha", 1e-8, 10.0, log=True),
            "seed": self.seed,
            "ml_task": self.ml_task,
        }
        model = Algorithm(params)
        model.fit(self.X_train, self.y_train, sample_weight=self.sample_weight)

        preds = model.predict(self.X_validation)

        score = self.eval_metric(self.y_validation, preds)
        if Metric.optimize_negative(self.eval_metric.name):
            score *= -1.0
    except optuna.exceptions.TrialPruned as e:
        raise e
    except Exception as e:
        print("Exception in NeuralNetworkObjective", str(e))
        return None

    return score
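# Sketch (assumption, not from the source) of how these parameters could map
# onto scikit-learn's MLP, if MLPAlgorithm wraps it:
#
#   from sklearn.neural_network import MLPClassifier
#
#   clf = MLPClassifier(
#       hidden_layer_sizes=(params["dense_1_size"], params["dense_2_size"]),
#       learning_rate_init=params["learning_rate"],
#       learning_rate=params["learning_rate_type"],  # "constant" or "adaptive"
#       alpha=params["alpha"],  # L2 regularization strength
#       random_state=params["seed"],
#   )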
def __call__(self, trial):
    try:
        Algorithm = (
            ExtraTreesRegressorAlgorithm
            if self.ml_task == REGRESSION
            else ExtraTreesAlgorithm
        )
        self.objective = (
            "mse"
            if self.ml_task == REGRESSION
            else trial.suggest_categorical("criterion", ["gini", "entropy"])
        )
        params = {
            "max_steps": self.max_steps,
            "criterion": self.objective,
            "max_depth": trial.suggest_int("max_depth", 2, 32),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 100),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 100),
            "max_features": trial.suggest_float("max_features", 0.01, 1),
            "n_jobs": self.n_jobs,
            "seed": self.seed,
            "ml_task": self.ml_task,
        }
        model = Algorithm(params)
        model.fit(self.X_train, self.y_train, sample_weight=self.sample_weight)

        preds = model.predict(self.X_validation)

        score = self.eval_metric(self.y_validation, preds)
        if Metric.optimize_negative(self.eval_metric.name):
            score *= -1.0
    except optuna.exceptions.TrialPruned as e:
        raise e
    except Exception as e:
        print("Exception in ExtraTreesObjective", str(e))
        return None

    return score
def __call__(self, trial):
    try:
        Algorithm = (
            ExtraTreesRegressor if self.ml_task == REGRESSION else ExtraTreesClassifier
        )
        model = Algorithm(
            n_estimators=self.max_steps * 100,
            criterion=self.objective,
            max_depth=trial.suggest_int("max_depth", 2, 16),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 100),
            min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 100),
            max_features=trial.suggest_float("max_features", 1e-8, 1),
            n_jobs=self.n_jobs,
            random_state=self.seed,
        )
        model.fit(self.X_train, self.y_train, sample_weight=self.sample_weight)

        if self.ml_task == BINARY_CLASSIFICATION:
            preds = model.predict_proba(self.X_validation)[:, 1]
        elif self.ml_task == MULTICLASS_CLASSIFICATION:
            preds = model.predict_proba(self.X_validation)
        else:  # REGRESSION
            preds = model.predict(self.X_validation)

        score = self.eval_metric(self.y_validation, preds)
        if Metric.optimize_negative(self.eval_metric.name):
            score *= -1.0
    except optuna.exceptions.TrialPruned as e:
        raise e
    except Exception as e:
        print("Exception in ExtraTreesObjective", str(e))
        return None

    return score
def __call__(self, trial):
    param = {
        "objective": self.objective,
        "metric": self.eval_metric_name,
        "verbosity": -1,
        "boosting_type": "gbdt",
        "learning_rate": trial.suggest_categorical(
            "learning_rate", [0.0125, 0.025, 0.05, 0.1]
        ),
        "num_leaves": trial.suggest_int("num_leaves", 2, 2048),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "feature_fraction": min(
            trial.suggest_float("feature_fraction", 0.3, 1.0 + EPS), 1.0
        ),
        "bagging_fraction": min(
            trial.suggest_float("bagging_fraction", 0.3, 1.0 + EPS), 1.0
        ),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "feature_pre_filter": False,
        "seed": self.seed,
        "num_threads": self.n_jobs,
        "extra_trees": trial.suggest_categorical("extra_trees", [True, False]),
    }

    if self.cat_features_indices:
        param["cat_feature"] = self.cat_features_indices
        param["cat_l2"] = trial.suggest_float("cat_l2", EPS, 100.0)
        param["cat_smooth"] = trial.suggest_float("cat_smooth", EPS, 100.0)

    if self.num_class is not None:
        param["num_class"] = self.num_class

    try:
        metric_name = self.eval_metric_name
        if metric_name == "custom":
            metric_name = self.custom_eval_metric_name
        pruning_callback = optuna.integration.LightGBMPruningCallback(
            trial, metric_name, "validation"
        )
        gbm = lgb.train(
            param,
            self.dtrain,
            valid_sets=[self.dvalid],
            valid_names=["validation"],
            verbose_eval=False,
            callbacks=[pruning_callback],
            num_boost_round=self.rounds,
            early_stopping_rounds=self.early_stopping_rounds,
            feval=self.custom_eval_metric,
        )
        preds = gbm.predict(self.X_validation)
        score = self.eval_metric(self.y_validation, preds)
        if Metric.optimize_negative(self.eval_metric.name):
            score *= -1.0
    except optuna.exceptions.TrialPruned as e:
        raise e
    # except Exception as e:
    #     print("Exception in LightgbmObjective", str(e))
    #     return None

    return score
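# Note: LightGBM 4.0 removed the `verbose_eval` and `early_stopping_rounds`
# arguments of `lgb.train`; on recent versions the equivalent call (a sketch,
# not from the source) uses callbacks instead:
#
#   gbm = lgb.train(
#       param,
#       self.dtrain,
#       valid_sets=[self.dvalid],
#       valid_names=["validation"],
#       num_boost_round=self.rounds,
#       feval=self.custom_eval_metric,
#       callbacks=[
#           pruning_callback,
#           lgb.early_stopping(self.early_stopping_rounds, verbose=False),
#           lgb.log_evaluation(period=0),
#       ],
#   )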
def fit(self, oofs, y, sample_weight=None):
    logger.debug("Ensemble.fit")
    start_time = time.time()
    selected_algs_cnt = 0  # number of selected algorithms
    self.best_algs = []  # selected algorithms indices from each loop

    total_prediction_time = 0
    best_sum = None  # sum of the best algorithms
    for j in range(len(oofs)):  # iterate over all solutions
        min_score = self.metric.get_maximum()
        best_model = None
        # try to add some algorithm to the best_sum to minimize metric
        for model_name in oofs.keys():
            if (
                self._max_single_prediction_time
                and model_name in self.model_prediction_time
            ):
                if (
                    total_prediction_time + self.model_prediction_time[model_name]
                    > self._max_single_prediction_time
                ):
                    continue
            y_ens = self._get_mean(oofs[model_name], best_sum, j + 1)
            score = self.metric(y, y_ens, sample_weight)

            if self.metric.improvement(previous=min_score, current=score):
                min_score = score
                best_model = model_name

        if best_model is None:
            continue

        # there is improvement, save it
        # save scores for plotting learning curve
        # if we optimize negative, then we need to multiply by -1.0
        # to save correct values in the learning curve
        sign = -1.0 if Metric.optimize_negative(self.metric.name) else 1.0
        self._scores += [sign * min_score]

        if self.metric.improvement(previous=self.best_loss, current=min_score):
            self.best_loss = min_score
            selected_algs_cnt = j

        self.best_algs.append(best_model)  # save the best algorithm
        # update best_sum value
        best_sum = (
            oofs[best_model] if best_sum is None else best_sum + oofs[best_model]
        )
        if j == selected_algs_cnt:
            self.total_best_sum = copy.deepcopy(best_sum)

        # update prediction time estimate
        if self._max_single_prediction_time is not None:
            total_prediction_time = np.sum(
                [
                    self.model_prediction_time[name]
                    for name in np.unique(self.best_algs)
                ]
            )
    # end of main loop
    if not self.best_algs:
        raise NotTrainedException("Ensemble wasn't fitted.")

    # keep oof predictions of ensemble
    self.total_best_sum /= float(selected_algs_cnt + 1)
    self.best_algs = self.best_algs[: (selected_algs_cnt + 1)]

    logger.debug("Selected models for ensemble:")
    for model_name in np.unique(self.best_algs):
        self.selected_models += [
            {
                "model": self.models_map[model_name],
                "repeat": float(self.best_algs.count(model_name)),
            }
        ]
        logger.debug(f"{model_name} {self.best_algs.count(model_name)}")

    self._additional_metrics = self.get_additional_metrics()
    self.train_time = time.time() - start_time
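# The loop above is greedy forward selection over out-of-fold predictions
# (Caruana-style ensemble selection, with repeated picks allowed): at step
# j + 1 it tries appending each model to the running sum and keeps the one
# whose mean scores best. A minimal sketch of the averaging helper it relies
# on (assumption: `_get_mean` is not shown in this excerpt), consistent with
# the later division by `selected_algs_cnt + 1`:
#
#   def _get_mean(self, oof, best_sum, k):
#       # mean of k picks: the running sum of k - 1 selections plus `oof`
#       total = oof if best_sum is None else best_sum + oof
#       return total / float(k)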
def __call__(self, trial):
    try:
        params = {
            "iterations": self.rounds,
            "learning_rate": trial.suggest_categorical(
                "learning_rate", [0.05, 0.1, 0.2]
            ),
            "depth": trial.suggest_int("depth", 2, 9),
            "l2_leaf_reg": trial.suggest_float(
                "l2_leaf_reg", 0.0001, 10.0, log=False
            ),
            "random_strength": trial.suggest_float(
                "random_strength", EPS, 10.0, log=False
            ),
            "rsm": trial.suggest_float("rsm", 0.1, 1),  # colsample_bylevel=rsm
            "loss_function": self.objective,
            "eval_metric": self.eval_metric_name,
            "verbose": False,
            "allow_writing_files": False,
            "thread_count": self.n_jobs,
            "random_seed": self.seed,
            # "border_count": trial.suggest_int("border_count", 16, 2048),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
            # "bootstrap_type": trial.suggest_categorical(
            #     "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
            # ),
        }
        # if params["bootstrap_type"] == "Bayesian":
        #     params["bagging_temperature"] = trial.suggest_float(
        #         "bagging_temperature", 0, 10
        #     )
        # elif params["bootstrap_type"] in ["Bernoulli", "MVS"]:
        #     params["subsample"] = trial.suggest_float("subsample", 0.1, 1)

        Algorithm = (
            CatBoostRegressor if self.ml_task == REGRESSION else CatBoostClassifier
        )
        if self.custom_eval_metric is not None:
            params["eval_metric"] = self.custom_eval_metric

        model = Algorithm(**params)
        model.fit(
            self.X_train,
            self.y_train,
            sample_weight=self.sample_weight,
            early_stopping_rounds=self.early_stopping_rounds,
            eval_set=self.eval_set,
            verbose_eval=False,
            cat_features=self.cat_features,
        )

        if self.ml_task == BINARY_CLASSIFICATION:
            preds = model.predict_proba(
                self.X_validation, ntree_end=model.best_iteration_ + 1
            )[:, 1]
        elif self.ml_task == MULTICLASS_CLASSIFICATION:
            preds = model.predict_proba(
                self.X_validation, ntree_end=model.best_iteration_ + 1
            )
        else:  # REGRESSION
            preds = model.predict(
                self.X_validation, ntree_end=model.best_iteration_ + 1
            )

        score = self.eval_metric(self.y_validation, preds)
        if Metric.optimize_negative(self.eval_metric.name):
            score *= -1.0
    except optuna.exceptions.TrialPruned as e:
        raise e
    except Exception as e:
        print("Exception in CatBoostObjective", str(e))
        return None

    return score
def on_iteration_end(self, logs, predictions):
    train_loss = 0
    if predictions.get("y_train_predicted") is not None:
        train_loss = self.metric(
            predictions.get("y_train_true"),
            predictions.get("y_train_predicted"),
            predictions.get("sample_weight"),
        )
    validation_loss = self.metric(
        predictions.get("y_validation_true"),
        predictions.get("y_validation_predicted"),
        predictions.get("sample_weight_validation"),
    )
    self.loss_values[self.learner.uid]["train"] += [train_loss]
    self.loss_values[self.learner.uid]["validation"] += [validation_loss]
    self.loss_values[self.learner.uid]["iters"] += [logs.get("iter_cnt")]

    if self.metric.improvement(
        previous=self.best_loss[self.learner.uid], current=validation_loss
    ):
        y_validation_true = predictions.get("y_validation_true")
        self.no_improvement_cnt = 0
        self.best_iter[self.learner.uid] = logs.get("iter_cnt")
        self.best_loss[self.learner.uid] = validation_loss

        if len(y_validation_true.shape) == 1 or y_validation_true.shape[1] == 1:
            self.best_y_predicted[self.learner.uid] = pd.DataFrame(
                {"target": np.array(y_validation_true)},
                index=predictions.get("validation_index"),
            )
            self.multiple_target = False
            self.target_columns = "target"
        else:
            # in case of Neural Networks and multi-class classification
            # with one-hot encoding
            self.best_y_predicted[self.learner.uid] = pd.DataFrame(
                y_validation_true, index=predictions.get("validation_index")
            )
            self.multiple_target = True
            self.target_columns = y_validation_true.columns

        y_validation_predicted = predictions.get("y_validation_predicted")
        if len(y_validation_predicted.shape) == 1:
            # only one prediction column (binary classification or regression)
            self.best_y_predicted[self.learner.uid]["prediction"] = np.array(
                y_validation_predicted
            )
        else:
            # several columns in multiclass classification
            cols = predictions.get("validation_columns")
            for i_col in range(y_validation_predicted.shape[1]):
                self.best_y_predicted[self.learner.uid][
                    cols[i_col]
                ] = y_validation_predicted[:, i_col]

        # store sample_weight
        sample_weight_validation = predictions.get("sample_weight_validation")
        if sample_weight_validation is not None:
            self.best_y_predicted[self.learner.uid]["sample_weight"] = np.array(
                sample_weight_validation
            )

        self.best_models[self.learner.uid] = self.learner.copy()
        # if local copy is not available, save model and keep path
        if self.best_models[self.learner.uid] is None:
            self.best_model_paths[self.learner.uid] = self.learner.save()
    else:
        self.no_improvement_cnt += 1

    if self.no_improvement_cnt > self.max_no_improvement_cnt:
        self.learner.stop_training = True

    logger.info(
        "EarlyStopping.on_iteration_end, train loss: {}, validation loss: {}, "
        "no improvement cnt {}, iters {}".format(
            train_loss,
            validation_loss,
            self.no_improvement_cnt,
            len(self.loss_values[self.learner.uid]["iters"]),
        )
    )

    if self.log_to_dir is not None and self.learner.algorithm_short_name not in [
        "Xgboost",
        "Random Forest",
        "Extra Trees",
        "LightGBM",
        "CatBoost",
        "Neural Network",
    ]:
        sign = -1.0 if Metric.optimize_negative(self.metric.name) else 1.0
        with open(
            os.path.join(self.log_to_dir, f"{self.learner.name}_training.log"), "a"
        ) as fout:
            iteration = len(self.loss_values[self.learner.uid]["iters"])
            fout.write(f"{iteration},{sign*train_loss},{sign*validation_loss}\n")