Example #1
    def __call__(self, trial):
        try:
            params = {
                "n_neighbors": trial.suggest_int("n_neighbors", 1, 128),
                "weights": trial.suggest_categorical(
                    "weights", ["uniform", "distance"]
                ),
                "n_jobs": self.n_jobs,
                "rows_limit": 100000,
                "ml_task": self.ml_task,
            }
            Algorithm = (
                KNeighborsRegressorAlgorithm
                if self.ml_task == REGRESSION
                else KNeighborsAlgorithm
            )
            model = Algorithm(params)
            model.fit(self.X_train, self.y_train, sample_weight=self.sample_weight)
            preds = model.predict(self.X_validation)

            score = self.eval_metric(self.y_validation, preds)
            if Metric.optimize_negative(self.eval_metric.name):
                score *= -1.0

        except optuna.exceptions.TrialPruned as e:
            raise e
        except Exception as e:
            print("Exception in KNNObjective", str(e))
            return None

        return score
Example #2
    def plot_iterations(learner_names,
                        metric_name,
                        model_path,
                        colors,
                        trees_in_iteration=None):
        plt.figure(figsize=(10, 7))
        for ln in learner_names:
            df = pd.read_csv(
                os.path.join(model_path, f"{ln}_training.log"),
                names=["iteration", "train", "test"],
            )

            fold, repeat = learner_name_to_fold_repeat(ln)
            repeat_str = f" Reapeat {repeat+1}," if repeat is not None else ""
            # if trees_in_iteration is not None:
            #    df.iteration = df.iteration * trees_in_iteration
            plt.plot(
                df.iteration,
                df.train,
                "--",
                color=colors[fold],
                label=f"Fold {fold+1},{repeat_str} train",
            )
            any_none = np.sum(pd.isnull(df.test))
            if any_none == 0:
                plt.plot(
                    df.iteration,
                    df.test,
                    color=colors[fold],
                    label=f"Fold {fold+1},{repeat_str} test",
                )

            best_iter = None
            if Metric.optimize_negative(metric_name):
                best_iter = df.test.argmax()
            else:
                best_iter = df.test.argmin()

            if best_iter is not None and best_iter != -1:
                plt.axvline(best_iter, color=colors[fold], alpha=0.3)

        if trees_in_iteration is not None:
            plt.xlabel("#Trees")
        else:
            plt.xlabel("#Iteration")
        plt.ylabel(metric_name)

        # limit number of learners in the legend
        # too many will raise warnings
        if len(learner_names) <= 15:
            plt.legend(loc="best")

        plt.tight_layout(pad=2.0)
        plot_path = os.path.join(model_path, LearningCurves.output_file_name)
        plt.savefig(plot_path)
        plt.close("all")
Example #3
    def __call__(self, trial):
        param = {
            "objective": self.objective,
            "eval_metric": self.eval_metric_name,
            "tree_method": "hist",
            "booster": "gbtree",
            "eta": trial.suggest_categorical("eta", [0.0125, 0.025, 0.05, 0.1]),
            "max_depth": trial.suggest_int("max_depth", 2, 12),
            "lambda": trial.suggest_float("lambda", EPS, 10.0, log=True),
            "alpha": trial.suggest_float("alpha", EPS, 10.0, log=True),
            "colsample_bytree": min(
                trial.suggest_float("colsample_bytree", 0.3, 1.0 + EPS), 1.0
            ),
            "subsample": min(trial.suggest_float("subsample", 0.3, 1.0 + EPS), 1.0),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 100),
            "n_jobs": self.n_jobs,
            "seed": self.seed,
        }
        if self.num_class is not None:
            param["num_class"] = self.num_class
        try:
            pruning_callback = optuna.integration.XGBoostPruningCallback(
                trial, f"validation-{self.eval_metric_name}")
            bst = xgb.train(
                param,
                self.dtrain,
                self.rounds,
                evals=[(self.dvalidation, "validation")],
                early_stopping_rounds=self.early_stopping_rounds,
                callbacks=[pruning_callback],
                verbose_eval=False,
            )
            preds = bst.predict(self.dvalidation,
                                ntree_limit=bst.best_ntree_limit)
            score = self.eval_metric(self.y_validation, preds)
            if Metric.optimize_negative(self.eval_metric.name):
                score *= -1.0
        except optuna.exceptions.TrialPruned as e:
            raise e
        except Exception as e:
            print("Exception in XgboostObjective", str(e))
            return None

        return score
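
This objective assumes that self.dtrain and self.dvalidation are already prepared xgboost DMatrix objects, with the remaining attributes (rounds, early stopping, metric) set in the constructor. A minimal sketch of how the two matrices could be built; the variable names are placeholders and not part of the library.

import xgboost as xgb

# Placeholder splits; in the real tuner these come from the AutoML validation strategy.
dtrain = xgb.DMatrix(X_train, label=y_train, weight=sample_weight)
dvalidation = xgb.DMatrix(X_validation, label=y_validation)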
Example #4
    def __init__(
        self,
        results_path,
        ml_task,
        eval_metric,
        time_budget=3600,
        init_params={},
        verbose=True,
        n_jobs=-1,
        random_state=42,
    ):
        if eval_metric.name not in [
                "auc",
                "logloss",
                "rmse",
                "mse",
                "mae",
                "mape",
                "r2",
                "spearman",
                "pearson",
                "f1",
                "average_precision",
                "accuracy",
                "user_defined_metric",
        ]:
            raise AutoMLException(
                f"Metric {eval_metric.name} is not supported")

        self.study_dir = os.path.join(results_path, "optuna")
        if not os.path.exists(self.study_dir):
            try:
                os.mkdir(self.study_dir)
            except Exception as e:
                print("Problem while creating directory for optuna studies.",
                      str(e))
        self.tuning_fname = os.path.join(self.study_dir, "optuna.json")
        self.tuning = init_params
        self.eval_metric = eval_metric

        self.direction = ("maximize" if Metric.optimize_negative(
            eval_metric.name) else "minimize")
        self.n_warmup_steps = (
            500  # set large enough to give small learning rates a chance
        )
        self.time_budget = time_budget
        self.verbose = verbose
        self.ml_task = ml_task
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.cat_features_indices = []
        self.load()
        if not self.verbose:
            optuna.logging.set_verbosity(optuna.logging.CRITICAL)
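
The attributes prepared in this constructor map directly onto Optuna's study API: direction decides whether scores are maximized or minimized, n_warmup_steps would typically feed a pruner, and time_budget becomes the optimize timeout. A sketch under those assumptions; this helper is not part of the library, and the objective argument stands for a callable like the ones in the other examples.

import optuna

def run_study(objective, direction, n_warmup_steps, time_budget, random_state):
    # Hypothetical helper showing how the tuner's settings could drive a study.
    study = optuna.create_study(
        direction=direction,
        sampler=optuna.samplers.TPESampler(seed=random_state),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=n_warmup_steps),
    )
    study.optimize(objective, timeout=time_budget)
    return study.best_params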
Example #5
    def __init__(
        self,
        results_path,
        ml_task,
        eval_metric,
        time_budget=3600,
        init_params={},
        verbose=True,
        n_jobs=-1,
        random_state=42,
    ):
        if eval_metric.name not in ["auc", "logloss", "rmse", "mae", "mape"]:
            raise AutoMLException(
                f"Metric {eval_metric.name} is not supported")

        self.study_dir = os.path.join(results_path, "optuna")
        if not os.path.exists(self.study_dir):
            try:
                os.mkdir(self.study_dir)
            except Exception as e:
                print("Problem while creating directory for optuna studies.",
                      str(e))
        self.tuning_fname = os.path.join(self.study_dir, "optuna.json")
        self.tuning = init_params
        self.eval_metric = eval_metric

        self.direction = ("maximize" if Metric.optimize_negative(
            eval_metric.name) else "minimize")
        self.n_warmup_steps = 500  # set large enough to give small learning rates a chance
        self.time_budget = time_budget
        self.verbose = verbose
        self.ml_task = ml_task
        self.n_jobs = n_jobs
        self.random_state = random_state

        self.cat_features_indices = []
        data_info_fname = os.path.join(results_path, "data_info.json")
        if os.path.exists(data_info_fname):
            data_info = json.loads(open(data_info_fname).read())
            for i, (k, v) in enumerate(data_info["columns_info"].items()):
                if "categorical" in v:
                    self.cat_features_indices += [i]

        self.load()
        if not self.verbose:
            optuna.logging.set_verbosity(optuna.logging.CRITICAL)
Example #6
    def compute(ldb, model_path, fout):
        if ldb.shape[0] < 2:
            return
        # Scatter plot
        plt.figure(figsize=(10, 7))
        plt.plot(ldb.metric_value, "*")
        plt.xlabel("#Iteration")
        plt.ylabel(ldb.metric_type.iloc[0])
        plt.title("AutoML Performance")
        plt.tight_layout(pad=2.0)
        plot_path = os.path.join(model_path,
                                 LeaderboardPlots.performance_fname)
        plt.savefig(plot_path)
        plt.close("all")

        fout.write("\n\n### AutoML Performance\n")
        fout.write(
            f"![AutoML Performance]({LeaderboardPlots.performance_fname})")

        # Boxplot
        by = "model_type"
        column = "metric_value"
        df2 = pd.DataFrame(
            {col: vals[column] for col, vals in ldb.groupby(by)}
        )

        ascending_sort = Metric.optimize_negative(ldb.metric_type.iloc[0])
        mins = df2.min().sort_values(ascending=ascending_sort)

        plt.figure(figsize=(10, 7))
        # plt.title("")
        plt.ylabel(ldb.metric_type.iloc[0])
        df2[mins.index].boxplot(rot=90, fontsize=12)

        plt.tight_layout(pad=2.0)
        plot_path = os.path.join(model_path,
                                 LeaderboardPlots.performance_boxplot_fname)
        plt.savefig(plot_path)
        plt.close("all")

        fout.write("\n\n### AutoML Performance Boxplot\n")
        fout.write(
            f"![AutoML Performance Boxplot]({LeaderboardPlots.performance_boxplot_fname})"
        )
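
A usage sketch with a hand-made leaderboard frame: the column names (model_type, metric_type, metric_value) follow the ones referenced above, everything else is made up for illustration.

import pandas as pd

ldb = pd.DataFrame(
    {
        "model_type": ["LightGBM", "Xgboost", "LightGBM"],
        "metric_type": ["logloss", "logloss", "logloss"],
        "metric_value": [0.31, 0.33, 0.29],
    }
)
with open("README.md", "a") as fout:  # hypothetical report file
    LeaderboardPlots.compute(ldb, model_path=".", fout=fout)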
Example #7
    def __call__(self, trial):
        try:
            Algorithm = (
                MLPRegressorAlgorithm if self.ml_task == REGRESSION else MLPAlgorithm
            )
            params = {
                "dense_1_size": trial.suggest_int("dense_1_size", 4, 100),
                "dense_2_size": trial.suggest_int("dense_2_size", 2, 100),
                "learning_rate": trial.suggest_categorical(
                    "learning_rate", [0.005, 0.01, 0.05, 0.1, 0.2]
                ),
                "learning_rate_type": trial.suggest_categorical(
                    "learning_rate_type", ["constant", "adaptive"]
                ),
                "alpha": trial.suggest_float("alpha", 1e-8, 10.0, log=True),
                "seed": self.seed,
                "ml_task": self.ml_task,
            }
            model = Algorithm(params)
            model.fit(self.X_train,
                      self.y_train,
                      sample_weight=self.sample_weight)

            preds = model.predict(self.X_validation)

            score = self.eval_metric(self.y_validation, preds)
            if Metric.optimize_negative(self.eval_metric.name):
                score *= -1.0

        except optuna.exceptions.TrialPruned as e:
            raise e
        except Exception as e:
            print("Exception in NeuralNetworkObjective", str(e))
            return None

        return score
Example #8
    def __call__(self, trial):
        try:
            Algorithm = (
                ExtraTreesRegressorAlgorithm
                if self.ml_task == REGRESSION
                else ExtraTreesAlgorithm
            )
            self.objective = (
                "mse"
                if self.ml_task == REGRESSION
                else trial.suggest_categorical("criterion", ["gini", "entropy"])
            )
            params = {
                "max_steps": self.max_steps,
                "criterion": self.objective,
                "max_depth": trial.suggest_int("max_depth", 2, 32),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 100),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 100),
                "max_features": trial.suggest_float("max_features", 0.01, 1),
                "n_jobs": self.n_jobs,
                "seed": self.seed,
                "ml_task": self.ml_task,
            }
            model = Algorithm(params)

            model.fit(self.X_train,
                      self.y_train,
                      sample_weight=self.sample_weight)

            preds = model.predict(self.X_validation)

            score = self.eval_metric(self.y_validation, preds)
            if Metric.optimize_negative(self.eval_metric.name):
                score *= -1.0

        except optuna.exceptions.TrialPruned as e:
            raise e
        except Exception as e:
            print("Exception in ExtraTreesObjective", str(e))
            return None

        return score
Example #9
    def __call__(self, trial):
        try:
            Algorithm = (
                ExtraTreesRegressor if self.ml_task == REGRESSION else ExtraTreesClassifier
            )
            model = Algorithm(
                n_estimators=self.max_steps * 100,
                criterion=self.objective,
                max_depth=trial.suggest_int("max_depth", 2, 16),
                min_samples_split=trial.suggest_int("min_samples_split", 2, 100),
                min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 100),
                max_features=trial.suggest_float("max_features", 1e-8, 1),
                n_jobs=self.n_jobs,
                random_state=self.seed,
            )
            model.fit(self.X_train,
                      self.y_train,
                      sample_weight=self.sample_weight)

            if self.ml_task == BINARY_CLASSIFICATION:
                preds = model.predict_proba(self.X_validation)[:, 1]
            elif self.ml_task == MULTICLASS_CLASSIFICATION:
                preds = model.predict_proba(self.X_validation)
            else:  # REGRESSION
                preds = model.predict(self.X_validation)

            score = self.eval_metric(self.y_validation, preds)
            if Metric.optimize_negative(self.eval_metric.name):
                score *= -1.0

        except optuna.exceptions.TrialPruned as e:
            raise e
        except Exception as e:
            print("Exception in ExtraTreesObjective", str(e))
            return None

        return score
Example #10
    def __call__(self, trial):
        param = {
            "objective": self.objective,
            "metric": self.eval_metric_name,
            "verbosity": -1,
            "boosting_type": "gbdt",
            "learning_rate": trial.suggest_categorical(
                "learning_rate", [0.0125, 0.025, 0.05, 0.1]
            ),
            "num_leaves": trial.suggest_int("num_leaves", 2, 2048),
            "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
            "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
            "feature_fraction": min(
                trial.suggest_float("feature_fraction", 0.3, 1.0 + EPS), 1.0
            ),
            "bagging_fraction": min(
                trial.suggest_float("bagging_fraction", 0.3, 1.0 + EPS), 1.0
            ),
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
            "feature_pre_filter": False,
            "seed": self.seed,
            "num_threads": self.n_jobs,
            "extra_trees": trial.suggest_categorical("extra_trees", [True, False]),
        }

        if self.cat_features_indices:
            param["cat_feature"] = self.cat_features_indices
            param["cat_l2"] = trial.suggest_float("cat_l2", EPS, 100.0)
            param["cat_smooth"] = trial.suggest_float("cat_smooth", EPS, 100.0)

        if self.num_class is not None:
            param["num_class"] = self.num_class

        try:

            metric_name = self.eval_metric_name
            if metric_name == "custom":
                metric_name = self.custom_eval_metric_name
            pruning_callback = optuna.integration.LightGBMPruningCallback(
                trial, metric_name, "validation")

            gbm = lgb.train(
                param,
                self.dtrain,
                valid_sets=[self.dvalid],
                valid_names=["validation"],
                verbose_eval=False,
                callbacks=[pruning_callback],
                num_boost_round=self.rounds,
                early_stopping_rounds=self.early_stopping_rounds,
                feval=self.custom_eval_metric,
            )

            preds = gbm.predict(self.X_validation)
            score = self.eval_metric(self.y_validation, preds)
            if Metric.optimize_negative(self.eval_metric.name):
                score *= -1.0
        except optuna.exceptions.TrialPruned as e:
            raise e
        #except Exception as e:
        #    print("Exception in LightgbmObjective", str(e))
        #    return None

        return score
Example #11
    def fit(self, oofs, y, sample_weight=None):
        logger.debug("Ensemble.fit")
        start_time = time.time()
        selected_algs_cnt = 0  # number of selected algorithms
        self.best_algs = []  # names of the models selected in each loop iteration

        total_prediction_time = 0
        best_sum = None  # running sum of the best algorithms' oof predictions
        for j in range(len(oofs)):  # iterate over all solutions
            min_score = self.metric.get_maximum()
            best_model = None
            # try to add some algorithm to the best_sum to minimize metric
            for model_name in oofs.keys():
                if (self._max_single_prediction_time
                        and model_name in self.model_prediction_time):
                    if (total_prediction_time +
                            self.model_prediction_time[model_name] >
                            self._max_single_prediction_time):
                        continue
                y_ens = self._get_mean(oofs[model_name], best_sum, j + 1)
                score = self.metric(y, y_ens, sample_weight)
                if self.metric.improvement(previous=min_score, current=score):
                    min_score = score
                    best_model = model_name

            if best_model is None:
                continue
            # there is improvement, save it
            # save scores for plotting learning curve
            # if we optimize negative, then we need to multiply by -1.0
            # to save correct values in the learning curve
            sign = -1.0 if Metric.optimize_negative(self.metric.name) else 1.0
            self._scores += [sign * min_score]

            if self.metric.improvement(previous=self.best_loss,
                                       current=min_score):
                self.best_loss = min_score
                selected_algs_cnt = j

            self.best_algs.append(best_model)  # save the best algorithm
            # update best_sum value
            best_sum = (
                oofs[best_model] if best_sum is None else best_sum + oofs[best_model]
            )
            if j == selected_algs_cnt:
                self.total_best_sum = copy.deepcopy(best_sum)

            # update prediction time estimate
            if self._max_single_prediction_time is not None:
                total_prediction_time = np.sum([
                    self.model_prediction_time[name]
                    for name in np.unique(self.best_algs)
                ])
        # end of main loop #

        if not self.best_algs:
            raise NotTrainedException("Ensemble wasn't fitted.")

        # keep oof predictions of ensemble
        self.total_best_sum /= float(selected_algs_cnt + 1)
        self.best_algs = self.best_algs[:(selected_algs_cnt + 1)]

        logger.debug("Selected models for ensemble:")
        for model_name in np.unique(self.best_algs):
            self.selected_models += [
                {
                    "model": self.models_map[model_name],
                    "repeat": float(self.best_algs.count(model_name)),
                }
            ]
            logger.debug(f"{model_name} {self.best_algs.count(model_name)}")

        self._additional_metrics = self.get_additional_metrics()

        self.train_time = time.time() - start_time
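
The _get_mean helper is not shown in this example. Based on how best_sum and total_best_sum are used above (a running sum of oof predictions divided by the number of selected models), a plausible reconstruction is the method sketch below; treat it as an assumption, not the library's actual implementation.

    def _get_mean(self, candidate_oof, best_sum, k):
        # Assumed behavior: the mean of the would-be ensemble of size k, i.e. the
        # accumulated sum of the already selected models' oof predictions plus one
        # candidate, divided by k.
        if best_sum is None:
            return candidate_oof
        return (best_sum + candidate_oof) / float(k)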
Example #12
    def __call__(self, trial):
        try:
            params = {
                "iterations": self.rounds,
                "learning_rate": trial.suggest_categorical(
                    "learning_rate", [0.05, 0.1, 0.2]
                ),
                "depth": trial.suggest_int("depth", 2, 9),
                "l2_leaf_reg": trial.suggest_float(
                    "l2_leaf_reg", 0.0001, 10.0, log=False
                ),
                "random_strength": trial.suggest_float(
                    "random_strength", EPS, 10.0, log=False
                ),
                "rsm": trial.suggest_float("rsm", 0.1, 1),  # colsample_bylevel=rsm
                "loss_function": self.objective,
                "eval_metric": self.eval_metric_name,
                "verbose": False,
                "allow_writing_files": False,
                "thread_count": self.n_jobs,
                "random_seed": self.seed,
                # "border_count": trial.suggest_int("border_count", 16, 2048),
                "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
                # "bootstrap_type": "Bernoulli"
                # trial.suggest_categorical(
                #    "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
                # ),
            }
            # if params["bootstrap_type"] == "Bayesian":
            #    params["bagging_temperature"] = trial.suggest_float(
            #        "bagging_temperature", 0, 10
            #    )
            # elif params["bootstrap_type"] in ["Bernoulli", "MVS"]:
            # params["subsample"] = trial.suggest_float("subsample", 0.1, 1)

            Algorithm = (
                CatBoostRegressor if self.ml_task == REGRESSION else CatBoostClassifier
            )
            if self.custom_eval_metric is not None:
                params["eval_metric"] = self.custom_eval_metric
            model = Algorithm(**params)

            model.fit(
                self.X_train,
                self.y_train,
                sample_weight=self.sample_weight,
                early_stopping_rounds=self.early_stopping_rounds,
                eval_set=self.eval_set,
                verbose_eval=False,
                cat_features=self.cat_features,
            )

            if self.ml_task == BINARY_CLASSIFICATION:
                preds = model.predict_proba(
                    self.X_validation, ntree_end=model.best_iteration_ + 1
                )[:, 1]
            elif self.ml_task == MULTICLASS_CLASSIFICATION:
                preds = model.predict_proba(
                    self.X_validation, ntree_end=model.best_iteration_ + 1
                )
            else:  # REGRESSION
                preds = model.predict(
                    self.X_validation, ntree_end=model.best_iteration_ + 1
                )

            score = self.eval_metric(self.y_validation, preds)
            if Metric.optimize_negative(self.eval_metric.name):
                score *= -1.0

        except optuna.exceptions.TrialPruned as e:
            raise e
        except Exception as e:
            print("Exception in CatBoostObjective", str(e))
            return None

        return score
Example #13
    def on_iteration_end(self, logs, predictions):
        train_loss = 0
        if predictions.get("y_train_predicted") is not None:
            train_loss = self.metric(
                predictions.get("y_train_true"),
                predictions.get("y_train_predicted"),
                predictions.get("sample_weight"),
            )

        validation_loss = self.metric(
            predictions.get("y_validation_true"),
            predictions.get("y_validation_predicted"),
            predictions.get("sample_weight_validation"),
        )
        self.loss_values[self.learner.uid]["train"] += [train_loss]
        self.loss_values[self.learner.uid]["validation"] += [validation_loss]
        self.loss_values[self.learner.uid]["iters"] += [logs.get("iter_cnt")]

        if self.metric.improvement(previous=self.best_loss[self.learner.uid],
                                   current=validation_loss):

            y_validation_true = predictions.get("y_validation_true")
            self.no_improvement_cnt = 0
            self.best_iter[self.learner.uid] = logs.get("iter_cnt")
            self.best_loss[self.learner.uid] = validation_loss

            if len(y_validation_true.shape) == 1 or y_validation_true.shape[1] == 1:
                self.best_y_predicted[self.learner.uid] = pd.DataFrame(
                    {
                        "target": np.array(y_validation_true)
                        # y_validation_true.values.reshape(
                        #    y_validation_true.shape[0]
                        # )
                    },
                    index=predictions.get("validation_index"),
                )
                self.multiple_target = False
                self.target_columns = "target"
            else:
                # in case of Neural Networks and multi-class classification with one-hot encoding
                self.best_y_predicted[self.learner.uid] = pd.DataFrame(
                    y_validation_true,
                    index=predictions.get("validation_index"))
                self.multiple_target = True
                self.target_columns = y_validation_true.columns

            y_validation_predicted = predictions.get("y_validation_predicted")

            if len(y_validation_predicted.shape) == 1:
                # only one prediction column (binary classification or regression)
                self.best_y_predicted[self.learner.uid]["prediction"] = np.array(
                    y_validation_predicted
                )
            else:
                # several columns in multiclass classification;
                # column names come from validation_columns
                # (previously "prediction_{}".format(i_col))
                cols = predictions.get("validation_columns")
                for i_col in range(y_validation_predicted.shape[1]):
                    self.best_y_predicted[self.learner.uid][cols[i_col]] = (
                        y_validation_predicted[:, i_col]
                    )

            # store sample_weight
            sample_weight_validation = predictions.get("sample_weight_validation")
            if sample_weight_validation is not None:
                self.best_y_predicted[self.learner.uid]["sample_weight"] = np.array(
                    sample_weight_validation
                )

            self.best_models[self.learner.uid] = self.learner.copy()
            # if local copy is not available, save model and keep path
            if self.best_models[self.learner.uid] is None:
                self.best_model_paths[self.learner.uid] = self.learner.save()
        else:
            self.no_improvement_cnt += 1

        if self.no_improvement_cnt > self.max_no_improvement_cnt:
            self.learner.stop_training = True

        logger.info(
            "EarlyStopping.on_iteration_end, train loss: {}, validation loss: {}, "
            "no improvement cnt {}, iters {}".format(
                train_loss,
                validation_loss,
                self.no_improvement_cnt,
                len(self.loss_values[self.learner.uid]["iters"]),
            ))

        if self.log_to_dir is not None and self.learner.algorithm_short_name not in [
                "Xgboost",
                "Random Forest",
                "Extra Trees",
                "LightGBM",
                "CatBoost",
                "Neural Network",
        ]:
            sign = -1.0 if Metric.optimize_negative(self.metric.name) else 1.0
            with open(
                os.path.join(self.log_to_dir, f"{self.learner.name}_training.log"),
                "a",
            ) as fout:
                iteration = len(self.loss_values[self.learner.uid]["iters"])
                fout.write(
                    f"{iteration},{sign*train_loss},{sign*validation_loss}\n")