Example #1
    def fit(self, X, y, X_validation=None, y_validation=None, log_to_file=None):
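        # Auto-detect categorical feature indices when none were provided.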
        if self.cat_features is None:
            self.cat_features = []
            for i in range(X.shape[1]):
                if PreprocessingUtils.is_categorical(X.iloc[:, i]):
                    self.cat_features += [i]

        eval_set = None 
        if X_validation is not None and y_validation is not None:
            eval_set = (X_validation, y_validation)

        self.model.fit(
            X,
            y,
            cat_features=self.cat_features,
            init_model=None if self.model.tree_count_ is None else self.model,
            eval_set=eval_set,
            early_stopping_rounds=self.early_stopping_rounds,
            verbose_eval=False,
        )
        if log_to_file is not None:
            # CatBoost keeps one learning curve per metric in evals_result_;
            # take the first (primary) metric
            metric_name = list(self.model.evals_result_["learn"].keys())[0]
            result = {
                "iteration": range(
                    len(self.model.evals_result_["learn"][metric_name])
                ),
                "train": self.model.evals_result_["learn"][metric_name],
            }
            # the "validation" curve exists only when an eval_set was passed
            if "validation" in self.model.evals_result_:
                result["validation"] = self.model.evals_result_["validation"][metric_name]
            pd.DataFrame(result).to_csv(log_to_file, index=False, header=False)
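The init_model pattern above makes a repeated fit() call continue boosting from the trees already fitted, instead of retraining from scratch. A minimal warm-start sketch in plain catboost (data and tree counts are illustrative):

    import numpy as np
    from catboost import CatBoostRegressor

    X = np.random.rand(100, 4)
    y = np.random.rand(100)

    m1 = CatBoostRegressor(iterations=10, verbose=False)
    m1.fit(X, y)  # first 10 trees
    m2 = CatBoostRegressor(iterations=10, verbose=False)
    m2.fit(X, y, init_model=m1)  # continue boosting from m1
    print(m2.tree_count_)  # 20: the warm-up trees are included in the result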
Example #2
    def compute(X, y, machinelearning_task):

        columns_info = {}
        for col in X.columns:
            columns_info[col] = []
            # check if the column is completely empty
            empty_column = pd.isnull(X[col]).sum() == X.shape[0]
            if empty_column:
                columns_info[col] += ["empty_column"]
                continue
            # check if the column holds a single constant value
            constant_column = len(np.unique(X.loc[~pd.isnull(X[col]), col])) == 1
            if constant_column:
                columns_info[col] += ["constant_column"]
                continue
            # flag columns with missing values
            if PreprocessingUtils.is_na(X[col]):
                columns_info[col] += ["missing_values"]
            # determine the column type and the matching transforms
            if PreprocessingUtils.is_categorical(X[col]):
                columns_info[col] += ["categorical"]
                columns_info[col] += [EncodingSelector.get(X, y, col)]
            elif PreprocessingUtils.is_datetime(X[col]):
                columns_info[col] += ["datetime_transform"]
            elif PreprocessingUtils.is_text(X[col]):
                columns_info[col] = ["text_transform"
                                     ]  # override other transforms
            else:
                # numeric type, check if scale needed
                if PreprocessingUtils.is_scale_needed(X[col]):
                    columns_info[col] += ["scale"]

        target_info = []
        if machinelearning_task == BINARY_CLASSIFICATION:
            if not PreprocessingUtils.is_0_1(y):
                target_info += ["convert_0_1"]

        if machinelearning_task == REGRESSION:
            if PreprocessingUtils.is_log_scale_needed(y):
                target_info += ["scale_log"]
            elif PreprocessingUtils.is_scale_needed(y):
                target_info += ["scale"]

        num_class = None
        if machinelearning_task == MULTICLASS_CLASSIFICATION:
            num_class = PreprocessingUtils.num_class(y)

        return {
            "columns_info": columns_info,
            "target_info": target_info,
            "num_class": num_class,
        }
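The returned dictionary is consumed by downstream preprocessing. A hedged sketch of its shape (the owning class name, here DataInfo, is an assumption; the tags follow the branches above):

    info = DataInfo.compute(X, y, BINARY_CLASSIFICATION)  # class name assumed
    # {
    #     "columns_info": {
    #         "color": ["categorical", ...],  # plus the EncodingSelector.get result
    #         "price": ["scale"],             # numeric column that needs scaling
    #         "notes": ["text_transform"],    # overrides any other transforms
    #     },
    #     "target_info": ["convert_0_1"],     # only if y is not already 0/1
    #     "num_class": None,                  # set only for multiclass tasks
    # }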
Example #3
    def fit(self, X, y):
        if self.cat_features is None:
            self.cat_features = []
            for i in range(X.shape[1]):
                if PreprocessingUtils.is_categorical(X.iloc[:, i]):
                    self.cat_features += [i]

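        # Passing the fitted model as init_model makes a repeated fit() call
        # continue boosting from the existing trees instead of starting over.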
        self.model.fit(
            X,
            y,
            cat_features=self.cat_features,
            init_model=None if self.model.tree_count_ is None else self.model,
        )
Example #4
    def fit(
        self,
        X,
        y,
        sample_weight=None,
        X_validation=None,
        y_validation=None,
        sample_weight_validation=None,
        log_to_file=None,
        max_time=None,
    ):
        if self.is_fitted():
            print("CatBoost model already fitted. Skip fit().")
            return

        if self.cat_features is None:
            self.cat_features = []
            for i in range(X.shape[1]):
                if PreprocessingUtils.is_categorical(X.iloc[:, i]):
                    self.cat_features += [i]

        eval_set = None
        if X_validation is not None and y_validation is not None:
            eval_set = Pool(
                data=X_validation,
                label=y_validation,
                cat_features=self.cat_features,
                weight=sample_weight_validation,
            )

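        # no fixed num_boost_round in params: let _assess_iterations pick an
        # iteration budget that fits into max_time (it also returns a warm-up
        # model to continue boosting from)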
        if self.params.get("num_boost_round") is None:
            model_init, new_iterations = self._assess_iterations(
                X, y, sample_weight, eval_set, max_time)
            self.model.set_params(iterations=new_iterations)
        else:
            model_init = None
            self.model.set_params(
                iterations=self.params.get("num_boost_round"))
            self.early_stopping_rounds = self.params.get(
                "early_stopping_rounds", 50)

        self.model.fit(
            X,
            y,
            sample_weight=sample_weight,
            cat_features=self.cat_features,
            init_model=model_init,
            eval_set=eval_set,
            early_stopping_rounds=self.early_stopping_rounds,
            verbose_eval=False,
        )

        if self.model.best_iteration_ is not None:
            if model_init is not None:
                self.best_ntree_limit = (self.model.best_iteration_ +
                                         model_init.tree_count_ + 1)
            else:
                self.best_ntree_limit = self.model.best_iteration_ + 1
        else:
            # take all the trees; the warm-up trees are already included,
            # so there is no need to add +1
            self.best_ntree_limit = self.model.tree_count_

        if log_to_file is not None:
            train_scores = self.model.evals_result_["learn"].get(
                self.log_metric_name)
            validation_scores = self.model.evals_result_["validation"].get(
                self.log_metric_name)
            if model_init is not None:
                if train_scores is not None:
                    train_scores = (model_init.evals_result_["learn"].get(
                        self.log_metric_name) + train_scores)
                if validation_scores is not None:
                    validation_scores = (
                        model_init.evals_result_["validation"].get(
                            self.log_metric_name) + validation_scores)
            iteration = None
            if train_scores is not None:
                iteration = range(len(train_scores))
            elif validation_scores is not None:
                iteration = range(len(validation_scores))

            result = pd.DataFrame({
                "iteration": iteration,
                "train": train_scores,
                "validation": validation_scores,
            })
            result.to_csv(log_to_file, index=False, header=False)
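Note on the best_ntree_limit arithmetic above: best_iteration_ is zero-based and counts only the new iterations, while the warm-up trees are part of the final model, so both offsets are added. For example, with a 50-tree warm-up model and best_iteration_ == 120, best_ntree_limit = 120 + 50 + 1 = 171.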
Example #5
    def get(required_preprocessing, data, machinelearning_task):

        X = data["train"]["X"]
        y = data["train"]["y"]

        columns_preprocessing = {}
        for col in X.columns:
            preprocessing_to_apply = []

            # remove empty columns and columns with only one variable
            empty_column = pd.isnull(X[col]).sum() == X.shape[0]
            constant_column = len(np.unique(X.loc[~pd.isnull(X[col]), col])) == 1
            if empty_column or constant_column:
                preprocessing_to_apply += ["remove_column"]
                columns_preprocessing[col] = preprocessing_to_apply
                continue

            # always check for missing values
            if (
                "missing_values_inputation" in required_preprocessing
                and PreprocessingUtils.is_na(X[col])
            ):
                preprocessing_to_apply += [PreprocessingMissingValues.FILL_NA_MEDIAN]
            # convert to categorical only for categorical types
            convert_to_integer_will_be_applied = False
            if (
                "convert_categorical" in required_preprocessing
                and PreprocessingUtils.is_categorical(X[col])
            ):
                preprocessing_to_apply += [PreprocessingCategorical.CONVERT_INTEGER]
                convert_to_integer_will_be_applied = True

            if "scale" in required_preprocessing:
                if convert_to_integer_will_be_applied:
                    preprocessing_to_apply += [PreprocessingScale.SCALE_NORMAL]
                # elif PreprocessingUtils.is_log_scale_needed(X[col]):
                #    preprocessing_to_apply += [PreprocessingScale.SCALE_LOG_AND_NORMAL]
                elif PreprocessingUtils.is_scale_needed(X[col]):
                    preprocessing_to_apply += [PreprocessingScale.SCALE_NORMAL]

            # remember which preprocessing steps need to be applied
            if preprocessing_to_apply:
                columns_preprocessing[col] = preprocessing_to_apply

        target_preprocessing = []
        # always remove missing values from the target;
        # they may appear in both the train and the validation datasets
        target_preprocessing += [PreprocessingMissingValues.NA_EXCLUDE]

        if machinelearning_task == BINARY_CLASSIFICATION:
            if not PreprocessingUtils.is_0_1(y):
                target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]

        if machinelearning_task == MULTICLASS_CLASSIFICATION:
            if PreprocessingUtils.is_categorical(y):
                target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]

        if machinelearning_task == REGRESSION:
            if PreprocessingUtils.is_log_scale_needed(y):
                target_preprocessing += [PreprocessingScale.SCALE_LOG_AND_NORMAL]
            elif PreprocessingUtils.is_scale_needed(y):
                target_preprocessing += [PreprocessingScale.SCALE_NORMAL]

        return {
            "columns_preprocessing": columns_preprocessing,
            "target_preprocessing": target_preprocessing,
        }
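A hedged usage sketch (the owning class name, here PreprocessingTuner, is an assumption; the data layout mirrors the function's own access pattern):

    plan = PreprocessingTuner.get(  # class name assumed
        required_preprocessing=["missing_values_inputation",
                                "convert_categorical", "scale"],
        data={"train": {"X": X, "y": y}},
        machinelearning_task=BINARY_CLASSIFICATION,
    )
    # plan["columns_preprocessing"] maps column name -> list of transforms
    # plan["target_preprocessing"] always starts with NA_EXCLUDE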
Example #6
    def fit(self,
            X,
            y,
            sample_weight=None,
            X_validation=None,
            y_validation=None,
            sample_weight_validation=None,
            log_to_file=None,
            max_time=None):
        if self.model.tree_count_ is not None:
            print("CatBoost model already fitted. Skip fit().")
            return

        if self.cat_features is None:
            self.cat_features = []
            for i in range(X.shape[1]):
                if PreprocessingUtils.is_categorical(X.iloc[:, i]):
                    self.cat_features += [i]

        eval_set = None
        if X_validation is not None and y_validation is not None:
            eval_set = Pool(
                data=X_validation,
                label=y_validation,
                cat_features=self.cat_features,
                weight=sample_weight_validation,
            )

        # estimate how many iterations fit in max_time and get a warm-up model
        model_init, new_iterations = self._assess_iterations(
            X, y, eval_set, max_time)
        self.model.set_params(iterations=new_iterations)

        self.model.fit(X,
                       y,
                       sample_weight=sample_weight,
                       cat_features=self.cat_features,
                       init_model=model_init,
                       eval_set=eval_set,
                       early_stopping_rounds=self.early_stopping_rounds,
                       verbose_eval=False)
        if self.model.best_iteration_ is not None:
            self.best_ntree_limit = self.model.best_iteration_ + self.warmup_iterations + 1
        else:
            # take all the trees; the warm-up trees are already included,
            # so there is no need to add +1
            self.best_ntree_limit = self.model.tree_count_

        if log_to_file is not None:

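            # evals_result_ on the refit model covers only the new iterations;
            # prepend the warm-up model's curve so the log spans all trees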
            metric_name = list(self.model.evals_result_["learn"].keys())[0]
            train_scores = self.model.evals_result_["learn"][metric_name]
            # the "validation" curve exists only when an eval_set was passed
            validation_scores = self.model.evals_result_.get(
                "validation", {}).get(metric_name)
            if model_init is not None:
                train_scores = model_init.evals_result_["learn"][
                    metric_name] + train_scores
                if validation_scores is not None:
                    validation_scores = model_init.evals_result_["validation"][
                        metric_name] + validation_scores

            result = pd.DataFrame({
                "iteration": range(len(train_scores)),
                "train": train_scores,
                "validation": validation_scores,
            })
            result.to_csv(log_to_file, index=False, header=False)
Example #7
    def get(required_preprocessing, data, machinelearning_task):

        X = data["train"]["X"]
        y = data["train"]["y"]

        columns_preprocessing = {}
        for col in X.columns:
            preprocessing_to_apply = []

            # remove empty columns and columns with only one variable
            empty_column = pd.isnull(X[col]).sum() == X.shape[0]
            constant_column = len(np.unique(X.loc[~pd.isnull(X[col]), col])) == 1
            if empty_column or constant_column:
                preprocessing_to_apply += ["remove_column"]
                columns_preprocessing[col] = preprocessing_to_apply
                continue

            # always check for missing values
            if (
                "missing_values_inputation" in required_preprocessing
                and PreprocessingUtils.is_na(X[col])
            ):
                preprocessing_to_apply += [PreprocessingMissingValues.FILL_NA_MEDIAN]
            # convert to categorical only for categorical types
            convert_to_integer_will_be_applied = False
            if (
                "convert_categorical" in required_preprocessing
                and PreprocessingUtils.is_categorical(X[col])
            ):
                preprocessing_to_apply += [PreprocessingCategorical.CONVERT_INTEGER]
                convert_to_integer_will_be_applied = True

            if "scale" in required_preprocessing:
                if convert_to_integer_will_be_applied:
                    preprocessing_to_apply += [Scale.SCALE_NORMAL]
                # elif PreprocessingUtils.is_log_scale_needed(X[col]):
                #    preprocessing_to_apply += [Scale.SCALE_LOG_AND_NORMAL]
                elif PreprocessingUtils.is_scale_needed(X[col]):
                    preprocessing_to_apply += [Scale.SCALE_NORMAL]

            # remember which preprocessing steps need to be applied
            if preprocessing_to_apply:
                columns_preprocessing[col] = preprocessing_to_apply

        target_preprocessing = []
        # always remove missing values from the target;
        # they may appear in both the train and the validation datasets
        target_preprocessing += [PreprocessingMissingValues.NA_EXCLUDE]

        if "target_as_integer" in required_preprocessing:
            if machinelearning_task == BINARY_CLASSIFICATION:
                if not PreprocessingUtils.is_0_1(y):
                    target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]

            if machinelearning_task == MULTICLASS_CLASSIFICATION:
                # if PreprocessingUtils.is_categorical(y):
                # always convert to integers; many situations can break otherwise,
                # for example classes starting from 1,
                # or gaps in the numbering such as 0, 2, 3, 4 - just always convert
                target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]

        elif "target_as_one_hot" in required_preprocessing:
            target_preprocessing += [PreprocessingCategorical.CONVERT_ONE_HOT]

        if (
            machinelearning_task == REGRESSION
            and "target_scale" in required_preprocessing
        ):
            if PreprocessingUtils.is_log_scale_needed(y):
                target_preprocessing += [Scale.SCALE_LOG_AND_NORMAL]
            elif PreprocessingUtils.is_scale_needed(y):
                target_preprocessing += [Scale.SCALE_NORMAL]

        """    
        if machinelearning_task == BINARY_CLASSIFICATION:
            if not PreprocessingUtils.is_0_1(y):
                target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]

        if machinelearning_task == MULTICLASS_CLASSIFICATION:
            if PreprocessingUtils.is_categorical(y):
                target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]

        """
        return {
            "columns_preprocessing": columns_preprocessing,
            "target_preprocessing": target_preprocessing,
            "ml_task": machinelearning_task,
        }
Example #8
    def optimize(
        self,
        algorithm,
        data_type,
        X_train,
        y_train,
        sample_weight,
        X_validation,
        y_validation,
        sample_weight_validation,
        learner_params,
    ):
        # only tune models with original data type
        if data_type != "original":
            return learner_params

        key = f"{data_type}_{algorithm}"
        if key in self.tuning:
            return self.update_learner_params(learner_params, self.tuning[key])

        if self.verbose:
            print(
                f"Optuna optimizes {algorithm} with time budget {self.time_budget} seconds "
                f"eval_metric {self.eval_metric.name} ({self.direction})")

        self.cat_features_indices = []
        for i in range(X_train.shape[1]):
            if PreprocessingUtils.is_categorical(X_train.iloc[:, i]):
                self.cat_features_indices += [i]

        study = optuna.create_study(
            direction=self.direction,
            sampler=optuna.samplers.TPESampler(seed=self.random_state),
            pruner=optuna.pruners.MedianPruner(
                n_warmup_steps=self.n_warmup_steps),
        )
        objective = None
        if algorithm == "LightGBM":
            objective = LightgbmObjective(
                self.ml_task,
                X_train,
                y_train,
                sample_weight,
                X_validation,
                y_validation,
                sample_weight_validation,
                self.eval_metric,
                self.cat_features_indices,
                self.n_jobs,
                self.random_state,
            )
        elif algorithm == "Xgboost":
            objective = XgboostObjective(
                self.ml_task,
                X_train,
                y_train,
                sample_weight,
                X_validation,
                y_validation,
                sample_weight_validation,
                self.eval_metric,
                self.n_jobs,
                self.random_state,
            )
        elif algorithm == "CatBoost":
            objective = CatBoostObjective(
                self.ml_task,
                X_train,
                y_train,
                sample_weight,
                X_validation,
                y_validation,
                sample_weight_validation,
                self.eval_metric,
                self.cat_features_indices,
                self.n_jobs,
                self.random_state,
            )
        elif algorithm == "Random Forest":
            objective = RandomForestObjective(
                self.ml_task,
                X_train,
                y_train,
                sample_weight,
                X_validation,
                y_validation,
                sample_weight_validation,
                self.eval_metric,
                self.n_jobs,
                self.random_state,
            )
        elif algorithm == "Extra Trees":
            objective = ExtraTreesObjective(
                self.ml_task,
                X_train,
                y_train,
                sample_weight,
                X_validation,
                y_validation,
                sample_weight_validation,
                self.eval_metric,
                self.n_jobs,
                self.random_state,
            )
        elif algorithm == "Nearest Neighbors":
            objective = KNNObjective(
                self.ml_task,
                X_train,
                y_train,
                sample_weight,
                X_validation,
                y_validation,
                sample_weight_validation,
                self.eval_metric,
                self.n_jobs,
                self.random_state,
            )
        elif algorithm == "Neural Network":
            objective = NeuralNetworkObjective(
                self.ml_task,
                X_train,
                y_train,
                sample_weight,
                X_validation,
                y_validation,
                sample_weight_validation,
                self.eval_metric,
                self.n_jobs,
                self.random_state,
            )

        if objective is None:
            # unknown algorithm name; nothing to tune
            return learner_params

        study.optimize(objective, n_trials=5000, timeout=self.time_budget)

        self.plot_study(algorithm, data_type, study)

        joblib.dump(study, os.path.join(self.study_dir, key + ".joblib"))

        best = study.best_params

        if algorithm == "LightGBM":
            best["metric"] = objective.eval_metric_name
            best["custom_eval_metric_name"] = objective.custom_eval_metric_name
            best["num_boost_round"] = objective.rounds
            best["early_stopping_rounds"] = objective.early_stopping_rounds
            # best["learning_rate"] = objective.learning_rate
            best["cat_feature"] = self.cat_features_indices
            best["feature_pre_filter"] = False
            best["seed"] = objective.seed
        elif algorithm == "CatBoost":
            best["eval_metric"] = objective.eval_metric_name
            best["num_boost_round"] = objective.rounds
            best["early_stopping_rounds"] = objective.early_stopping_rounds
            # best["bootstrap_type"] = "Bernoulli"
            # best["learning_rate"] = objective.learning_rate
            best["seed"] = objective.seed
        elif algorithm == "Xgboost":
            best["objective"] = objective.objective
            best["eval_metric"] = objective.eval_metric_name
            # best["eta"] = objective.learning_rate
            best["max_rounds"] = objective.rounds
            best["early_stopping_rounds"] = objective.early_stopping_rounds
            best["seed"] = objective.seed
        elif algorithm == "Extra Trees":
            # Extra Trees are not using early stopping
            best["max_steps"] = objective.max_steps  # each step has 100 trees
            best["seed"] = objective.seed
            best["eval_metric_name"] = self.eval_metric.name
        elif algorithm == "Random Forest":
            # Random Forest is not using early stopping
            best["max_steps"] = objective.max_steps  # each step has 100 trees
            best["seed"] = objective.seed
            best["eval_metric_name"] = self.eval_metric.name
        elif algorithm == "Nearest Neighbors":
            best["rows_limit"] = 100000
        elif algorithm == "Neural Network":
            pass

        self.tuning[key] = best
        self.save()

        return self.update_learner_params(learner_params, best)
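The if/elif ladder that builds the objective could equally be written as a lookup table; a sketch under the assumption that the constructor signatures are exactly those shown above (only LightGBM and CatBoost take the categorical feature indices):

    # algorithm name -> (objective class, whether it takes cat feature indices)
    OBJECTIVES = {
        "LightGBM": (LightgbmObjective, True),
        "Xgboost": (XgboostObjective, False),
        "CatBoost": (CatBoostObjective, True),
        "Random Forest": (RandomForestObjective, False),
        "Extra Trees": (ExtraTreesObjective, False),
        "Nearest Neighbors": (KNNObjective, False),
        "Neural Network": (NeuralNetworkObjective, False),
    }

    cls, needs_cat = OBJECTIVES[algorithm]
    args = [self.ml_task, X_train, y_train, sample_weight,
            X_validation, y_validation, sample_weight_validation,
            self.eval_metric]
    if needs_cat:
        args.append(self.cat_features_indices)
    objective = cls(*args, self.n_jobs, self.random_state)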