示例#1
0
    def fit(self, X_train, y_train):
        """
        Fit the estimator returned by ``_model()`` on the training data.

        The raw inputs are first passed through ``_dtrain``; the
        processed features/targets are stored on the instance as
        ``X_train_`` and ``y_train_``, the fitted estimator as ``model_``.

        Parameters
        ----------
        X_train: numpy.array or pandas.DataFrame
            Training features data

        y_train: numpy.array[int] or list[int]
            List of training ground truth binary values [0, 1]

        Returns
        -------
        None
        """
        # preprocess the inputs, then build a fresh (unfitted) estimator
        processed = self._dtrain(X_train, y_train)
        self.X_train_, self.y_train_ = processed
        self.model_ = self._model()

        # with sparse_matrix enabled the frame is converted to CSR first
        # (missing values filled with 0.0); otherwise it is used as-is
        design_matrix = (
            df_to_csr(self.X_train_, fillna=0.0, verbose=False)
            if self.sparse_matrix
            else self.X_train_
        )
        self.model_.fit(design_matrix, self.y_train_)

        # derive post-fit attributes from the trained estimator
        self._prep_attributes()

        return None
    def _dtrain(self, X_train, y_train):
        """
        Function to return dtrain matrix based on
        input parameters including sparse_matrix,
        and scaled using both numpy array and pandas
        DataFrame.
        Parameters
        ----------
        X_train: numpy.array or Pandas DataFrame
            Training features data
        y_train: numpy.array[int] or list[int]
            List of training ground truth binary values [0, 1]
        """
        if isinstance(X_train, np.ndarray):
            self.X_train = pd.DataFrame(
                X_train, columns=[f"F_{i}" for i in range(X_train.shape[1])])
        elif isinstance(X_train, pd.DataFrame):
            self.X_train = X_train
        else:
            raise TypeError(
                "The input X_train must be numpy array or pandas DataFrame.")

        if isinstance(y_train, np.ndarray) or isinstance(y_train, list):
            self.y_train = y_train
        else:
            raise TypeError("The input y_train must be numpy array or list.")
        self.y_train = y_train

        if self.sparse_matrix and self.scale_mean:
            raise ValueError(
                "The scale_mean should be False in conjuction of using sparse_matrix=True."
            )

        if self.scale_mean or self.scale_std:
            self.scaler_ = StandardScaler(with_mean=self.scale_mean,
                                          with_std=self.scale_std)
            self.X_train_ = pd.DataFrame(
                self.scaler_.fit_transform(self.X_train),
                columns=self.X_train.columns.tolist(),
            )
        else:
            self.X_train_ = self.X_train.copy()

        if not self.sparse_matrix:
            dtrain = xgb.DMatrix(data=self.X_train_, label=self.y_train)
        else:
            dtrain = xgb.DMatrix(
                data=df_to_csr(self.X_train_, fillna=0.0, verbose=False),
                label=self.y_train,
                feature_names=self.X_train_.columns.tolist(),
            )

        return dtrain
    def _dtest(self, X_test, y_test):
        """
        Functio to return dtest matrix based on
        input X_test, y_test including sparse_matrix,
        and scaled using both numpy array and pandas
        DataFrame. It does apply scaler transformation
        in case it was used.
        Parameters
        ----------
        X_test: numpy.array or Pandas DataFrame
            Testing/validation features data
        y_test: numpy.array[int] or list[int]
            List of testing/validation ground truth binary values [0, 1]
        """
        if isinstance(X_test, np.ndarray):
            self.X_test = pd.DataFrame(
                X_test, columns=[f"F_{i}" for i in range(X_test.shape[1])])
        elif isinstance(X_test, pd.DataFrame):
            self.X_test = X_test
        else:
            raise TypeError(
                "The input X_test must be numpy array or pandas DataFrame.")

        if isinstance(y_test, np.ndarray) or isinstance(y_test, list):
            self.y_test = y_test
        else:
            raise TypeError("The input y_test must be numpy array or list.")
        self.y_test = y_test

        if self.scale_mean or self.scale_std:
            self.X_test_ = pd.DataFrame(
                self.scaler_.transform(self.X_test),
                columns=self.X_test.columns.tolist(),
            )
        else:
            self.X_test_ = self.X_test.copy()

        if not self.sparse_matrix:
            dtest = xgb.DMatrix(data=self.X_test_, label=self.y_test)
        else:
            dtest = xgb.DMatrix(
                data=df_to_csr(self.X_test_, fillna=0.0, verbose=False),
                label=self.y_test,
                feature_names=self.X_test_.columns.tolist(),
            )

        return dtest
    def fit(self, X, y):
        """
        Function to fit the main feature selection algorithm,
        and run the selection process.

        For each of ``n_iter`` iterations the features are passed through
        ``noisy_features`` and split with ``StratifiedKFold``. For every
        fold the internal CV (``_cv``) and the final booster (``_bst``)
        are run, the booster's feature importance is stored in
        ``feature_importance_``, and every feature whose importance is
        strictly greater than the gain of the n-th feature whose name
        contains "noisy" is appended to ``pruned_features``. Per-fold
        scores are accumulated in ``cv_results_``; progress is printed.

        Parameters
        ----------
        X: numpy.array or Pandas DataFrame
            Features data
        y: numpy.array[int] or list[int]
            List of ground truth binary values [0, 1]

        Returns
        -------
        None

        Raises
        ------
        TypeError
            If X is neither a numpy array nor a pandas DataFrame, or y is
            neither a numpy array nor a list.
        """
        if isinstance(X, np.ndarray):
            self.X = pd.DataFrame(
                X, columns=[f"F_{i}" for i in range(X.shape[1])])
        elif isinstance(X, pd.DataFrame):
            self.X = X
        else:
            raise TypeError(
                "The input X must be numpy array or pandas DataFrame.")

        if not isinstance(y, (np.ndarray, list)):
            raise TypeError("The input y must be numpy array or list.")
        self.y = y

        # A plain list cannot be indexed with the fold index arrays produced
        # by the splitter, so the folds are cut from an array view of y.
        labels = np.asarray(self.y)

        def _to_dmatrix(frame, target):
            # dense frame, or CSR conversion with explicit feature names
            if not self.sparse_matrix:
                return xgb.DMatrix(data=frame, label=target)
            return xgb.DMatrix(
                data=df_to_csr(frame, fillna=0.0, verbose=False),
                label=target,
                feature_names=frame.columns.tolist(),
            )

        # final results dict + list
        self.cv_results_ = {
            "int_cv_train": [],
            "int_cv_test": [],
            "ext_cv_train": [],
            "ext_cv_test": [],
        }
        self.pruned_features = []
        self.feature_importance_ = {}

        # main loop
        for iteration in range(self.n_iter):
            print(Color.BOLD + "*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-* " +
                  Color.B_Green + f"Iteration {iteration + 1}" + Color.END +
                  Color.BOLD + " *-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*")

            # results at each iteration
            int_cv_train2 = []
            int_cv_test2 = []
            ext_cv_train2 = []
            ext_cv_test2 = []

            # update random state
            # NOTE(review): the product is 0 on the first iteration for any
            # random_state, and 0 on every iteration when random_state == 0.
            # Kept as-is so existing runs stay reproducible.
            self.random_state_ = self.random_state * iteration

            # adding noise to data
            X_permuted = noisy_features(X=self.X,
                                        random_state=self.random_state_)
            cols = X_permuted.columns.tolist()
            Xval = X_permuted.values

            # building DMatrix for training/testing + kfolds cv
            # scikit-learn rejects a non-None random_state when shuffle is
            # False, so the seed is only forwarded when it has an effect.
            cv = StratifiedKFold(
                n_splits=self.n_splits,
                shuffle=self.shuffle,
                random_state=self.random_state_ if self.shuffle else None,
            )

            for fold, (train_index, test_index) in enumerate(
                    cv.split(Xval, labels), start=1):
                X_train = pd.DataFrame(data=Xval[train_index], columns=cols)
                X_test = pd.DataFrame(data=Xval[test_index], columns=cols)
                Y_train = labels[train_index]
                Y_test = labels[test_index]

                self.dtrain = _to_dmatrix(X_train, Y_train)
                self.dtest = _to_dmatrix(X_test, Y_test)

                # watchlist during final training
                self.watchlist = [(self.dtrain, "train"), (self.dtest, "eval")]

                # dict to store training results
                self.evals_result = {}

                # calling xgb cv
                self.cvr = self._cv()

                # last row of the cv table; columns 0 and 2 are read
                # positionally (presumably the train/test mean columns --
                # confirm against _cv). Two-axis iloc avoids the deprecated
                # positional lookup on a label-indexed row.
                int_train_score = self.cvr.iloc[-1, 0]
                int_test_score = self.cvr.iloc[-1, 2]

                # appending cv results
                self.cv_results_["int_cv_train"].append(int_train_score)
                self.cv_results_["int_cv_test"].append(int_test_score)

                # appending temp cv results
                int_cv_train2.append(int_train_score)
                int_cv_test2.append(int_test_score)

                # xgb train best model
                bst = self._bst()

                # feature gain
                feature_gain = self._xgb_imp_to_df(bst)
                self.feature_importance_[
                    f"bst_iter{iteration+1}_fold{fold}"] = feature_gain

                # gain of the n-th noisy feature present in the importance
                # table; when the table holds fewer noisy features than
                # requested (including none) fall back to 0.0 instead of
                # raising IndexError.
                noisy_mask = feature_gain["feature"].str.contains("noisy")
                noisy_gains = feature_gain.loc[
                    noisy_mask, self.importance_type].values.tolist()
                try:
                    gain_threshold = noisy_gains[self.nth_noise_threshold - 1]
                except IndexError:
                    gain_threshold = 0.0

                # subsetting features for > gain_threshold
                gain_subset = feature_gain.loc[
                    feature_gain[self.importance_type] > gain_threshold,
                    "feature"].values.tolist()
                self.pruned_features.extend(gain_subset)

                # final eval results of this fold (last recorded value);
                # evals_result is read only after _bst() has run
                metric = self.params["eval_metric"]
                ext_train_score = self.evals_result["train"][metric][-1]
                ext_test_score = self.evals_result["eval"][metric][-1]

                # appending final eval results
                self.cv_results_["ext_cv_train"].append(ext_train_score)
                self.cv_results_["ext_cv_test"].append(ext_test_score)

                # appending temp eval results
                ext_cv_train2.append(ext_train_score)
                ext_cv_test2.append(ext_test_score)

                print(
                    Color.BOLD + "*-*-*-*-*-*-*-*-*-*-*-* " + Color.F_Green +
                    f"Fold = {fold}/{self.n_splits}" + Color.F_Black + " -- " +
                    Color.F_Red +
                    f"Train {metric.upper()}" + " = " +
                    f"{ext_train_score:.3f}"
                    + Color.F_Black + " -- " + Color.F_Blue +
                    f"Test {metric.upper()}" + " = " +
                    f"{ext_test_score:.3f}"
                    + Color.END + Color.BOLD + " *-*-*-*-*-*-*-*-*-*-*-*")

                # free memory here at each fold (this also removes the
                # per-fold instance attributes, as before)
                del (
                    gain_subset,
                    noisy_gains,
                    noisy_mask,
                    feature_gain,
                    bst,
                    self.watchlist,
                    Y_train,
                    Y_test,
                    self.cvr,
                    self.evals_result,
                    X_train,
                    X_test,
                    self.dtrain,
                    self.dtest,
                )
                gc.collect()

            # print internal metrics results
            print(Color.BOLD + "*-*-* " + Color.GREEN +
                  f"Internal {self.n_splits}-Folds CV:" + Color.END +
                  Color.BOLD + " -*-*- " + Color.F_Red +
                  f"Train {self.metrics.upper()}" + " = " +
                  f"{np.mean(int_cv_train2):.3f}" + " +/- " +
                  f"{np.std(int_cv_train2):.3f}" + Color.END + Color.BOLD +
                  " -*-*- " + Color.F_Blue + f"Test {self.metrics.upper()}" +
                  " = " + f"{np.mean(int_cv_test2):.3f}" + " +/- " +
                  f"{np.std(int_cv_test2):.3f}" + Color.END + Color.BOLD +
                  " *-*-*")

            #  print external eval_metric results
            print(Color.BOLD + "*-*-* " + Color.GREEN +
                  f"External {self.n_splits}-Folds CV:" + Color.END +
                  Color.BOLD + " -*-*- " + Color.F_Red +
                  f"Train {self.params['eval_metric'].upper()}" + " = " +
                  f"{np.mean(ext_cv_train2):.3f}" + " +/- " +
                  f"{np.std(ext_cv_train2):.3f}" + Color.END + Color.BOLD +
                  " -*-*- " + Color.F_Blue +
                  f"Test {self.params['eval_metric'].upper()}" + " = " +
                  f"{np.mean(ext_cv_test2):.3f}" + " +/- " +
                  f"{np.std(ext_cv_test2):.3f}" + Color.END + Color.BOLD +
                  " *-*-*\n")

            # free memory here at iteration
            del (
                int_cv_train2,
                int_cv_test2,
                ext_cv_train2,
                ext_cv_test2,
                X_permuted,
                cols,
                Xval,
                cv,
            )
            gc.collect()

        # calling function to get plotting cv results attribute
        self.plotting_cv_ = self.get_plotting_cv()

        # pruned features freq
        self.feature_frequency_ = self._freq()

        return None