def fit(self, X_train, y_train):
    """
    Function to initialize and train an ElasticNet model using (X_train, y_train).

    Parameters
    ----------
    X_train: numpy.array or pandas.DataFrame
        Training features data

    y_train: numpy.array[int] or list[int]
        List of training ground truth binary values [0, 1]
    """
    # preprocessing X, y
    self.X_train_, self.y_train_ = self._dtrain(X_train, y_train)

    # initialize model
    self.model_ = self._model()

    # train model
    if self.sparse_matrix:
        self.model_.fit(
            df_to_csr(self.X_train_, fillna=0.0, verbose=False),
            self.y_train_,
        )
    else:
        self.model_.fit(self.X_train_, self.y_train_)

    # prep attributes
    self._prep_attributes()

    return None
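# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): a minimal stand-in for what `fit` above
# does internally. `self._model()` and `df_to_csr` are the project's own
# pieces; the scikit-learn estimator and scipy CSR conversion below are
# assumptions chosen to mimic the elastic-net + sparse-matrix path, not the
# project's actual implementation.
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression

X_train = pd.DataFrame(
    np.random.rand(100, 5), columns=[f"F_{i}" for i in range(5)]
)
y_train = np.random.randint(0, 2, size=100)

# an elastic-net penalized classifier as a stand-in for `self._model()`
model = LogisticRegression(
    penalty="elasticnet", solver="saga", l1_ratio=0.5, max_iter=1000
)

# sparse-matrix path: fill missing values, convert to CSR, then fit
model.fit(csr_matrix(X_train.fillna(0.0).values), y_train)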
def _dtrain(self, X_train, y_train):
    """
    Function to return the dtrain matrix based on the input parameters,
    including sparse_matrix and the scaling options. It accepts both
    numpy arrays and pandas DataFrames.

    Parameters
    ----------
    X_train: numpy.array or pandas.DataFrame
        Training features data

    y_train: numpy.array[int] or list[int]
        List of training ground truth binary values [0, 1]
    """
    if isinstance(X_train, np.ndarray):
        self.X_train = pd.DataFrame(
            X_train, columns=[f"F_{i}" for i in range(X_train.shape[1])]
        )
    elif isinstance(X_train, pd.DataFrame):
        self.X_train = X_train
    else:
        raise TypeError(
            "The input X_train must be numpy array or pandas DataFrame."
        )

    if isinstance(y_train, np.ndarray) or isinstance(y_train, list):
        self.y_train = y_train
    else:
        raise TypeError("The input y_train must be numpy array or list.")

    if self.sparse_matrix and self.scale_mean:
        raise ValueError(
            "The scale_mean should be False in conjunction with sparse_matrix=True."
        )

    if self.scale_mean or self.scale_std:
        self.scaler_ = StandardScaler(
            with_mean=self.scale_mean, with_std=self.scale_std
        )
        self.X_train_ = pd.DataFrame(
            self.scaler_.fit_transform(self.X_train),
            columns=self.X_train.columns.tolist(),
        )
    else:
        self.X_train_ = self.X_train.copy()

    if not self.sparse_matrix:
        dtrain = xgb.DMatrix(data=self.X_train_, label=self.y_train)
    else:
        dtrain = xgb.DMatrix(
            data=df_to_csr(self.X_train_, fillna=0.0, verbose=False),
            label=self.y_train,
            feature_names=self.X_train_.columns.tolist(),
        )

    return dtrain
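# ---------------------------------------------------------------------------
# Minimal sketch of the sparse-matrix branch in `_dtrain` above. The project
# helper `df_to_csr` is replaced here by scipy's `csr_matrix`, which is an
# assumption about its behavior (DataFrame -> CSR); everything else uses the
# public xgboost/scikit-learn APIs the method relies on.
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.sparse import csr_matrix
from sklearn.preprocessing import StandardScaler

X = pd.DataFrame(np.random.rand(100, 4), columns=[f"F_{i}" for i in range(4)])
y = np.random.randint(0, 2, size=100)

# with sparse data only std-scaling is allowed (with_mean=False), which
# mirrors the scale_mean/sparse_matrix check in `_dtrain`
scaler = StandardScaler(with_mean=False, with_std=True)
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns.tolist())

dtrain = xgb.DMatrix(
    data=csr_matrix(X_scaled.values),
    label=y,
    feature_names=X_scaled.columns.tolist(),
)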
def _dtest(self, X_test, y_test):
    """
    Function to return the dtest matrix based on the input X_test, y_test,
    including sparse_matrix. It accepts both numpy arrays and pandas
    DataFrames, and applies the scaler transformation in case it was used
    during training.

    Parameters
    ----------
    X_test: numpy.array or pandas.DataFrame
        Testing/validation features data

    y_test: numpy.array[int] or list[int]
        List of testing/validation ground truth binary values [0, 1]
    """
    if isinstance(X_test, np.ndarray):
        self.X_test = pd.DataFrame(
            X_test, columns=[f"F_{i}" for i in range(X_test.shape[1])]
        )
    elif isinstance(X_test, pd.DataFrame):
        self.X_test = X_test
    else:
        raise TypeError(
            "The input X_test must be numpy array or pandas DataFrame."
        )

    if isinstance(y_test, np.ndarray) or isinstance(y_test, list):
        self.y_test = y_test
    else:
        raise TypeError("The input y_test must be numpy array or list.")

    if self.scale_mean or self.scale_std:
        self.X_test_ = pd.DataFrame(
            self.scaler_.transform(self.X_test),
            columns=self.X_test.columns.tolist(),
        )
    else:
        self.X_test_ = self.X_test.copy()

    if not self.sparse_matrix:
        dtest = xgb.DMatrix(data=self.X_test_, label=self.y_test)
    else:
        dtest = xgb.DMatrix(
            data=df_to_csr(self.X_test_, fillna=0.0, verbose=False),
            label=self.y_test,
            feature_names=self.X_test_.columns.tolist(),
        )

    return dtest
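# ---------------------------------------------------------------------------
# Sketch of why `_dtest` calls `transform` rather than `fit_transform`: the
# scaler fitted in `_dtrain` is reused as-is, so the test features are scaled
# with the training statistics and no information leaks from the test fold.
# Variable names below are illustrative only.
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

cols = [f"F_{i}" for i in range(4)]
X_train = pd.DataFrame(np.random.rand(100, 4), columns=cols)
X_test = pd.DataFrame(np.random.rand(30, 4), columns=cols)

scaler = StandardScaler(with_mean=True, with_std=True)
X_train_ = pd.DataFrame(scaler.fit_transform(X_train), columns=cols)
X_test_ = pd.DataFrame(scaler.transform(X_test), columns=cols)  # no re-fit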
def fit(self, X, y):
    """
    Function to fit the main feature selection algorithm and run the
    selection process.

    Parameters
    ----------
    X: numpy.array or pandas.DataFrame
        Features data

    y: numpy.array[int] or list[int]
        List of ground truth binary values [0, 1]
    """
    if isinstance(X, np.ndarray):
        self.X = pd.DataFrame(
            X, columns=[f"F_{i}" for i in range(X.shape[1])]
        )
    elif isinstance(X, pd.DataFrame):
        self.X = X
    else:
        raise TypeError(
            "The input X must be numpy array or pandas DataFrame."
        )

    if isinstance(y, np.ndarray) or isinstance(y, list):
        self.y = y
    else:
        raise TypeError("The input y must be numpy array or list.")

    # final results dict + list
    self.cv_results_ = {}
    self.cv_results_["int_cv_train"] = []
    self.cv_results_["int_cv_test"] = []
    self.cv_results_["ext_cv_train"] = []
    self.cv_results_["ext_cv_test"] = []
    self.pruned_features = []
    self.feature_importance_ = {}

    # main loop
    for iteration in range(self.n_iter):
        print(
            Color.BOLD
            + "*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-* "
            + Color.B_Green
            + f"Iteration {iteration + 1}"
            + Color.END
            + Color.BOLD
            + " *-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*"
        )

        # results at each iteration
        int_cv_train2 = []
        int_cv_test2 = []
        ext_cv_train2 = []
        ext_cv_test2 = []

        # update random state
        self.random_state_ = self.random_state * iteration

        # adding noise to data
        X_permuted = noisy_features(X=self.X, random_state=self.random_state_)
        cols = X_permuted.columns.tolist()
        Xval = X_permuted.values

        # building DMatrix for training/testing + kfolds cv
        cv = StratifiedKFold(
            n_splits=self.n_splits,
            shuffle=self.shuffle,
            random_state=self.random_state_,
        )

        # set a counter for nfolds cv
        ijk = 1
        for train_index, test_index in cv.split(Xval, self.y):
            X_train = pd.DataFrame(data=Xval[train_index], columns=cols)
            X_test = pd.DataFrame(data=Xval[test_index], columns=cols)
            Y_train = self.y[train_index]
            Y_test = self.y[test_index]

            if not self.sparse_matrix:
                self.dtrain = xgb.DMatrix(data=X_train, label=Y_train)
                self.dtest = xgb.DMatrix(data=X_test, label=Y_test)
            else:
                self.dtrain = xgb.DMatrix(
                    data=df_to_csr(X_train, fillna=0.0, verbose=False),
                    label=Y_train,
                    feature_names=X_train.columns.tolist(),
                )
                self.dtest = xgb.DMatrix(
                    data=df_to_csr(X_test, fillna=0.0, verbose=False),
                    label=Y_test,
                    feature_names=X_test.columns.tolist(),
                )

            # watchlist during final training
            self.watchlist = [(self.dtrain, "train"), (self.dtest, "eval")]

            # dict to store training results
            self.evals_result = {}

            # calling xgb cv
            self.cvr = self._cv()

            # appending cv results
            self.cv_results_["int_cv_train"] += [self.cvr.iloc[-1][0]]
            self.cv_results_["int_cv_test"] += [self.cvr.iloc[-1][2]]

            # appending temp cv results
            int_cv_train2.append(self.cvr.iloc[-1][0])
            int_cv_test2.append(self.cvr.iloc[-1][2])

            # xgb train best model
            bst = self._bst()

            # feature gain
            feature_gain = self._xgb_imp_to_df(bst)
            self.feature_importance_[
                f"bst_iter{iteration+1}_fold{ijk}"
            ] = feature_gain

            # check whether a noisy feature is selected
            if feature_gain["feature"].str.contains("noisy").sum() != 0:
                gain_threshold = feature_gain.loc[
                    feature_gain["feature"].str.contains("noisy"),
                    self.importance_type,
                ].values.tolist()[self.nth_noise_threshold - 1]
            else:
                gain_threshold = 0.0

            # subsetting features for > gain_threshold
            gain_subset = feature_gain.loc[
                feature_gain[self.importance_type] > gain_threshold, "feature"
            ].values.tolist()
            for c in gain_subset:
                self.pruned_features.append(c)

            # appending final eval results
            self.cv_results_["ext_cv_train"] += [
                self.evals_result["train"][self.params["eval_metric"]][-1]
            ]
self.cv_results_["ext_cv_test"] += [ self.evals_result["eval"][self.params["eval_metric"]][-1] ] # appending temp eval results ext_cv_train2.append( self.evals_result["train"][self.params["eval_metric"]][-1]) ext_cv_test2.append( self.evals_result["eval"][self.params["eval_metric"]][-1]) print( Color.BOLD + "*-*-*-*-*-*-*-*-*-*-*-* " + Color.F_Green + f"Fold = {ijk}/{self.n_splits}" + Color.F_Black + " -- " + Color.F_Red + f"Train {self.params['eval_metric'].upper()}" + " = " + f"{self.evals_result['train'][self.params['eval_metric']][-1]:.3f}" + Color.F_Black + " -- " + Color.F_Blue + f"Test {self.params['eval_metric'].upper()}" + " = " + f"{self.evals_result['eval'][self.params['eval_metric']][-1]:.3f}" + Color.END + Color.BOLD + " *-*-*-*-*-*-*-*-*-*-*-*") # free memory here at each fold del ( gain_subset, feature_gain, bst, self.watchlist, Y_train, Y_test, self.cvr, self.evals_result, X_train, X_test, self.dtrain, self.dtest, ) ijk += 1 gc.collect() # print internal metrics results print(Color.BOLD + "*-*-* " + Color.GREEN + f"Internal {self.n_splits}-Folds CV:" + Color.END + Color.BOLD + " -*-*- " + Color.F_Red + f"Train {self.metrics.upper()}" + " = " + f"{np.mean(int_cv_train2):.3f}" + " +/- " + f"{np.std(int_cv_train2):.3f}" + Color.END + Color.BOLD + " -*-*- " + Color.F_Blue + f"Test {self.metrics.upper()}" + " = " + f"{np.mean(int_cv_test2):.3f}" + " +/- " + f"{np.std(int_cv_test2):.3f}" + Color.END + Color.BOLD + " *-*-*") # print external eval_metric results print(Color.BOLD + "*-*-* " + Color.GREEN + f"External {self.n_splits}-Folds CV:" + Color.END + Color.BOLD + " -*-*- " + Color.F_Red + f"Train {self.params['eval_metric'].upper()}" + " = " + f"{np.mean(ext_cv_train2):.3f}" + " +/- " + f"{np.std(ext_cv_train2):.3f}" + Color.END + Color.BOLD + " -*-*- " + Color.F_Blue + f"Test {self.params['eval_metric'].upper()}" + " = " + f"{np.mean(ext_cv_test2):.3f}" + " +/- " + f"{np.std(ext_cv_test2):.3f}" + Color.END + Color.BOLD + " *-*-*\n") # free memory here at iteration del ( int_cv_train2, int_cv_test2, ext_cv_train2, ext_cv_test2, X_permuted, cols, Xval, cv, ) gc.collect() # calling function to get plotting cv results attribute self.plotting_cv_ = self.get_plotting_cv() # pruned features freq self.feature_frequency_ = self._freq() return None