# Candidate feature columns considered by the recursive feature elimination.
columns = [
    'Age_categories_Missing',
    'Age_categories_Infant',
    'Age_categories_Child',
    'Age_categories_Young Adult',
    'Age_categories_Adult',
    'Age_categories_Senior',
    'Pclass_1',
    'Pclass_3',
    'Embarked_C',
    'Embarked_Q',
    'Embarked_S',
    'SibSp_scaled',
    'Parch_scaled',
    'Fare_categories_0-12',
    'Fare_categories_50-100',
    'Fare_categories_100+',
    'Title_Miss',
    'Title_Mr',
    'Title_Mrs',
    'Title_Officer',
    'Title_Royalty',
    'Cabin_type_B',
    'Cabin_type_C',
    'Cabin_type_D',
    'Cabin_type_E',
    'Cabin_type_F',
    'Cabin_type_G',
    'Cabin_type_T',
    'Cabin_type_Unknown',
]

all_X = train[columns]
all_y = train["Survived"]

# Cross-validated recursive feature elimination (10 folds) around a
# logistic regression; `support_` is the boolean mask of retained columns.
lr = LogisticRegression()
selector = RFECV(estimator=lr, cv=10).fit(all_X, all_y)
optimized_columns = all_X.columns[selector.support_]

## 10. Training A Model Using our Optimized Columns ##

all_X = train[optimized_columns]
all_y = train["Survived"]

# Mean 10-fold cross-validation score on the reduced column set.
lr = LogisticRegression()
scores = cross_val_score(estimator=lr, X=all_X, y=all_y, cv=10)
accuracy = scores.mean()

## 11. Submitting our Model to Kaggle ##

# Final model trained on every training row with the reduced column set.
lr = LogisticRegression()
lr.fit(all_X, all_y)
def _split_and_scale(features, target, seed):
    """Stratified 70/30 split, then standardize using statistics of the training part only."""
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, stratify=target, random_state=seed)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test


def _fit_grid_search(params, seed, X_train, y_train):
    """Fit a 5-fold grid search over ``params`` around a logistic regression and return it."""
    search = GridSearchCV(param_grid=params,
                          estimator=LogisticRegression(random_state=seed),
                          cv=5,
                          n_jobs=-1)
    search.fit(X_train, y_train)
    return search


def logistic_regression():
    """Train and print accuracy for four logistic-regression variants.

    The data frame returned by ``data_prep()`` is split into the ``AtRisk``
    target and the remaining columns. Four models are then fitted and their
    train/test accuracy printed:

    1. a default ``LogisticRegression``;
    2. a grid search over ``C`` (10**-6 .. 10**3);
    3. the same grid search after a ``log(1 + x)`` transform of four columns;
    4. the same grid search after RFECV feature elimination on the
       log-transformed data.

    Returns:
        None. All results are written to stdout.
    """
    # ----------------------- Partition Data ----------------------- #
    df = data_prep()
    rs = 0

    y = df['AtRisk']
    X = df.drop(['AtRisk'], axis=1)
    # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    X_mat = X.values

    # ------------------ Scale and Build Model --------------------- #
    X_train, X_test, y_train, y_test = _split_and_scale(X_mat, y, rs)

    model = LogisticRegression()
    model.fit(X_train, y_train)

    print("\nDefault Regression\n-----------------")
    print("Train accuracy:", model.score(X_train, y_train))
    print("Test accuracy", model.score(X_test, y_test))

    # --------------------- Default GridSearchCV ---------------------- #
    params = {'C': [pow(10, x) for x in range(-6, 4)]}

    cv = _fit_grid_search(params, rs, X_train, y_train)

    print("\nRegression GridSearchCV\n-----------------")
    print("Train accuracy:", cv.score(X_train, y_train))
    print("Test accuracy:", cv.score(X_test, y_test))
    print("\nBest Parameters:")
    print(cv.best_params_)

    # ---------------------- Log Transform ----------------------- #
    columns_to_transform = [
        'Age', 'NumYearsEducation', 'CapitalGain', 'CapitalAvg'
    ]

    X_log = X.copy()
    for col in columns_to_transform:
        # log1p(x) == log(x + 1), vectorized and more accurate near zero.
        X_log[col] = np.log1p(X_log[col])

    X_mat_log = X_log.values
    X_train_log, X_test_log, y_train_log, y_test_log = _split_and_scale(
        X_mat_log, y, rs)

    cv = _fit_grid_search(params, rs, X_train_log, y_train_log)

    print("\nLog Regression GridSearchCV\n-----------------")
    print("Train accuracy:", cv.score(X_train_log, y_train_log))
    print("Test accuracy:", cv.score(X_test_log, y_test_log))
    print("\nBest Parameters:")
    print(cv.best_params_)

    # ---------------------- Feature Elimination ----------------------- #
    rfe = RFECV(estimator=LogisticRegression(random_state=rs), cv=5)
    rfe.fit(X_train_log, y_train_log)

    print("Original feature set", X_train_log.shape[1])
    print("Number of features after elimination", rfe.n_features_)

    X_train_sel = rfe.transform(X_train_log)
    X_test_sel = rfe.transform(X_test_log)

    cv = _fit_grid_search(params, rs, X_train_sel, y_train_log)

    print(
        "\nLog Regression GridSearchCV with Feature Elimination\n-----------------"
    )
    print("Train accuracy: ", cv.score(X_train_sel, y_train_log))
    print("Test accuracy: ", cv.score(X_test_sel, y_test_log))
    print("\nBest Parameters:")
    print(cv.best_params_)
def go(self, all_data, cols, polynomialColumns):
    """Select features with several independent methods and return their union.

    Rows of ``all_data`` with ``SalePrice > 0`` are taken as training rows,
    the ``cols`` columns are robust-scaled, and the following selections are
    computed on the scaled frame:

    * ``self.backwardElimination`` (p-value based, per its variable name),
    * ``RFECV`` around a Lasso,
    * ``SequentialFeatureSelection`` around a Lasso,
    * ``SelectKBest`` with ``f_regression`` and with ``mutual_info_regression``,
    * XGBoost feature importances, keeping the number of top features that
      minimises hold-out MSE.

    Args:
        all_data: frame containing ``SalePrice`` and every column in ``cols``.
        cols: column labels to consider; indexed with boolean masks and read
            through ``.values``, so presumably a pandas ``Index`` (confirm
            against the caller).
        polynomialColumns: extra column labels always added to the union.

    Returns:
        A 4-tuple ``(DataObject, union_columns, rfecv_columns, xgb_columns)``
        where ``DataObject`` wraps ``self.trainingData``, ``self.testingData``
        and ``self.combinedData``.

    Note:
        ``kBestValue`` and ``randomStateValue`` are not defined in this
        method; they are expected to exist at module level.
    """
    isTrainingRow = all_data.SalePrice > 0
    trainingData = all_data.loc[isTrainingRow, cols].reset_index(drop=True)
    y_train = all_data.SalePrice[isTrainingRow].reset_index(drop=True)

    robustScaler = RobustScaler()
    robustScalerDataFrame = pd.DataFrame(
        robustScaler.fit_transform(trainingData[cols]), columns=cols)

    # --- Backward elimination -------------------------------------------
    pValueColumns = self.backwardElimination(robustScalerDataFrame, y_train,
                                             cols.values)

    # --- Recursive feature elimination with cross-validation ------------
    recursiveFeatureEliminator = RFECV(estimator=Lasso(alpha=0.0005,
                                                       tol=0.002),
                                       n_jobs=-1,
                                       step=1,
                                       scoring='neg_mean_squared_error',
                                       cv=5)
    recursiveFeatureEliminator.fit(robustScalerDataFrame, y_train)
    recursiveFeatureSelectedColumns = cols[
        recursiveFeatureEliminator.get_support()]

    # --- Sequential feature selection -----------------------------------
    sequentialFeatureSelection = SequentialFeatureSelection(
        Lasso(alpha=0.0005, tol=0.002), k_features=1, scoring=r2_score)
    sequentialFeatureSelection.fit(robustScalerDataFrame, y_train)
    sequentialScores = np.asarray(sequentialFeatureSelection.scores_)
    # When several subsets tie for the best score, the LAST tied index is used.
    bestSubsetIndex = np.flatnonzero(
        sequentialScores == sequentialScores.max()).max()
    sequentialFeatureSelectionSubsets = list(
        sequentialFeatureSelection.subsets_[bestSubsetIndex])
    sequentialBackwardSelection = list(
        robustScalerDataFrame.columns[sequentialFeatureSelectionSubsets])

    # --- Univariate selection (two scoring functions) -------------------
    kBestSelection = SelectKBest(score_func=f_regression, k=kBestValue)
    kBestSelection.fit(robustScalerDataFrame, y_train)
    kbestWithFRegressionScoringFunction = cols[kBestSelection.get_support()]

    kBestSelection = SelectKBest(score_func=mutual_info_regression,
                                 k=kBestValue)
    kBestSelection.fit(robustScalerDataFrame, y_train)
    kbestWithMutualInfoRegressionScoringFunction = cols[
        kBestSelection.get_support()]

    # --- XGBoost importance thresholds ----------------------------------
    X_train, X_test, y, y_test = train_test_split(
        robustScalerDataFrame,
        y_train,
        test_size=0.30,
        random_state=randomStateValue)
    model = XGBRegressor(base_score=0.5,
                         random_state=randomStateValue,
                         n_jobs=4,
                         silent=True)
    model.fit(X_train, y)

    bestValue = 1e36
    bestColumns = 31  # fallback feature count if no threshold is evaluated
    # np.unique already returns the thresholds in ascending order.
    for modelThreshold in np.unique(model.feature_importances_):
        selectionsFromModel = SelectFromModel(model,
                                              threshold=modelThreshold,
                                              prefit=True)
        X_trainSelectedFromModel = selectionsFromModel.transform(X_train)
        modelForSelection = XGBRegressor(base_score=0.5,
                                         random_state=randomStateValue,
                                         n_jobs=4,
                                         silent=True)
        modelForSelection.fit(X_trainSelectedFromModel, y)
        X_testSelectedFromModel = selectionsFromModel.transform(X_test)
        y_pred = modelForSelection.predict(X_testSelectedFromModel)
        roundedPredictions = [
            round(predictedValue) for predictedValue in y_pred
        ]
        meanSquaredErrorValue = mean_squared_error(y_test,
                                                   roundedPredictions)
        # `>=` keeps the later (higher) threshold on ties, i.e. fewer features.
        if bestValue >= meanSquaredErrorValue:
            bestValue = meanSquaredErrorValue
            bestColumns = X_trainSelectedFromModel.shape[1]

    # Take the `bestColumns` highest-importance features, then list them in
    # ascending (score, name) order.
    listOfFeatureImportance = list(zip(model.feature_importances_, cols))
    topFeatureImportance = sorted(listOfFeatureImportance,
                                  reverse=True)[:bestColumns]
    XGBestColumns = [feature for _, feature in sorted(topFeatureImportance)]

    # --- Union of every selection ---------------------------------------
    unionSetOfBestColumns = set(pValueColumns).union(
        recursiveFeatureSelectedColumns,
        kbestWithFRegressionScoringFunction,
        kbestWithMutualInfoRegressionScoringFunction,
        XGBestColumns,
        sequentialBackwardSelection,
        polynomialColumns,
    )
    unionSetOfBestColumnsList = list(unionSetOfBestColumns)

    return DataObject(
        self.trainingData, self.testingData, self.combinedData
    ), unionSetOfBestColumnsList, recursiveFeatureSelectedColumns, XGBestColumns
### Step 6: Recursive Feature Elimination

### Collect features from RF and PC
# Inner join: keep only the features present in both frames.
df_pc_gini = df_pc.merge(df_gini, on="Features", how="inner")
df_features = df_pc_gini["Features"]
pc_gini_features = df_features.tolist()
# Restrict the step-3 data to the shared features.
df_rfecv = df_step3[pc_gini_features]

### Setup RFE model
X = df_rfecv  # predictors
Y = df_step3["outcome"]  # target
RFE = LinearRegression()  # estimator whose coefficients rank the features
# Elimination stops once 10 features remain.
selector = RFECV(estimator=RFE, min_features_to_select=10)

### Fit RFE model
selected = selector.fit(X, Y)  # This will take time

### Collect features from RFE model
ar_rfe = selected.support_  # boolean mask, one entry per column of X
l_rfe = list(zip(X.columns, ar_rfe))  # (column name, selected?) pairs
df_rfe = pd.DataFrame(l_rfe, columns=["Features", "RFE"])
# Keep the selected rows, renumber them, and retain only the name column.
df_rfe = (
    df_rfe[df_rfe["RFE"].eq(True)]
    .reset_index()
    .filter(["Features"])
)
# y_train, params, 'Random Forests')

# Random forest baseline. exp() is applied to targets and predictions
# before computing errors.
forest = RandomForestRegressor(n_estimators=10, random_state=0, n_jobs=-1)
forest = forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)
error = mean_absolute_error(np.exp(y_test), np.exp(y_pred))

# Error on the last test observation, absolute and as a percentage.
day_pred_error = np.exp(y_test[-1]) - np.exp(y_pred[-1])
print(
    f'Day prediction error {day_pred_error}, Percent Error {100*(day_pred_error/np.exp(y_test[-1]))}'
)
print(f'Random forest mean error: {error}')

# plot forests
plot_prediction(
    X_test,
    y_test,
    y_pred,
    'Random Forest Regression - Snohomish',
)

# feature selection with LinearRegression and cross validation
# NOTE(review): the selector is fitted on X_test / y_test, not the training
# split - confirm this is intentional.
selector = RFECV(LinearRegression(), cv=TimeSeriesSplit(n_splits=5))
selector = selector.fit(X_test, y_test)
cols = selector.get_support(indices=True)
# X_train is used only to map the selected positions back to column names.
features = X_train.iloc[:, cols]
# cases-1 and cases-7 on snohomish
print(f'Chosen Features on Snohomish Dataset: {features.columns}')

# # repeat with Elastic Net as a sanity check
# selector = RFECV(ElasticNet(), cv=TimeSeriesSplit(
#     n_splits=5)).fit(X_test, y_test)
# cols = selector.get_support(indices=True)
# features = X_train.iloc[:, cols]
# # should show that cases-1 and cases-7 are the best
# print(f'Chosen Features on Snohomish Dataset: {features.columns}')
# Random forest with 1000 trees capped at depth 10; every other setting is
# left at its default. For a further explanation see readme included in
# repository.
forest = RandomForestRegressor(n_estimators=1000, max_depth=10)
forest.fit(df_X, df_Y['quant'])  # This will take time

# Pair each predictor column with its importance score.
rf = forest.feature_importances_
l_rf = list(zip(df_X.columns, rf))
df_rf = pd.DataFrame(l_rf, columns=['Feature', 'Gini'])
# Keep only features whose importance exceeds the mean, highest first.
above_mean = df_rf['Gini'] > df_rf['Gini'].mean()
df_rf = df_rf[above_mean].sort_values(by=['Gini'], ascending=False)
df_rf.info()

### Fracture: Join RF and PCA
# Inner join: only features present in both frames survive.
df_fr = df_pca.merge(df_rf, on='Feature', how='inner')
fracture = df_fr['Feature'].tolist()
df_fr.info()

### Recursive Feature Elimination
# Cross-validated elimination that never goes below 5 features.
recursive = RFECV(estimator=LinearRegression(), min_features_to_select=5)
recursive.fit(df_X[fracture], df_Y['quant'])  # This will take time
rfe = recursive.support_  # boolean mask aligned with `fracture`
l_rfe = list(zip(fracture, rfe))
df_rfe = pd.DataFrame(l_rfe, columns=['Feature', 'RFE'])
df_rfe = df_rfe.sort_values(by=['RFE'], ascending=True)
df_rfe = df_rfe[df_rfe['RFE'].eq(True)]  # keep the selected features only
df_rfe.info()

### FractureProof: Join RFE with Fracture
df_fp = df_fr.merge(df_rfe, on='Feature', how='inner')
fractureproof = df_fp['Feature'].tolist()  # final feature names
df_fp.info()

### Get FractureProof feature labels
df_lfp = df_l1_l2[fractureproof]
# features = scaler.fit_transform(features)
# features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 0)
features_train, features_test, labels_train, labels_test = train_test_split(
    features_resampled, labels_resampled, test_size=0.3, random_state=0)

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

# Parenthesized single-argument print produces the same output on Python 2
# and Python 3; the bare print statement is a syntax error on Python 3.
# NOTE(review): the label says RandomForestClassifier but the estimator
# below is a GradientBoostingClassifier.
print("RandomForestClassifier ")

# RFECV: Select the algorithm to train with:
clf_Ranking = RFECV(GradientBoostingClassifier(random_state=0,
                                               learning_rate=0.05,
                                               max_depth=1),
                    scoring='accuracy',
                    n_jobs=-1)

# RFECV: fit only - the transformed matrix was never used, so the extra
# transform pass of fit_transform is dropped.
clf_Ranking.fit(features_train, labels_train)
print(clf_Ranking.score(features_train, labels_train))
print(clf_Ranking.ranking_)

# result of feature selection : [ 1 13 4 14 1 12 11 8 1 9 5 6 1 2 10 7 3 1]
# [1 4 5 1 1 1 1 1 3 1 1 1 6 2 1 1 1 1]
# [14 5 1 11 1 10 4 1 1 1 6 3 2 9 8 12 13 7 1]
#print scores
# GBC : [13 12 11 10 3 1 1 9 1 1 1 8 1 7 6 2 4 5 1 1]
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# Plain RFE: keep exactly 15 features, removing one per iteration.
rfe = RFE(estimator=clf_rf_3, n_features_to_select=15, step=1)
rfe = rfe.fit(x_train, y_train)

# In[140]:

print('Chosen best 15 feature by rfe:', x_train.columns[rfe.support_])

# Recursive feature elimination with cross validation and random forest classification

# In[141]:

from sklearn.feature_selection import RFECV

# The "accuracy" scoring is proportional to the number of correct classifications
clf_rf_4 = RandomForestClassifier()
rfecv = RFECV(estimator=clf_rf_4, step=1, cv=5,
              scoring='accuracy')  # 5-fold cross-validation
rfecv = rfecv.fit(x_train, y_train)

print('Optimal number of features :', rfecv.n_features_)
print('Best features :', x_train.columns[rfecv.support_])

# In[142]:

# Plot number of features VS. cross-validation scores
import matplotlib.pyplot as plt

# `grid_scores_` was removed in scikit-learn 1.2; the mean score per feature
# count now lives in `cv_results_`. Fall back to the legacy attribute on
# releases that predate `cv_results_`.
if hasattr(rfecv, "cv_results_"):
    cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv.grid_scores_

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score of number of selected features")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()
def pipeline_feature_selection(fs_data, training_data, test_data,
                               calibration_data, verbose):
    """Tune a random forest, run grouped RFECV with it, and reduce three datasets.

    A ``RandomForestRegressor`` is tuned on ``fs_data`` with a randomized
    search using leave-one-group-out style folds (one ``GroupKFold`` split
    per distinct ``Group`` value). The best estimator then drives an
    ``RFECV`` scored with negative MSE. The cross-validation curve is
    plotted, and the fitted selector is applied to the training, test and
    calibration frames.

    Args:
        fs_data: frame used for feature selection; must contain
            ``Discharge_Q``, ``SOH_discharge_capacity`` and ``Group``.
        training_data, test_data, calibration_data: frames with the same
            columns as ``fs_data``; each is reduced with the fitted selector.
        verbose: verbosity passed to both the search and the RFECV.

    Returns:
        Tuple ``(n_selected, ranking, train_X, test_X, calibration_X,
        train_y, test_y, calibration_y, figure)``. The ``*_X`` frames have
        positional column labels plus a trailing ``'Group'`` column.

    NOTE(review): ``iid=False``, ``criterion='mse'`` and
    ``RFECV.grid_scores_`` are all rejected by recent scikit-learn releases;
    this function as written targets an older version.
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import RandomizedSearchCV
    from scipy.stats import randint as sp_randint
    from sklearn.model_selection import GroupKFold
    from sklearn.feature_selection import RFECV
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    feature_to_drop = ['Discharge_Q', 'SOH_discharge_capacity', 'Group']
    feature_to_predict = 'Discharge_Q'

    X_train = fs_data.drop(feature_to_drop, axis=1)
    y_train = fs_data[feature_to_predict]
    names = list(X_train)  # predictor column labels

    # Used both as `max_features` for the forest and as the minimum number
    # of features RFECV may keep.
    no_of_features = 1

    # ---- Hyper-parameter search for the random forest -------------------
    # 10*len(list(X_train)) # used to have boostrap = 500
    # full_dataset: 700 estimators, 500 boostraps
    # standford_dataset: 70 estimators
    # note: oxford uses: 250 est with 1- iter
    #       standford uses: 250 est with 50 iter
    rf_tuning = RandomForestRegressor(n_estimators=500,
                                      bootstrap=True,
                                      n_jobs=-1)
    param = dict(
        max_depth=sp_randint(15, 25),  # 15-25, 5-10
        max_features=[no_of_features],  # sp_randint(2, 4),
        min_samples_split=sp_randint(2, 5),
        min_samples_leaf=sp_randint(5, 15),
        criterion=['mse'],
    )
    # no_top_models = 5

    # One fold per distinct group.
    groups = fs_data.Group
    no_of_splits = len(np.unique(fs_data.Group))
    group_kfold = GroupKFold(n_splits=no_of_splits)

    model = RandomizedSearchCV(
        rf_tuning,
        param_distributions=param,
        cv=group_kfold,
        n_iter=100,  # full_dataset: 150
        iid=False,
        refit=True,
        verbose=verbose)
    model.fit(X_train, y_train, groups=groups)
    rf = model.best_estimator_
    # RF_f_selection_model_param = model.best_params_

    # ---- Recursive feature elimination -----------------------------------
    rfe = RFECV(estimator=rf,
                min_features_to_select=no_of_features,
                cv=group_kfold,
                step=1,
                scoring='neg_mean_squared_error',
                verbose=verbose)  # neg_mean_squared_error, r2
    # selector_RF = rfe.fit(X_train_scaled, y_train)
    selector_RF = rfe.fit(X_train, y_train, groups=groups)

    # (rank, name) pairs in ascending rank order.
    ranking_features = sorted(
        (round(rank, 4), name)
        for rank, name in zip(selector_RF.ranking_, names))
    optimumum_no_feature = selector_RF.n_features_

    y = selector_RF.grid_scores_
    x = range(no_of_features, len(y) + no_of_features)

    # feature selection results
    print('Feature rank: \n {}'.format(ranking_features))

    # ---- Plot number of features vs. cross-validation score --------------
    f = plt.figure(figsize=(7, 5))
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score")
    plt.plot(x, y, 'o--', color='tab:orange')
    # Mark the best-scoring feature count.
    plt.plot(x[np.argmax(y)], np.max(y), 'v', markersize=15, color='k')
    # plt.title('Optimum number of features based RF-RFE using neg-mse is: {}'.format(optimumum_no_feature))
    plt.xlabel('Selected no. of features', fontsize=15)
    plt.ylabel('Cross-validation score [Negative MSE]', fontsize=15)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    plt.grid(False)
    plt.show()

    # ---- Reduce each dataset to the selected features --------------------
    def _reduce(frame):
        # Selected predictors as a fresh DataFrame, with the group label
        # appended so later models can be re-tuned per group.
        reduced = pd.DataFrame(
            selector_RF.transform(frame.drop(feature_to_drop, axis=1)))
        reduced['Group'] = np.array(frame.Group)
        return reduced

    training_data_opt_fet_X = _reduce(training_data)
    test_data_opt_fet_X = _reduce(test_data)
    calibration_data_opt_fet_X = _reduce(calibration_data)

    train_y = training_data[feature_to_predict]
    test_y = test_data[feature_to_predict]
    calibration_y = calibration_data[feature_to_predict]

    return (optimumum_no_feature, ranking_features, training_data_opt_fet_X,
            test_data_opt_fet_X, calibration_data_opt_fet_X, train_y, test_y,
            calibration_y, f)
ignored_fields.append(c[0]) X = X.drop(ignored_fields, axis=1) columns = list(X.columns.values) # Standard scaler sc_X = StandardScaler() X = sc_X.fit_transform(X) # Select top 20 features estimator = ltb.LGBMRegressor() #rfe = RFE(estimator=estimator, n_features_to_select=40, step=10, verbose=1) rfe = RFECV(estimator=estimator, min_features_to_select=30, step=10, cv=4, n_jobs=1, verbose=1) rfe.fit(X, y) features = rfe.get_support(indices=True) np.array(columns)[features] """ ['NUM_SELLING_DAYS_0', 'NUM_DAYS_0', 'days_btw_order_0', 'num_orders_0', 'QUOTA_SELLIN_0', 'NUM_SELLING_DAYS_1', 'NUM_DAYS_1', 'days_btw_order_1', 'num_orders_1', 'QUOTA_SELLIN_1', 'NUM_SELLING_DAYS_2', 'NUM_DAYS_2', 'days_btw_order_2', 'num_orders_2', 'QUOTA_SELLIN_2', 'NUM_SELLING_DAYS_3', 'NUM_DAYS_3', 'days_btw_order_3', 'num_orders_3', 'QUOTA_SELLIN_3', 'NUM_SELLING_DAYS_4',
def RepeatCV(times, Clinical, NumOfFea, Criteria, Method, NumFold=10):
    '''
    k-fold CV. In each fold, the training set are further split into training and validation set.
    The training set is used to build the model. The validation set is used to determine the CUTOFF in order
    to ensure the precision. Once the cutoff is determined, it will be used on testing set to obtain precision and PPR.

    :param times: Number of repetitions; only used to seed the fold shuffling (``times * 10 + 1``)
    :param Clinical: Which variable to predict. Should be index of columns.
    :param NumOfFea: Number of genes to be selected; 0 selects the number by cross-validation (RFECV)
    :param Criteria: the score used in building model. Possible options: FSCORE, FSCORE_95, PPR90, PPR95
    :param Method: RN_only, RN_SMOTE, no_RN_no_SMOTE
    :param NumFold: number of folds in k-fold CV
    :return: F05score_fold, PPR_fold, PRECISION_fold, F05PPR_fold, AUROC_fold, valid_Precision, valid_PPR,
             Pred_Prob_test, CUTOFF
    :raises ValueError: if ``Criteria`` or ``Method`` is not one of the supported options
    '''
    workDir = "/gpfs/home/dz16e/Reusability/NewExperiment/"

    # Validate both options before any file is read or model is built, so a
    # typo fails immediately instead of surfacing later as an unbound name.
    precision_by_criteria = {
        'FSCORE': 0.9,
        'FSCORE_95': 0.95,
        'PPR90': 0.9,
        'PPR95': 0.95
    }
    predictor_file_by_method = {
        'RN_only': 'predictor_rank.txt',
        'RN_SMOTE': 'predictor_rank.txt',
        'no_RN_no_SMOTE': 'predictor.txt'
    }
    if Criteria not in precision_by_criteria:
        raise ValueError("Unknown Criteria %r; expected one of %s" %
                         (Criteria, sorted(precision_by_criteria)))
    if Method not in predictor_file_by_method:
        raise ValueError("Unknown Method %r; expected one of %s" %
                         (Method, sorted(predictor_file_by_method)))

    target_Precision = precision_by_criteria[Criteria]
    if Criteria in ('FSCORE', 'FSCORE_95'):
        myscorer = make_scorer(fbeta_score, beta=1.0)
    else:
        myscorer = make_scorer(PPR_percentile_score,
                               needs_proba=True,
                               Precision=target_Precision,
                               Return_cutoff=0)

    X_data = np.loadtxt(workDir + 'TCGA_Data/Predictors/' +
                        predictor_file_by_method[Method])
    Y = np.loadtxt(workDir + 'TCGA_Data/Responses/' + 'response.txt')

    CLI = Clinical
    Y_data = Y[:, CLI]
    # Only samples whose response is below 2 are kept.
    X_data = X_data[Y_data < 2, :]
    Y_data = Y_data[Y_data < 2]

    F05score_fold = np.zeros([NumFold, 4])  #because we have 4 different models
    PPR_fold = np.zeros([NumFold, 4])
    AUROC_fold = np.zeros([NumFold, 4])
    PRECISION_fold = np.zeros([NumFold, 4])
    F05PPR_fold = np.zeros([NumFold, 4])
    valid_Precision = np.zeros([NumFold, 4])
    valid_PPR = np.zeros([NumFold, 4])
    # Columns: 0 = true label, 1 = fold number (1-based), 2..5 = predicted
    # probability of each of the four models.
    Pred_Prob_test = np.zeros([X_data.shape[0], 6])
    CUTOFF = np.zeros([NumFold, 4])

    kf = StratifiedKFold(n_splits=NumFold,
                         shuffle=True,
                         random_state=times * 10 + 1)

    for val, (train_idx, test_idx) in enumerate(kf.split(X_data, Y_data)):
        train_X = X_data[train_idx, :]
        train_Y = Y_data[train_idx]
        test_X = X_data[test_idx, :]
        test_Y = Y_data[test_idx]
        Pred_Prob_test[test_idx, 0] = test_Y
        Pred_Prob_test[test_idx, 1] = val + 1

        # Filter out genes using T-test
        DEGs = Filter_Ttest(train_X, train_Y, significant_level=0.1)
        train_X = train_X[:, DEGs]
        test_X = test_X[:, DEGs]

        if Method == 'RN_SMOTE':
            smote = SMOTE(random_state=2020)
            # `fit_sample` was renamed `fit_resample` in imbalanced-learn and
            # later removed; prefer the new name when it exists.
            resample = getattr(smote, 'fit_resample', None) or smote.fit_sample
            smox, smoy = resample(train_X, train_Y)
            isSMOTE = 1
        else:
            smox, smoy = train_X, train_Y
            isSMOTE = 0

        Clf_name = ['LASSO', 'RF', 'XGB', 'SVM']
        Classifiers = {
            'LASSO':
            LogisticRegression(C=1,
                               penalty='l1',
                               solver='liblinear',
                               random_state=2020),
            'RF':
            RandomForestClassifier(n_estimators=500,
                                   n_jobs=-1,
                                   random_state=2020),
            'XGB':
            XGBClassifier(learning_rate=0.05,
                          n_estimators=100,
                          max_depth=5,
                          seed=2020),
            'SVM':
            SVC(C=0.01,
                probability=True,
                kernel='linear',
                random_state=2020,
                max_iter=10000)
        }
        Parameters = {
            'LASSO': {
                'C': [.01, .05, .1, .5, 1.0, 5.0, 10.0],
                'fit_intercept': [True, False]
            },
            'RF': {
                'criterion': ['gini', 'entropy'],
                'max_depth': sp_randint(1, 11)
            },
            'XGB': {
                'learning_rate': [.01, .05, .1, .5],
                'max_depth': sp_randint(1, 11),
                'min_child_weight': [1, 2, 3]
            },
            'SVM': {
                'C': [.0001, .001, .005, .01, .05, .1, .5, 1.0, 3.0, 5.0, 10.0]
            }
        }

        # -----------------------------------------------------------
        # Feature Selection
        for idx_clf, clf_name in enumerate(Clf_name):
            os.system("echo 'Starting " + clf_name + "...'")
            estimator = Classifiers[clf_name]
            if NumOfFea == 0:
                selector = RFECV(estimator, step=0.2, cv=3, scoring=myscorer)
            else:
                # Keyword form: newer scikit-learn rejects a positional
                # n_features_to_select.
                selector = RFE(estimator,
                               n_features_to_select=NumOfFea,
                               step=0.2)
            selector = selector.fit(smox, smoy)
            smox_reduced = selector.transform(smox)
            test_X_reduced = selector.transform(test_X)

            cv_parameter = Parameters[clf_name]
            searcher = RandomizedSearchCV(estimator,
                                          cv_parameter,
                                          scoring=myscorer,
                                          random_state=2020).fit(
                                              smox_reduced, smoy)

            Pred_test = searcher.predict(test_X_reduced)
            Prob_test = searcher.predict_proba(test_X_reduced)[:, 1]
            # `beta` is keyword-only in newer scikit-learn.
            F05score_fold[val, idx_clf] = fbeta_score(test_Y,
                                                      Pred_test,
                                                      beta=1.0)

            percentile_CUTOFF, valid_Precision[val, idx_clf], valid_PPR[val, idx_clf] = \
                PPR_percentile_findCutoff(smox_reduced, smoy, searcher.best_estimator_, target_Precision, 10, isSMOTE)

            # Test samples at or above the cutoff percentile of the test
            # probabilities are called positive; computed once and reused.
            above_cutoff = Prob_test >= np.percentile(Prob_test,
                                                      percentile_CUTOFF)
            PPR_fold[val, idx_clf] = recall_score(test_Y, above_cutoff)
            PRECISION_fold[val, idx_clf] = precision_score(
                test_Y, above_cutoff)
            F05PPR_fold[val, idx_clf] = fbeta_score(test_Y,
                                                    above_cutoff,
                                                    beta=1.0)
            AUROC_fold[val, idx_clf] = roc_auc_score(test_Y, Prob_test)
            Pred_Prob_test[test_idx, idx_clf + 2] = Prob_test
            CUTOFF[val, idx_clf] = percentile_CUTOFF

    return F05score_fold, PPR_fold, PRECISION_fold, F05PPR_fold, AUROC_fold, valid_Precision, valid_PPR, \
           Pred_Prob_test, CUTOFF
excel_name = "Scores/%i_SVC_Score (%i sec).csv" % (time_Start, elapsed)
scores.to_csv(excel_name)

# In[Feature selections]
from sklearn.feature_selection import RFECV
from sklearn.svm import LinearSVC

time_Start = time.time()
days = [1]
#stocks = tickers
stocks = ['OMXIPI']

# Build a fresh, flat list of column names. The previous
# `features_to.append(stocks)` inserted the whole `stocks` list as a single
# nested element (unusable as a column label) and mutated the object
# returned by `petur.get_tickers()`.
features_to = list(petur.get_tickers())
features_to.extend(stock for stock in stocks if stock not in features_to)

# Linear SVM wrapped in 10-fold recursive feature elimination.
clf = LinearSVC()
clf = RFECV(clf, step=1, cv=10, n_jobs=-1)

# Maps "<stock>-<day>" to the boolean mask of features RFECV kept.
feature_dic = {}
for stock in stocks:
    acc = []
    for day in days:
        name = "%s-%s" % (stock, day)
        training = df_train.copy()
        training = training[features_to]
        x_train, y_train = petur.create_better_labels(stock, training, day)
        clf = clf.fit(x_train, y_train)
        output = clf.support_
        feature_dic[name] = output
print(classification_report(y_test, y_prediction))
print(confusion_matrix(y_test, y_prediction))

#SVM Model
# `axis` is passed by keyword: the positional form was removed in pandas 2.0.
X = np.array(df.drop(['region_id'], axis=1))
y = np.array(df['region_id'])

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2)

clf = svm.SVC()
clf.fit(X_train, y_train)
svm_accuracy = clf.score(X_test, y_test)

#SVM evaluate:
print('The accuracy of the SVM test was :')
print(svm_accuracy)

y_prediction = clf.predict(X_test)
print(classification_report(y_test, y_prediction))
print(confusion_matrix(y_test, y_prediction))

#Random Forest
#Use Recursive Feature Elimination
X = np.array(df.drop(['region_id'], axis=1))
y = np.array(df['region_id'])

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2)

m = RFECV(RandomForestClassifier(n_jobs=-1), scoring='accuracy', verbose=1)
# Fit on the training split and score on the held-out split, as the SVM
# section above does. Previously the model was fitted and scored on the
# full data, so the printed value was training accuracy and the split
# above went unused.
m.fit(X_train, y_train)
k = m.score(X_test, y_test)
print("Using recursive feature elimination of a random forest, best model produces the folloiwng accuracy: ")
print(k)
model.fit(xt_train, y_train)

# Predict on the held-out rows and show actual vs. predicted side by side.
preds = model.predict(xt_test)
pprint.pprint(pd.DataFrame({'Actual': y_test, 'Predicted': preds}))
# The second value is the *median* absolute error.
print('MSE, MAE, R^2, EVS (Top 3 Model): ' + str([
    mean_squared_error(y_test, preds),
    median_absolute_error(y_test, preds),
    r2_score(y_test, preds),
    explained_variance_score(y_test, preds),
]))

# ---------------- Part 3: Use Recursive Feature Elimination with Cross Validation -

# Use RFECV to arrive at the approximate best set of predictors. RFECV is a greedy method.
selector_f = RFECV(
    estimator=linear_model.LinearRegression(),
    cv=5,
    scoring=make_scorer(r2_score),
)
selector_f.fit(x_train, y_train)

# Reduce both splits to the columns RFECV retained.
xt_train = selector_f.transform(x_train)
xt_test = selector_f.transform(x_test)

# Ordinary least squares on the reduced predictor set.
model = linear_model.LinearRegression()
model.fit(xt_train, y_train)

# Predict on the held-out rows and show actual vs. predicted side by side.
preds = model.predict(xt_test)
pprint.pprint(pd.DataFrame({'Actual': y_test, 'Predicted': preds}))
### Extracting features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list_original, sort_keys = True)
labels, features = targetFeatureSplit(data)

#####################
# FEATURE SELECTION #
#####################
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV

# `sklearn.cross_validation` was removed in scikit-learn 0.20. Prefer
# `model_selection` (splitter built from n_splits) and fall back to the
# legacy module, whose splitter took the labels directly.
try:
    from sklearn.model_selection import StratifiedKFold
    cv_folds = StratifiedKFold(n_splits=3)
except ImportError:
    from sklearn.cross_validation import StratifiedKFold
    cv_folds = StratifiedKFold(labels, 3)

lr = LogisticRegression()

### Optimal number of features:
# 3-fold stratified RFECV scored by precision, removing one feature per step.
rfecv = RFECV(estimator=lr, step=1, cv=cv_folds, scoring='precision')
rfecv.fit(features, labels)
print("Optimal number of features : %d" % rfecv.n_features_)
#Answer: 5

'''
### Plotting number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
'''

### Choosing the 5 most important features
print(rfe.ranking_) # Plot pixel ranking #plt.matshow(ranking, cmap=plt.cm.Blues) #plt.colorbar() #plt.title("Ranking of pixels with RFE") #plt.show() # Create the RFE object and compute a cross-validated score. svc = SVR(kernel="linear") min_features_to_select = 1 # Minimum number of features to consider rfecv = RFECV( estimator=svc, step=1, cv=KFold(n_splits=5), scoring="r2", min_features_to_select=min_features_to_select, ) rfecv.fit(X, y) print("Optimal number of features : %d" % rfecv.n_features_) # Plot number of features VS. cross-validation scores plt.figure() plt.xlabel("Number of features selected") plt.ylabel("Cross validation score (r2)") plt.plot( range(min_features_to_select, len(rfecv.grid_scores_) + min_features_to_select), rfecv.grid_scores_,
##############################################################
### Feature selection: RFE with the logistic regression estimator
features = [
    'FamilyCateg',
    'Age',
    'Embarked_S',
    'Embarked_Q',
    'Embarked_C',
    'Other',
    'Mr',
    'Mrs',
    'Miss',
    'Master',
    'Fare',
    'SibSp',
    'Parch',
    'Sex',
    'Pclass',
]
X = train_data[features].values
Y = train_data['Survived'].values

from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# 2-fold stratified cross-validated elimination, one feature per step,
# scored by accuracy.
estimator = LogisticRegression()
selector = RFECV(
    estimator,
    step=1,
    cv=StratifiedKFold(2),
    scoring='accuracy',
)
selector = selector.fit(X, Y)
print("Optimal number of features : %d" % selector.n_features_)

# `support_` is aligned with `features`; keep the names flagged True.
selected_features = [
    name for name, kept in zip(features, selector.support_) if kept
]
print(selected_features)
###########################################################""""

# Containers for the final scores of the models that follow
models = []
final_scores = []

#Logistic Regression
(target_left - target_viewport))).reshape( 1, dataset.shape[0]) X = np.array([dataset[:, index] for index in features_index], dtype=float).T X = np.concatenate((X, diff_out_viewport_left.T, diff_out_viewport_right.T), axis=1) X_cross = np.array([dataset[:, index] for index in crosscheck_index], dtype=float).T y = np.array(dataset[:, class_index], dtype=int) urls = dataset[:, url_index] random.seed(42) model = tree.DecisionTreeClassifier(criterion='entropy', random_state=42) #model = ensemble.RandomForestClassifier(criterion='entropy', random_state=42) X_new = X rfecv = RFECV(model, cv=GroupKFold(n_splits=5), scoring='f1_macro') rfecv.fit(X, y, groups=urls) X_new = rfecv.transform(X) print(X.shape) print(X_new.shape) headers.append('diff_out_viewport_left') headers.append('diff_out_viewport_right') features_index.append(headers.index('diff_out_viewport_left')) features_index.append(headers.index('diff_out_viewport_right')) print([ headers[features_index[i]] for i in range(0, len(rfecv.ranking_)) if rfecv.ranking_[i] == 1 ]) params = { # 'n_estimators': [1, 5, 10, 20, 50],
def predict_features(self, df_features, df_target, idx=0, **kwargs):
    """Score feature subsets with RFECV around a linear-kernel SVR.

    Args:
        df_features: object exposing ``.values`` as the feature matrix.
        df_target: object exposing ``.values``; only its first column
            (``values[:, 0]``) is used as the regression target.
        idx: accepted for interface compatibility; not used here.
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        The mean cross-validation score for each candidate number of
        features evaluated by RFECV.
    """
    estimator = SVR(kernel='linear')
    selector = RFECV(estimator, step=1)
    selector = selector.fit(df_features.values, df_target.values[:, 0])

    # `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2;
    # `cv_results_['mean_test_score']` is its replacement. Fall back to the
    # old attribute only on releases that predate `cv_results_`.
    cv_results = getattr(selector, 'cv_results_', None)
    if cv_results is not None:
        return cv_results['mean_test_score']
    return selector.grid_scores_
# Report the grid-search outcome: best parameters, then mean (+/- 2 std) test
# score for every parameter combination that was tried.
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
print()

#------------------------------------------------------------------------------
# Feature selection by recursive elimination.
# NOTE: despite its name, `knn` is a random forest; the name is kept because
# later code may refer to it.
knn = RandomForestClassifier(criterion='entropy',
                             min_samples_leaf=1,
                             min_samples_split=12,
                             n_estimators=134,
                             n_jobs=-1)

rfe_step = 3  # features dropped per elimination round
rfecv = RFECV(estimator=knn, step=rfe_step, cv=StratifiedKFold(2),
              scoring='roc_auc')
rfecv.fit(X_LS, y_LS)
print("Optimal number of features : %d" % rfecv.n_features_)

# `grid_scores_` was removed in scikit-learn 1.2; prefer `cv_results_` and
# fall back only on releases that predate it.
rfecv_results = getattr(rfecv, 'cv_results_', None)
if rfecv_results is not None:
    rfecv_scores = rfecv_results['mean_test_score']
else:
    rfecv_scores = rfecv.grid_scores_

# With step > 1 the scores are NOT one-per-feature-count: the evaluated subset
# sizes are n, n - step, n - 2*step, ... and finally 1 (the default minimum),
# reported in ascending order.
n_total_features = len(rfecv.support_)
feature_counts = sorted(set(range(n_total_features, 1, -rfe_step)) | {1})

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
# The scorer is 'roc_auc', so label the axis accordingly.
plt.ylabel("Cross validation score (ROC AUC)")
plt.plot(feature_counts, rfecv_scores)
plt.show()
from sklearn.feature_selection import RFECV
from xgboost import XGBClassifier
from numpy import loadtxt
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

dataset = loadtxt('../Dataset/heart.data', delimiter=",")

# Split data into X and y: every column but the last is a feature, the last
# column is the label.
X = dataset[:, :-1]
Y = dataset[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.23,
                                                    random_state=22)

# Use an XGBoost classifier as the model driving the elimination.
xg = XGBClassifier()

# Rank all features: eliminate one per round down to a single feature, with
# 5-fold cross-validation choosing the best subset size.
rfe = RFECV(xg, cv=5, step=1)
rfe.fit(X_train, y_train)
y_important_pred = rfe.predict(X_test)

# Pair every rank with its column index so the output identifies the feature
# (rank 1 = selected). The previous output sorted the bare ranks, which lost
# track of which feature each rank belonged to.
print("Features sorted by their rank:")
print(sorted((int(rank), index) for index, rank in enumerate(rfe.ranking_)))
# Column indices of the features RFECV kept.
print(np.flatnonzero(rfe.support_).tolist())

# `.round()` is retained from the original; presumably a guard in case the
# predictions come back as floats - TODO confirm it is still needed.
print(accuracy_score(y_test, y_important_pred.round()) * 100)
# Cross-validated class probabilities from a 2000-tree random forest, using
# 20 shuffled stratified folds; results are handed to misc.save_results.
clf = RandomForestClassifier(2000, n_jobs=4)
cv = StratifiedKFold(n_splits=20, shuffle=True)
y_pred = cross_val_predict(clf,
                           data_x,
                           data_y,
                           cv=cv,
                           method='predict_proba',
                           n_jobs=4,
                           verbose=10)
misc.save_results(data_y, y_pred, name, ss=ss, clf=clf)

# NOTE(review): bare name kept from the original. Unless `stop` is defined
# elsewhere this raises NameError, which presumably is a deliberate way to
# halt a top-to-bottom run before the cells below - confirm.
stop

# %% feat select
selector = RFECV(clf, cv=5, n_jobs=-1, verbose=100, step=1, scoring='f1')
selector.fit(data_x, data_y)

# `grid_scores_` was removed in scikit-learn 1.2 (accessing it raises
# AttributeError); `cv_results_['mean_test_score']` is the replacement.
selector_results = getattr(selector, 'cv_results_', None)
if selector_results is not None:
    selector_scores = selector_results['mean_test_score']
else:
    selector_scores = selector.grid_scores_

print(
    f'These were the {selector.n_features_} features that were deemed important'
)
print([feature_names[i] for i in np.nonzero(selector.support_)[0]])

#%% feature importances
clf = RandomForestClassifier(2000, n_jobs=-1)
# create feature importances
clf.fit(data_x, data_y)
ranking = clf.feature_importances_
type_importances = np.zeros(len(feature_types))
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
import numpy as np
from common.import_data import ImportData

if __name__ == "__main__":
    # Load the full feature table and the 'Class' column as the target.
    data_set = ImportData()
    x: np.ndarray = data_set.import_all_data()
    y: np.ndarray = data_set.import_columns(np.array(['Class']))

    # Recursive feature elimination with a linear SVC, one feature per round,
    # scored by 2-fold stratified cross-validated accuracy.
    svc = SVC(kernel="linear")
    rfecv = RFECV(estimator=svc,
                  step=1,
                  cv=StratifiedKFold(2),
                  scoring='accuracy')
    rfecv.fit(x, y.ravel())

    # Message text (Polish): "Optimal number of features".
    print("Optymalna liczba cech : %d" % rfecv.n_features_)

    # `grid_scores_` was removed in scikit-learn 1.2; use the mean test score
    # from `cv_results_` and fall back only on releases that predate it.
    cv_results = getattr(rfecv, 'cv_results_', None)
    if cv_results is not None:
        mean_scores = cv_results['mean_test_score']
    else:
        mean_scores = rfecv.grid_scores_

    plt.figure()
    # Axis labels (Polish): "Number of selected features" /
    # "Cross-validation score".
    plt.xlabel("Liczba wybranych cech")
    plt.ylabel("Wynik walidacji krzyżowej")
    # step=1, so score i corresponds to i + 1 selected features.
    plt.plot(range(1, len(mean_scores) + 1), mean_scores)
    plt.show()
# Expand the quantitative features with all degree-2 polynomial terms.
poly_reg = PolynomialFeatures(degree=2)
X_temp_train_q = poly_reg.fit_transform(X_temp_train[feature_names_quant])

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out` is the replacement. Keep the result a plain list.
if hasattr(poly_reg, 'get_feature_names_out'):
    poly_feature_names_quant = list(
        poly_reg.get_feature_names_out(feature_names_quant))
else:
    poly_feature_names_quant = poly_reg.get_feature_names(feature_names_quant)

# Univariate diagnostics only: the F-score and p-value of every polynomial
# feature are printed for inspection. In this excerpt `selector_f` is fitted
# but never used to transform the data; the selection is done by RFECV below.
selector_f = SelectPercentile(f_regression, percentile=50)
selector_f.fit(X_temp_train_q, y_temp_train)
for n, s, p in zip(poly_feature_names_quant, selector_f.scores_,
                   selector_f.pvalues_):
    print ("F-score: %3.2f, p-value: %3.2f for feature %s " % (s,p,n))

# Greedy (recursive elimination) selection driven by a linear regression and
# 10-fold cross-validated negative MSE.
regression = LinearRegression()
greedy_selector = RFECV(estimator=regression,
                        cv=10,
                        scoring='neg_mean_squared_error')
greedy_selector.fit(X_temp_train_q, y_temp_train)
print('Optimal number of features: %d' % greedy_selector.n_features_)

# Create a dataframe holding only the polynomial features RFECV kept.
X_temp_train_df = pd.DataFrame(X_temp_train_q)
X_temp_train_df.columns = poly_feature_names_quant
X_temp_train_df = X_temp_train_df.loc[:, greedy_selector.support_]

age_regressor = LinearRegression()
age_regressor = age_regressor.fit(X_temp_train_df, y_temp_train)

# Prepare the test data with the transformer that was fitted on the training
# data (transform, not fit_transform - test data must not refit it); only the
# features already chosen by the greedy selection are to be used afterwards.
X_temp_test_q = pd.DataFrame(
    poly_reg.transform(X_temp_test[feature_names_quant]))
standardize_X(X)

# Indices of every feature picked in any run (one entry per pick, so repeats
# are expected).
selections = []

# Repeat the selection 100 times on reshuffled data.
for _ in range(100):
    X, y = shuffle_X_and_y(X, y)

    # Underlying model; swap the commented line in to use logistic regression.
    model = SVC(kernel="linear", C = 0.5)
    #model = LogisticRegression(C = 0.5)

    # Recursive elimination, one feature per round, scored by 4-fold
    # stratified cross-validated accuracy.
    rfecv = RFECV(estimator=model,
                  step=1,
                  cv=StratifiedKFold(4),
                  scoring='accuracy')
    rfecv.fit(X, y)

    # Positions where the selection mask is set for this iteration.
    mask = rfecv.support_
    chosen = [position for position, kept in enumerate(mask) if kept]
    selections.extend(chosen)

# Once complete, tally how many times each feature index was selected.
fcount = {}
for f in selections:
    fcount[f] = fcount.get(f, 0) + 1
# Test accuracy (predict once and reuse the result for the confusion matrix)
y_test_pred = est.predict(X_test)
acc = accuracy_score(y_test, y_test_pred)
print('Test Accuracy {}'.format(acc))

# Plot confusion matrix
cm = confusion_matrix(y_test, y_test_pred)
sns.heatmap(cm, fmt='d', cmap='GnBu', cbar=False, annot=True)

#==============================================================================
# Recursive Feature Selection
#==============================================================================
from sklearn.feature_selection import RFECV

# RFE with 4-fold cross-validated accuracy choosing the subset size
rfe = RFECV(estimator=LogisticRegression(), cv=4, scoring='accuracy')
rfe = rfe.fit(X_train, y_train)

# Selected column labels (kept for any later use) and test accuracy.
# `rfe.predict` reduces X_test to the selected features itself and then calls
# the fitted estimator, so the selection is applied exactly as during fit.
cols = X_train.columns[rfe.support_]
acc = accuracy_score(y_test, rfe.predict(X_test))
print('Number of features selected: {}'.format(rfe.n_features_))
print('Test Accuracy {}'.format(acc))

# `grid_scores_` was removed in scikit-learn 1.2; prefer the mean test score
# from `cv_results_` and fall back only on releases that predate it.
rfe_results = getattr(rfe, 'cv_results_', None)
if rfe_results is not None:
    rfe_scores = np.asarray(rfe_results['mean_test_score'])
else:
    rfe_scores = np.asarray(rfe.grid_scores_)

# Plot number of features vs CV scores
plt.figure()
plt.xlabel('k')
plt.ylabel('CV accuracy')
plt.plot(np.arange(1, rfe_scores.size + 1), rfe_scores)
plt.show()
# Excerpt of a tuning script; it begins inside a list of parameter-grid dicts
# (presumably `paramGrid`, whose opening is not visible), so it is kept
# verbatim below.
# What the visible statements do:
#   * the grid entries supply both the wrapped model ('estimator') and that
#     model's hyper-parameters ('estimator__*')
#   * RFECV is created with estimator=None and wrapped in GridSearchCV, so the
#     grid above is what provides the estimator for each candidate
#   * after fitting, `best_estimator_` is the winning RFECV and the positions
#     where its `support_` is set are stored in `indFeatures`
# Original inline notes are in Portuguese: "feature selection under the same
# classifier and fold conditions" / "runs the training with tuning and feature
# selection"; the printed labels read "best estimator", "best parametrization",
# "best score", "number of selected features", "k-fold method validation".
# NOTE(review): the 'mse' and 'mae' criterion values may not be accepted by
# recent scikit-learn releases - confirm against the installed version.
# NOTE(review): the statements appear to have been collapsed onto one line; as
# written, the first inline `#` comment would swallow everything after it.
'estimator': [AdaBoostClassifier(random_state=1986)], 'estimator__n_estimators': [3, 10], }, { 'estimator': [GradientBoostingClassifier(random_state=1986)], 'estimator__criterion': ['friedman_mse', 'mse', 'mae'], 'estimator__n_estimators': [3, 10], #'estimator__max_depth': [None, 3, 5], #'estimator__loss': ['deviance', 'exponential'], }, { 'estimator': [SVC(kernel="linear", C=0.025, random_state=1986)], }] #Feature Selection nas mesmas condições de classificador e folders rfecv = RFECV(estimator=None, step=1, cv=kfold, scoring=score) #Faz o processamento de treinamento com Tuning e Feature Selection gridSearch = GridSearchCV(rfecv, paramGrid, scoring=score, n_jobs=3, verbose=25) gridSearch.fit(xTreino, yTreino) classifier = gridSearch.best_estimator_ indFeatures = np.where(classifier.support_ == True)[0] print('\nMelhor estimador: %s' % gridSearch.best_estimator_) print('Melhor parametrização: %s' % gridSearch.best_params_) print('Melhor pontuação: %.2f' % gridSearch.best_score_) print('Qtde features selecionadas: ', len(indFeatures)) #K-fold print('========== VALIDAÇÃO MÉTODO K-FOLD ==========')
def main():
    """Train and evaluate the income-prediction pipeline end to end.

    Steps, in order:
      1. read the labelled CSV and hold out 20% of the rows for testing;
      2. clean the training rows with ``clean_train_data`` and reuse the
         encoder / replacement values it returns to clean the test rows;
      3. standardise the features (scaler fitted on the training rows only);
      4. select features with RFECV around a LassoCV estimator;
      5. fit a distance-weighted 10-nearest-neighbour regressor and report
         RMSE and R^2 on the held-out rows, plus a scatter plot of true
         (red) versus predicted (blue) values per encoded country.

    The last column of each frame is treated as the target.

    Experiments that were tried and disabled (per the original inline notes):
    variance thresholding, PCA with MLE component count, discretised targets,
    CatBoost on GPU, SGD / decision-tree / extra-trees / random-forest /
    gradient-boosting / voting / ridge / elastic-net / XGBoost / SVR
    regressors (SVR was noted as far too slow), and the selectors
    SelectFromModel(regr) ("very bad"), FeatureAgglomeration(150) ("worst so
    far"), SelectFromModel(ExtraTreesRegressor) ("performed ok") and
    SelectKBest(f_regression, k=150).
    """
    csv_path = "tcd ml 2019-20 income prediction training (with labels).csv"
    raw = pd.read_csv(csv_path, header=0)

    train_part, test_part = train_test_split(raw, test_size=0.2)

    scaler = StandardScaler()
    train_part, ohe, rep_points = clean_train_data(train_part)
    print(np.shape(train_part))
    print(np.shape(test_part))

    # Target is the last column; everything before it is a feature.
    train_y = train_part.iloc[:, -1]
    train_X = scaler.fit_transform(train_part.iloc[:, :-1])

    test_y = test_part.iloc[:, -1]
    test_features = clean_data(test_part.iloc[:, :-1], ohe, rep_points)
    test_X = scaler.transform(test_features)
    print(np.shape(train_X))
    print(np.shape(test_X))

    regr = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance')
    lasso = linear_model.LassoCV(cv=5, verbose=0)

    # Original note: this works ok but basically requires LassoCV.
    selector = RFECV(lasso, step=1, cv=3)
    train_X = selector.fit_transform(train_X, train_y)
    test_X = selector.transform(test_X)
    print(np.shape(train_X))
    print(np.shape(test_X))

    # Train on the selected training features, then predict the held-out rows.
    regr.fit(train_X, train_y)
    pred = regr.predict(test_X)

    # Encode the country labels as integers purely to get an x-axis.
    country_codes = LabelEncoder().fit_transform(test_part["Country"])
    for values, colour in ((test_y, 'red'), (pred, 'blue')):
        plt.scatter(country_codes, values, color=colour, alpha=0.005)
    plt.ylabel('Income')
    plt.xlabel('Country')
    plt.title('Predicting income')
    plt.show()

    # Root mean squared error on the held-out rows.
    rmse = sqrt(mean_squared_error(test_y, pred))
    print("Root Mean squared error: %.2f" % rmse)
    # Explained variance score: 1 is perfect prediction
    variance_score = r2_score(test_y, pred)
    print('Variance score: %.2f' % variance_score)
def __init__(self, estimator):
    """Wrap ``estimator`` in a recall-scored RFECV selector.

    Args:
        estimator: the model handed to RFECV as its ``estimator``.

    The selector uses ``StratifiedKFold(5)`` for cross-validation and is
    stored on ``self._rfecv``.
    """
    cv_splitter = StratifiedKFold(5)
    self._rfecv = RFECV(
        estimator=estimator,
        cv=cv_splitter,
        scoring='recall',
    )
for model in models:
    print(model)

    # Full ranking: eliminate down to a single feature so every column gets a
    # distinct rank (1 = eliminated last).
    rfe = RFE(model, n_features_to_select=1, verbose=2)
    fit = rfe.fit(df_raw_train[x_columns], df_raw_train[y_column])

    # For each cut-off 10, 20, ..., 140 keep the columns ranked at or better
    # than the cut-off. Filtering on the rank directly avoids building an
    # index list and testing membership in it for every column.
    for number in range(10, 150, 10):
        selected_features = [
            column for column, rank in zip(x_columns, fit.ranking_)
            if rank <= number
        ]
        selected_features_dict[str(number) + "_" +
                               str(model)] = selected_features

    print("-------\nCV\n---------")

    # Cross-validated variant: RFECV picks the subset size itself; rank 1
    # marks the selected columns.
    rfe = RFECV(estimator=model, verbose=2)
    fit = rfe.fit(df_raw_train[x_columns], df_raw_train[y_column])
    selected_features = [
        column for column, rank in zip(x_columns, fit.ranking_) if rank == 1
    ]
    selected_features_dict["A_" + str(model)] = selected_features

# Normalise the keys: the model repr can span several lines, so collapse every
# run of whitespace (and any literal backslash-n sequence) to a single space.
selected_features_dict_new = {
    ' '.join(str(k).replace("\\n", " ").split()): features
    for k, features in selected_features_dict.items()
}