def select_rfe(X, y, k_features=3):
    '''
    Select features with recursive feature elimination (RFE).

    Parameters
    ----------
    X : pandas.core.frame.DataFrame
        Predictor columns.
    y : pandas.core.series.Series
        Target values.
    k_features : int, default 3
        Number of features to retain.

    Returns
    -------
    None
        The selected feature names are printed.

    Notes
    -----
    Assumes `LinearRegression` (sklearn.linear_model) and `RFE`
    (sklearn.feature_selection), aliased here as `rfe`, are imported.
    '''
    lm = LinearRegression()
    rfe_init = rfe(lm, n_features_to_select=k_features)

    rfe_init.fit(X, y)
    rfe_mask = rfe_init.support_
    rfe_features = X.iloc[:, rfe_mask].columns.to_list()

    print(f"Recursive Feature Elimination: {len(rfe_features)} features")
    print(rfe_features)
    return None
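
#Example usage (a minimal sketch, not part of the original snippet; builds a
#toy DataFrame so the call is self-contained):
# import numpy as np
# import pandas as pd
# X_demo = pd.DataFrame(np.random.rand(50, 6), columns=[f"f{i}" for i in range(6)])
# y_demo = X_demo["f0"] * 2 + np.random.rand(50)
# select_rfe(X_demo, y_demo, k_features=3)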
                                   
#BUILD DECISION TREE MODEL (SCALED DATA)---------------------------------------

#instantiate decision tree model with coeff
dt = TreeClassifierWithCoef(criterion = 'gini', splitter = 'best', max_features = None, 
                              max_depth = None, min_samples_split = 2, min_samples_leaf = 2, 
                              max_leaf_nodes = None, random_state = 1)

#conduct recursive feature search
dt_rfe_cv = rfe(estimator=dt, step=1, cv=10, scoring='roc_auc', verbose = 1)
dt_rfe_cv.fit(pre2000_exp_scaled, pre2000_res)
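#Note (not in the original): given the cv= and scoring= arguments, `rfe` here is
#presumably an alias for sklearn.feature_selection.RFECV rather than plain RFE.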

#identify and plot optimal number of features (d = 17), ROC_AUC = 0.8262
print(dt_rfe_cv.n_features_)
print(dt_rfe_cv.grid_scores_.max())
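#Note (version assumption, not in the original): on scikit-learn >= 1.2,
#grid_scores_ was removed; dt_rfe_cv.cv_results_["mean_test_score"] is the
#equivalent if the line above errors on a newer install.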

plt.figure()
plt.xlabel("DT: Number of Features selected")
plt.ylabel("DT: Cross Validation Score (ROC_AUC)")
plt.plot(range(1, len(dt_rfe_cv.grid_scores_) + 1), dt_rfe_cv.grid_scores_)
plt.show()

#identify selected features
dt_features = pre2000_exp_scaled.columns[dt_rfe_cv.get_support()]
print(dt_features)
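#Optional follow-up (not in the original): the full elimination order can also
#be inspected via the fitted selector's ranking_ attribute (1 = selected).
print(pd.Series(dt_rfe_cv.ranking_, index=pre2000_exp_scaled.columns).sort_values())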
Example #3
df_join = pd.merge(
    df_gini, df_pc, on="Features", how="inner"
)  # Join by column while keeping only items that exist in both; select outer or left for other options
df_features = df_join["Features"]  # Save features from data frame
features = df_features.tolist()  # Convert to list

## Setup Predictors and RFE
df_rfe = df_nev[features]  # Add selected features to df
df_rfe["outcome"] = df_nev["outcome"]  # Add outcome to RFE df
df_rfe = df_rfe.dropna()  # Drop all rows that contain NA
X = df_rfe[df_features]  # Save features columns as predictor data frame
Y = df_rfe["outcome"]  # Use outcome data frame
Log_RFE = LogisticRegression(
    solver="liblinear",
    max_iter=4000)  # Logistic regression estimator; RFE ranks features by its coefficients
selector = rfe(
    estimator=Log_RFE, step=1, min_features_to_select=1
)  # Define selection parameters; in this case all features are selected. See Readme for more info

## Run Recursive Feature Selection
selected = selector.fit(X, Y)  # This will take time

## Output RFE results
ar_rfe = selected.support_  # Save Boolean values as numpy array
l_rfe = list(zip(X, ar_rfe))  # Create list of variables alongside RFE value
df_rfe = pd.DataFrame(l_rfe, columns=[
    "Features", "RFE"
])  # Create data frame of features with their RFE support flag
df_rfe = df_rfe[df_rfe["RFE"]]  # Keep only the variables RFE retained (support == True)
df_rfe = df_rfe.drop(columns=["RFE"])  # Drop Unwanted Columns
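#Optional follow-up (not part of the original snippet): collect the retained
#feature names into a plain list for reuse downstream.
rfe_feature_list = df_rfe["Features"].tolist()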

## Verify
#scale data
scaler = pp.StandardScaler()
scaler.fit(post2000_exp)
post2000_exp_scaled = pd.DataFrame(scaler.transform(post2000_exp),
                                   index=post2000_exp.index,
                                   columns=post2000_exp.columns)

#retain unscaled features
post2000_exp = pd.concat([numfeat_post, catfeat_post_bin], axis=1)
post2000_exp = post2000_exp[pre2000_exp.columns]

#IDENTIFY POTENTIAL FEATURES WITH RECURSIVE FEATURE SEARCH AND 10-FOLD CV------

#run recursive feature search with 10-fold cv to identify potential features
lr = lm.LogisticRegression()
lr_rfe_cv = rfe(estimator=lr, step=1, cv=10, scoring='roc_auc', verbose=1)
lr_rfe_cv.fit(pre2000_exp_scaled, pre2000_res)

#identify features
features = pre2000_exp_scaled.columns[lr_rfe_cv.get_support()]
print(features)

#run 10-fold CV to get scores with selected features (ROC_AUC = 0.9451)
lr_cv = cv(lr,
           pre2000_exp_scaled[features],
           pre2000_res,
           cv=10,
           scoring='roc_auc')
print(lr_cv.mean())
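#Note (not in the original): `cv` here is presumably an alias for
#sklearn.model_selection.cross_val_score, which returns an array of per-fold scores.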

#create dataset with response and selected features
lrset = pd.concat([pre2000_exp_scaled[features], pre2000_res], axis=1)


Example #7
                 axis=0)
    indices = np.argsort(importances)[::-1]

    print("Feature ranking:")
    proba = [0] * 5
    for f in range(x_train.shape[1]):
        #each line in proba correspond to one histone marker
        print("%d. Histone %d (%f)" %
              (f + 1, indices[f] % 5, importances[indices[f]]))
        proba[indices[f] % 5] += importances[indices[f]]
    print(proba)
    print(sum(proba))
    """-> Features selection with RFECV with our logistic regression classifier as estimator"""

    rfecv = rfe(estimator=XGBClassifier(n_estimators=650, max_depth=18),
                step=50,
                verbose=1,
                cv=10)
    rfecv.fit(x_train, y_train)
    rfecv_scores = rfecv.grid_scores_

    #now we are going to identify the number of features selected from each histone marker
    support = rfecv.support_
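    # A compact alternative sketch (not in the original; same interleaving
    # assumption as the loops below, i.e. the five markers repeat with period 5
    # across the first 500 columns):
    # per_marker_counts = [int(np.sum(support[m:500:5])) for m in range(5)]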

    nb_selected_features = 0
    histone_marker = []
    #histone marker H3K4me3
    for i in range(0, 500, 5):
        if support[i]:
            nb_selected_features += 1
    histone_marker.append(['H3K4me3', nb_selected_features])
    #histone marker H3K4me1